self_example/Spider/chapter02_爬虫基本库/re库_正则匹配/reLearning.py

180 lines
5.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/7 17:31
@Usage :
@Desc : 正则匹配re库基本使用
'''
import re
'''
基本使用
'''
def baseUse():
    """Demonstrate the basics of ``re.match`` on a fixed sample string.

    Matches a pattern anchored at the start of the sample text and prints,
    in order: the match object, the full matched text, the first captured
    group, and the span of the match. Returns ``None``.
    """
    content = 'hello 123456789 World_This is a Reges Demo'
    # Raw string: \s and \d must reach the regex engine untouched. In a plain
    # string literal they are invalid escape sequences and raise a warning.
    pattern = r'^hello\s(\d+)\sWorld'
    result = re.match(pattern, content)
    print(result)  # <re.Match object; span=(0, 21), match='hello 123456789 World'>
    print(result.group())  # hello 123456789 World -- the whole matched text
    print(result.group(1))  # 123456789 -- the first parenthesised group
    print(result.span())  # (0, 21) -- start/end offsets of the match
# 高级用法
'''
贪婪匹配与非贪婪匹配
.*表示尽可能多匹配字符
.*?表示尽可能少匹配字符
'''
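'''
re.I  makes matching case-insensitive
re.L  makes matching locale-aware
re.M  enables multi-line matching, which changes how ^ and $ behave
re.S  makes . match every character, including newlines
re.U  interprets characters according to Unicode, which affects \w, \W, \b and \B
re.X  allows verbose patterns (whitespace and comments inside the pattern are ignored)
'''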
def prior():
    """Show how ``re.S`` lets ``.`` match across line breaks.

    The sample text spans two lines. The same pattern is tried twice:

    1. Without flags ``.`` cannot cross the newline, so ``re.match`` returns
       ``None`` and ``None`` is printed.
    2. With ``re.S`` the pattern matches and the captured digits are printed.

    Calling ``.group(1)`` directly on the first result would raise
    ``AttributeError`` and stop the demo before step 2, so both results are
    checked before their group is read. Returns ``None``.
    """
    content = '''hello 123456789 World_This
is a Reges Demo
'''
    # Raw string so that \d reaches the regex engine without an
    # invalid-escape warning.
    pattern = r'^he.*?(\d+).*?Demo$'

    result = re.match(pattern, content)
    print(result.group(1) if result else None)  # None -- '.' stops at the newline

    result = re.match(pattern, content, re.S)
    print(result.group(1) if result else None)  # 123456789
'''
search:模糊匹配
'''
def search():
    """Contrast ``re.match`` with ``re.search`` on the same text.

    The sample string has extra text before ``Hello``. ``re.match`` only
    tries the pattern at the start of the string and therefore prints
    ``None``; ``re.search`` scans forward and prints the match object.
    Returns ``None``.
    """
    content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'
    # Raw string so that \d reaches the regex engine without an
    # invalid-escape warning.
    pattern = r'Hello.*?(\d+).*?Demo'

    result = re.match(pattern, content)  # the text would have to start with "Hello"
    print(result)  # None

    result = re.search(pattern, content)
    print(result)  # <re.Match object; span=(13, 53), match='Hello 1234567 World_This is a Regex Demo'>
def searchHtml():
    """Extract singer/title pairs from an HTML snippet with ``re.search``.

    Three searches are run against the same markup and, for each one that
    matches, the two captured groups (singer and song title) are printed:

    1. the pattern requires ``active`` before the link, with ``re.S``;
    2. the pattern accepts any ``<li>``, with ``re.S``;
    3. the same pattern as (2) without ``re.S``, so ``.`` cannot cross a
       newline.

    Returns ``None``.
    """
    html = '''<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
<li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
<li data-view="5">
<a href="/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>'''
    # (pattern, flags) pairs, tried in order; 0 means "no flags".
    attempts = (
        ('<li.*?active.*?singer="(.*?)">(.*?)</a>', re.S),
        ('<li.*?singer="(.*?)">(.*?)</a>', re.S),
        ('<li.*?singer="(.*?)">(.*?)</a>', 0),
    )
    for pattern, flags in attempts:
        found = re.search(pattern, html, flags)
        if found is None:
            continue
        singer, title = found.groups()
        print(singer, title)
'''
findAll:找到所有匹配的
'''
def findall():
    """Collect every song entry from an HTML snippet with ``re.findall``.

    First pass: captures ``(href, singer, title)`` for every ``<li>`` that
    contains a link, then prints the whole result list, its type, and each
    tuple followed by its three fields.

    Second pass: captures the title of every ``<li>``, whether or not the
    title is wrapped in an ``<a>`` tag, and prints each title.

    Returns ``None``.
    """
    html = '''<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
<li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
<li data-view="5">
<a href="/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>'''
    results = re.findall('<li.*?href="(.*?)".*?singer="(.*?)">(.*?)</a>', html, re.S)
    print(results)
    print(type(results))
    for result in results:
        print(result)
        print(result[0], result[1], result[2])

    # Raw string: \s and \w must reach the regex engine untouched. In a plain
    # string literal they are invalid escape sequences and raise a warning.
    # The optional <a ...> / </a> groups let the pattern cover both linked
    # and plain-text list items; the title is always group index 1.
    results = re.findall(r'<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>', html, re.S)
    for result in results:
        print(result[1])
'''
sub:正则匹配修改文本
使用正则匹配,去除掉能匹配上的内容
'''
def sub():
    """Strip link tags from an HTML snippet with ``re.sub``, then list titles.

    Every opening ``<a ...>`` and closing ``</a>`` tag is replaced with an
    empty string and the resulting markup is printed. The content of each
    ``<li>`` element is then extracted from the cleaned markup and printed
    with surrounding whitespace removed. Returns ``None``.
    """
    html = '''<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
<li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
<li data-view="5">
<a href="/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>'''
    # Drop the anchor tags but keep the text they wrapped.
    without_links = re.sub('<a.*?>|</a>', '', html)
    print(without_links)

    # re.S lets '.' span the newlines inside multi-line <li> elements.
    for item_text in re.findall('<li.*?>(.*?)</li>', without_links, re.S):
        print(item_text.strip())
'''
compile:编译
将相关模式编译成pattern对象进行复用
'''
def complie():
    """Reuse one compiled pattern to strip the time from several timestamps.

    Compiles an ``HH:MM`` pattern once, removes every occurrence of it from
    three sample date-time strings, and prints the three results on a single
    line. Returns ``None``.
    """
    contents = (
        '2019-12-15 12:00',
        '2019-12-17 12:55',
        '2019-12-22 13:21',
    )
    # Raw string: \d must reach the regex engine untouched. In a plain string
    # literal it is an invalid escape sequence and raises a warning.
    pattern = re.compile(r'\d{2}:\d{2}')
    # Call the compiled pattern's own sub() rather than passing the pattern
    # back through the module-level re.sub().
    results = [pattern.sub('', content) for content in contents]
    print(*results)
# Script entry point: when the file is executed directly (not imported),
# run the re.sub demonstration.
if __name__ == '__main__':
    sub()