基础
import re
# re.match only succeeds when the pattern matches at the very start of the
# subject string; group(0) is the whole matched text.
rule_pattern = re.compile("rule")
ret = rule_pattern.match("rule.com.qq")
result = ret.group(0)
print(result)

# "." stands for any single character, so "t.o" matches "two".
wildcard_pattern = re.compile("t.o")
print(wildcard_pattern.match('two').group(0))
"""
[] 匹配[] 内列举的字符
\d 匹配数字
\D 匹配非数字
\s 匹配空白——空格,tab键
\S 匹配非空白
\w 匹配非特殊字符——a-z,A-Z,0-9,_,汉字
\W 匹配特殊字符,非字母,非数字,非汉字
\ 转义字符
. 匹配任意一个字符
* 匹配前一个表达式0次或多次
+ 匹配前一个表达式1次或多次
? 匹配前一个表达式0次或1次,放在+,*,{}之后时将其改成非贪婪模式,取尽可能少的匹配字串
^ 匹配字符串开头
$ 匹配字符串结尾
{min,max} 重复min到max次前面表达式
| 匹配左右任意一个表达式(或)
() 截取正则表达式中()指定内容
\num 引用分组num匹配到的字符串
data = '<html><div>hello</div></html>'
result = re.match("<([a-zA-Z0-9]{1,30})><([a-zA-Z0-9]{1,30})>.*</\\2></\\1>",data).group()
(?P<name>) 分组起别名
data = '<html><div>hello</div></html>'
result = re.match("<(?P<name1>[a-zA-Z0-9]{1,30})><(?P<name2>[a-zA-Z0-9]{1,30})>.*</(?P=name2)></(?P=name1)>",data).group()
"""
# Two nested tags whose closing tags must repeat the opening names, checked
# with named back-references. The character class is [a-zA-Z0-9]; the range
# "A-z" would also admit "[", "\", "]", "^", "_" and "`".
NESTED_TAG_PATTERN = (
    r"<(?P<name1>[a-zA-Z0-9]{1,30})><(?P<name2>[a-zA-Z0-9]{1,30})>"
    r".*</(?P=name2)></(?P=name1)>"
)

try:
    data = '<html><div>hello</div></html>'
    # group()  -> the whole matched text
    # group(n) -> the text captured by the n-th group
    # re.match returns None when nothing matches, so calling .group() on the
    # result raises AttributeError in that case.
    result = re.match(NESTED_TAG_PATTERN, data).group()
except AttributeError:
    # Only the "no match" failure is handled here; anything else (for example
    # KeyboardInterrupt or a malformed pattern) is allowed to propagate.
    print("匹配失败")
else:
    print(result)
# Advanced usage of the re module.
# match  - matches only at the beginning of the string; returns None on failure.
# search - scans the string for the first place where the pattern matches.
result = re.search("gchen", "xxxx_gchencode@126com")
print(result.span(), result)

# findall - collects every match and returns them as a list.
# The pattern is a raw string so that "\d" reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape sequence.
result = re.findall(r"\d+", "阅读次数:9999,转发次数:6666")
print(result)

# sub - replaces every match with the given replacement text.
result = re.sub(r"\d+", "8888", "阅读次数:9999,转发次数:6666")
print(result)
案例:正则实现爬电影天堂磁力链
import re
import urllib.request
def _fetch_gbk_page(url):
    """Download ``url`` and return the response body decoded as GBK text."""
    # The with-block closes the HTTP response even if read()/decode() raises.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("GBK")


def getFilmDict():
    """Collect magnet links for the films listed on the dy.dytt8.net index page.

    Fetches the listing page, follows every ``class="ulink"`` anchor to the
    film's own page and searches that page for the first ``magnet`` link.
    One progress line is printed per film.

    Returns:
        dict: maps the anchor text (film name) to its magnet URL. Films whose
        page contains no magnet link are left out.

    Raises:
        urllib.error.URLError: if a page cannot be fetched.
        UnicodeDecodeError: if a page body is not valid GBK.
    """
    # Function-scope import so the block does not depend on file-level imports
    # beyond the ones it already used (re, urllib.request).
    from urllib.parse import urljoin

    site_root = "https://dy.dytt8.net/"
    # [^"]* and .*? stop at the first closing quote / closing tag. The greedy
    # ".*" would run on to the last '">' or '</a>' on the same line and
    # swallow neighbouring markup.
    link_pattern = re.compile(r'<a href="([^"]*)" class="ulink">(.*?)</a>')
    magnet_pattern = re.compile(r'<a(?: target="_blank")? href="(magnet[^"]*)">')

    film_text = _fetch_gbk_page("https://dy.dytt8.net/html/gndy/dyzz/index.html")

    film_dict = {}
    # i is the number of films processed before the current one.
    for i, (film_web_url, film_name) in enumerate(link_pattern.findall(film_text)):
        # urljoin handles hrefs with or without a leading "/" without
        # producing a doubled slash after the host name.
        film_main_text = _fetch_gbk_page(urljoin(site_root, film_web_url))
        film_upload_url = magnet_pattern.search(film_main_text)
        print(f"获取了{i}电影,{film_name},{film_upload_url is None}")
        if film_upload_url is not None:
            film_dict[film_name] = film_upload_url.group(1)
    return film_dict
if __name__ == '__main__':
    # Run the crawler and report every film together with its magnet link.
    for title, magnet_link in getFilmDict().items():
        print(f"最新电影{title}的下载地址是{magnet_link}")