import re
import requests
from fake_useragent import UserAgent
url = 'https://www.gushiwen.cn/shiju/xiejing.aspx'

# One entry = a <span ...> (the numbering) immediately followed by an
# <a ...> (the verse text).  Both parts are captured by a single pattern so
# that a number can never be paired with the wrong verse, which could happen
# when the two were collected by independent findall() calls and zipped.
# The tempered group keeps the first capture from running past its own
# closing </span>.
_POEM_PATTERN = re.compile(
    r'\s<span [^>]*>((?:(?!</span>).)*)</span><a [^>]*>(.*?)</a>',
    re.DOTALL,
)


def parse_poems(html):
    """Extract the span/anchor pairs from a page of HTML.

    Args:
        html: The page markup as a string.

    Returns:
        A list of ``{'num': ..., 'txt': ...}`` dicts, one per ``<span>``
        that is directly followed by an ``<a>``, in document order.  The
        list is empty when nothing matches.
    """
    return [
        {'num': num, 'txt': txt}
        for num, txt in _POEM_PATTERN.findall(html)
    ]


def fetch_page(page_url=url, timeout=10):
    """Download ``page_url`` and return the response body as text.

    A random User-Agent header is sent with the request.

    Args:
        page_url: Address to fetch; defaults to the module-level ``url``.
        timeout: Seconds to wait before giving up on the request.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Random request header.
    headers = {'user-agent': UserAgent().random}
    # Build and send the request; without a timeout a stalled server would
    # block the script indefinitely.
    response = requests.get(page_url, headers=headers, timeout=timeout)
    # Do not try to parse an error page as if it were the listing.
    response.raise_for_status()
    return response.text


def main():
    """Fetch the listing page, parse it and print the result."""
    poems = parse_poems(fetch_page())
    print(poems)


if __name__ == '__main__':
    main()
'''
<div class="cont">.*?<span .*?>(.*?)</span><a .*?>(.*?)</a>.*?<span .*?>(.*?)</span><a .*?>(.*?)</a>'
'''
#'<a .*?>(.*?)</a>'
#'<a .*?>(.*?)</a>'
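# Analyze the page
# Summary
# Learned some simple regular-expression syntax.
# Regular expressions work well for extracting certain text, but they are rather hard to write.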