程序演示如下:
import requests
import re
def parse_page(url):
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
response = requests.get(url , headers = headers)
text = response.text
titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>' ,text , re.DOTALL) #re.DOTALL让.可以匹配换行符\n
authors = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>',text , re.DOTALL)
dynasties = re.findall(r'<p class="source".*?<a.*?>.*?<a.*?>(.*?)</a>', text ,re.DOTALL) #因为朝代在p标签下的第二个a标签,所以要用两个写两个a标签
contents_tags = re.findall(r'<div class="contson".*?>(.*?)</div>',text,re.DOTALL)
contents