import urllib.request
import ssl
import re
def ajaxCrawler(url):
    """Fetch *url* over HTTPS and return the response body as UTF-8 text.

    Despite the historical name, the return value is whatever the server
    sends back (here: an HTML page), not JSON.

    :param url: absolute URL to request.
    :returns: decoded response body as a ``str``.
    :raises urllib.error.URLError: on network or HTTP failure.
    """
    # Browser-like User-Agent so the site does not reject the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)
    # Unverified SSL context: skips certificate validation.
    # NOTE(review): this is MITM-prone — prefer ssl.create_default_context()
    # unless the target's certificate chain genuinely cannot be verified.
    context = ssl._create_unverified_context()
    # Use a context manager so the underlying socket is always closed
    # (the original leaked the response object).
    with urllib.request.urlopen(req, context=context) as response:
        return response.read().decode("utf-8")
url = "https://www.qiushibaike.com/text/page/1/"  # then loop over page/2/, page/3/, ... for more pages

# Outer pattern: isolate each post fragment between its article header
# and its comment-stats marker. Kept deliberately loose (non-greedy, re.S
# so '.' spans newlines) to survive minor markup changes.
par1 = r'''article block untagged mb15(.*?)class="stats-comments'''
re_ob = re.compile(par1, re.S)
listStr = re_ob.findall(ajaxCrawler(url))

# Compile the per-fragment patterns once, outside the loop
# (the original recompiled them on every iteration).
re_Content = re.compile(r'''class="content".*?<span>(.*?)</span>''', re.S)
re_name = re.compile(r'''<h2>(.*?)</h2>''', re.S)

jsonStr = {}  # maps user name -> post content
for ss in listStr:
    contents = re_Content.findall(ss)  # findall returns a list; we want the first hit
    names = re_name.findall(ss)
    # Guard against fragments where either pattern failed to match:
    # the original indexed [0] unconditionally and raised IndexError.
    if contents and names:
        jsonStr[names[0]] = contents[0]

for k, v in jsonStr.items():
    print(k + ":说" + v)