import urllib.request
import ssl
import re
def ajaxCrawler(url):
    """Fetch *url* over HTTPS and return the response body as UTF-8 text.

    Despite the historical name, the return value is whatever the server
    sends back (here: an HTML page), not JSON.

    :param url: absolute URL to request.
    :returns: decoded response body as a ``str``.
    :raises urllib.error.URLError: on network or HTTP failure.
    """
    # Browser-like User-Agent so the site does not reject the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)
    # Unverified SSL context: skips certificate validation.
    # NOTE(review): this is MITM-prone — prefer ssl.create_default_context()
    # unless the target's certificate chain genuinely cannot be verified.
    context = ssl._create_unverified_context()
    # Use a context manager so the underlying socket is always closed
    # (the original leaked the response object).
    with urllib.request.urlopen(req, context=context) as response:
        return response.read().decode("utf-8")
url = "https://www.qiushibaike.com/text/page/1/"  # then loop over page/2/, page/3/, ... for more pages

# Outer pattern: isolate each post fragment between its article header
# and its comment-stats marker. Kept deliberately loose (non-greedy, re.S
# so '.' spans newlines) to survive minor markup changes.
par1 = r'''article block untagged mb15(.*?)class="stats-comments'''
re_ob = re.compile(par1, re.S)
listStr = re_ob.findall(ajaxCrawler(url))

# Compile the per-fragment patterns once, outside the loop
# (the original recompiled them on every iteration).
re_Content = re.compile(r'''class="content".*?<span>(.*?)</span>''', re.S)
re_name = re.compile(r'''<h2>(.*?)</h2>''', re.S)

jsonStr = {}  # maps user name -> post content
for ss in listStr:
    contents = re_Content.findall(ss)  # findall returns a list; we want the first hit
    names = re_name.findall(ss)
    # Guard against fragments where either pattern failed to match:
    # the original indexed [0] unconditionally and raised IndexError.
    if contents and names:
        jsonStr[names[0]] = contents[0]

for k, v in jsonStr.items():
    print(k + ":说" + v)