爬虫基础：爬取糗事百科的内容和标题

最新推荐文章于 2021-02-12 11:47:32 发布

青云--

最新推荐文章于 2021-02-12 11:47:32 发布

阅读量1.7k

点赞数

文章标签：爬虫 python

本文链接：https://blog.csdn.net/yinjun3215/article/details/108338728

版权

import re
import time
from urllib import request
# 作业2: 爬取糗事百科文本页的所有段子,结果如 : xx说: xxxx
# https://www.qiushibaike.com/text/page/1/   # 1表示页码

# 正则表达式提示：
#   # 获取一个评论
#   regCom = re.compile('<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">', re.S)
#   # 获取名称
#   nameCom = re.compile('<h2>(.*?)</h2>', re.S)
#   # 获取内容
#   contentCom = re.compile('<span>(.*?)</span>', re.S)

# Request headers attached to every page fetch; the only entry is a
# desktop-browser User-Agent string.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
    ),
}


def getData(url):
    """Download one listing page and extract the author and text of each post.

    The page is requested with the module-level ``headers`` and its body is
    decoded as UTF-8 (the default of ``bytes.decode``).

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of dicts, one per post, each with the keys ``'name'`` and
        ``'content'`` holding the stripped text of the first ``<h2>`` and the
        first ``<span>`` element found inside the post fragment. Fragments
        missing either element are skipped.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Compile all patterns once per call instead of once per post.
    # One post: everything between the author block and the vote counter.
    post_re = re.compile('<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">', re.S)
    name_re = re.compile('<h2>(.*?)</h2>', re.S)
    content_re = re.compile('<span>(.*?)</span>', re.S)

    req = request.Request(url, headers=headers)
    # The context manager closes the HTTP response even if read/decode fails.
    with request.urlopen(req) as response:
        html = response.read().decode()

    item_list = []
    for post in post_re.findall(html):
        name_match = name_re.search(post)
        content_match = content_re.search(post)
        # A fragment without a name or a body cannot form an entry; skip it
        # rather than let an IndexError abort the whole page.
        if name_match is None or content_match is None:
            continue
        item_list.append({
            'name': name_match.group(1).strip(),
            'content': content_match.group(1).strip(),
        })

    return item_list


if __name__ == "__main__":
    # Listing address without the page number; the number and a trailing
    # slash are appended for each request.
    page_prefix = "https://www.qiushibaike.com/text/page/"

    # Accumulates the {'name': ..., 'content': ...} dicts from every page.
    collected = []

    # Walk pages 1 through 9, pausing half a second after each fetch.
    page_no = 1
    while page_no < 10:
        collected += getData(page_prefix + str(page_no) + "/")
        time.sleep(0.5)
        page_no += 1

    # Print every collected post as "<name> 说： <content>".
    for entry in collected:
        print("%s 说： %s" % (entry["name"], entry["content"]))

千山万水总是情，点个关注行不行。