爬虫:爬取糗事百科数据

(原文此处有插图,网页提取时丢失)

import requests
from lxml import etree
from fake_useragent import UserAgent


class Qiu:
    """Scraper for qiushibaike.com's hot-post pages.

    Walks every paginated listing page and prints one dict per post
    containing: picture URL, author nickname, content text, laugh count,
    and comment count. Posts are numbered with a running counter.
    """

    def __init__(self):
        # Running counter for printed posts; shared across all pages.
        self.count = 1

    def __call__(self, *args, **kwargs):
        # Calling the instance starts the whole crawl.
        self.get_max_page()

    def get_html(self, base_url):
        """Fetch *base_url* and return the parsed lxml HTML tree.

        A random User-Agent is generated per request to reduce the chance
        of being blocked.
        """
        headers = {"User-Agent": UserAgent().random}
        # A timeout keeps the scraper from hanging forever on a dead
        # connection (the original call had none).
        response = requests.get(base_url, headers=headers, timeout=10)
        return etree.HTML(response.text)

    def get_max_page(self):
        """Read the pagination links to find the last page, then scrape all pages."""
        base_url = "https://www.qiushibaike.com/8hr/page/2/"
        html_xml = self.get_html(base_url)
        # The last 'page-numbers' span holds the highest page number.
        max_page = int(html_xml.xpath("//a/span[@class='page-numbers']/text()")[-1].strip())
        self.get_data(max_page)

    def get_data(self, max_page):
        """Scrape pages 1..max_page and print one dict per post."""
        for page in range(1, max_page + 1):
            print("===================第{}页开始下载=========================".format(page))
            page_url = "https://www.qiushibaike.com/8hr/page/{}/".format(page)
            html_xml = self.get_html(page_url)
            # Narrow the search to the <li> element of each post.
            li_list = html_xml.xpath("//li[contains(@id, 'qiushi_tag_')]")

            for li in li_list:
                # Picture URL — guard against posts without a thumbnail
                # (original indexed [0] unconditionally: IndexError risk).
                pic_list = li.xpath(".//a[contains(@class, 'recmd-left')]/img/@src")
                pic = pic_list[0] if pic_list else ""
                # Strip the 12-char "/w/150/h/112" thumbnail suffix to get
                # the full-size image; otherwise treat as no picture.
                pic = "https:" + pic[:-12] if "/w/150/h/112" in pic else ""

                # Author nickname (may be absent on some entries).
                name_list = li.xpath(".//span[@class='recmd-name']/text()")
                nike_name = name_list[0] if name_list else ""

                # Post text (may be empty for pure-image posts).
                content = li.xpath(".//a[@class='recmd-content']/text()")
                content = content[0] if content else ""

                # Laugh count — values like "1.5万" mean 1.5 * 10000.
                # Guarded like the fields above; default "0" when missing.
                laught_list = li.xpath(".//div[@class='recmd-num']/span[1]/text()")
                laught_num = laught_list[0] if laught_list else "0"
                laught_num = int(float(laught_num[:-1]) * 10000) if "万" in laught_num else int(laught_num)

                # Comment count — span[4] is absent when there are no comments.
                comment_num = li.xpath(".//div[@class='recmd-num']/span[4]/text()")
                comment_num = int(comment_num[0]) if comment_num else 0

                qiu_dict = {
                    "pic": pic,
                    "nike_name": nike_name,
                    "content": content,
                    "laught_num": laught_num,
                    "comment_num": comment_num,
                }
                print(self.count, qiu_dict)
                self.count += 1


if __name__ == '__main__':
    # Entry point: instantiate the scraper and invoke it directly
    # (Qiu.__call__ kicks off the crawl).
    Qiu()()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小小争520

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值