爬虫:爬取糗事百科数据

(原文此处有插图,网页提取时丢失)

import requests
from lxml import etree
from fake_useragent import UserAgent


class Qiu:
    """Scraper for qiushibaike.com's hot-post pages.

    Walks every paginated listing page and prints one dict per post
    containing: picture URL, author nickname, content text, laugh count,
    and comment count. Posts are numbered with a running counter.
    """

    def __init__(self):
        # Running counter for printed posts; shared across all pages.
        self.count = 1

    def __call__(self, *args, **kwargs):
        # Calling the instance starts the whole crawl.
        self.get_max_page()

    def get_html(self, base_url):
        """Fetch *base_url* and return the parsed lxml HTML tree.

        A random User-Agent is generated per request to reduce the chance
        of being blocked.
        """
        headers = {"User-Agent": UserAgent().random}
        # A timeout keeps the scraper from hanging forever on a dead
        # connection (the original call had none).
        response = requests.get(base_url, headers=headers, timeout=10)
        return etree.HTML(response.text)

    def get_max_page(self):
        """Read the pagination links to find the last page, then scrape all pages."""
        base_url = "https://www.qiushibaike.com/8hr/page/2/"
        html_xml = self.get_html(base_url)
        # The last 'page-numbers' span holds the highest page number.
        max_page = int(html_xml.xpath("//a/span[@class='page-numbers']/text()")[-1].strip())
        self.get_data(max_page)

    def get_data(self, max_page):
        """Scrape pages 1..max_page and print one dict per post."""
        for page in range(1, max_page + 1):
            print("===================第{}页开始下载=========================".format(page))
            page_url = "https://www.qiushibaike.com/8hr/page/{}/".format(page)
            html_xml = self.get_html(page_url)
            # Narrow the search to the <li> element of each post.
            li_list = html_xml.xpath("//li[contains(@id, 'qiushi_tag_')]")

            for li in li_list:
                # Picture URL — guard against posts without a thumbnail
                # (original indexed [0] unconditionally: IndexError risk).
                pic_list = li.xpath(".//a[contains(@class, 'recmd-left')]/img/@src")
                pic = pic_list[0] if pic_list else ""
                # Strip the 12-char "/w/150/h/112" thumbnail suffix to get
                # the full-size image; otherwise treat as no picture.
                pic = "https:" + pic[:-12] if "/w/150/h/112" in pic else ""

                # Author nickname (may be absent on some entries).
                name_list = li.xpath(".//span[@class='recmd-name']/text()")
                nike_name = name_list[0] if name_list else ""

                # Post text (may be empty for pure-image posts).
                content = li.xpath(".//a[@class='recmd-content']/text()")
                content = content[0] if content else ""

                # Laugh count — values like "1.5万" mean 1.5 * 10000.
                # Guarded like the fields above; default "0" when missing.
                laught_list = li.xpath(".//div[@class='recmd-num']/span[1]/text()")
                laught_num = laught_list[0] if laught_list else "0"
                laught_num = int(float(laught_num[:-1]) * 10000) if "万" in laught_num else int(laught_num)

                # Comment count — span[4] is absent when there are no comments.
                comment_num = li.xpath(".//div[@class='recmd-num']/span[4]/text()")
                comment_num = int(comment_num[0]) if comment_num else 0

                qiu_dict = {
                    "pic": pic,
                    "nike_name": nike_name,
                    "content": content,
                    "laught_num": laught_num,
                    "comment_num": comment_num,
                }
                print(self.count, qiu_dict)
                self.count += 1


if __name__ == '__main__':
    # Entry point: instantiate the scraper and invoke it directly
    # (Qiu.__call__ kicks off the crawl).
    Qiu()()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小小争520

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值