Python crawler: scraping jokes from Qiushibaike (糗事百科)

This code implements a simple Python crawler that scrapes hot jokes from the Qiushibaike website. It first defines request headers and an initial URL, then fetches page content through a get_data method, parses the HTML with the lxml library, and extracts each joke's content, author, like count, and comment count. The data is saved to a JSON file, and the crawler keeps following the next-page link until none remains.
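Each scraped item becomes a small dict before it is written out; a representative record (the field values here are illustrative, not real scraped data) looks like this:

record = {
    "author": "some_user",                                 # joke author's display name
    "href": "https://www.qiushibaike.com/article/123456",  # detail-page URL (illustrative id)
    "content": ["The full joke text..."],                  # list of text nodes from the detail page
    "smile_num": "1024",                                   # like count, kept as a string
    "comment_num": "36",                                   # comment count, kept as a string
}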

# coding=utf-8

"""
author: lei
function: scrape hot jokes from Qiushibaike and save them to a JSON file
"""

import requests
from lxml import etree
import json

class QiuShi(object):

    def __init__(self):
        self.url = "https://www.qiushibaike.com/text/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36", "Referer": "https://www.qiushibaike.com/"}

    def get_data(self, url):
        # Send the request with the browser-like headers defined above
        return requests.get(url, headers=self.headers).content.decode()

    def get_content(self, url):
        # Fetch a joke's detail page and pull out its full text nodes
        response = self.get_data(url)
        html = etree.HTML(response)

        content = html.xpath("//div[@class='content']/text()")

        return content

    def parse_data(self, response):
        html = etree.HTML(response)

        # Each hot joke lives in its own article block
        el_list = html.xpath("//div[@class='article block untagged mb15 typs_hot']")
        print(len(el_list))

        temp_list = []
        for el in el_list:
            temp = {}
            temp["author"] = el.xpath("./div[1]/a[2]/h2/text()")[0].strip("\n")
            temp["href"] = "https://www.qiushibaike.com" + el.xpath("./a[1]/@href")[0]
            temp["content"] = self.get_content(temp["href"])
            temp["smile_num"] = el.xpath("./div[2]/span[1]/i/text()")[0]
            temp["comment_num"] = el.xpath("./div[2]/span[2]/a/i/text()")[0]
            temp_list.append(temp)

        # The next-page link is absent on the last page, so indexing raises IndexError
        try:
            next_url = "https://www.qiushibaike.com" + html.xpath("//span[@class='next']/../../a/@href")[0]
        except IndexError:
            next_url = None

        return temp_list, next_url

    def save_data(self, temp_list):
        # Append one JSON array per page; the trailing newline keeps records separable
        with open("qiushi.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(temp_list, ensure_ascii=False) + "\n")
            print("Saved successfully!")
        print(temp_list)

    def run(self):
        next_url = self.url

        while next_url:
            response = self.get_data(next_url)
            temp_list, next_url = self.parse_data(response)
            self.save_data(temp_list)

if __name__ == '__main__':
    qiushi = QiuShi()
    qiushi.run()
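
As written, get_data fires requests as fast as the loop runs and will crash on any transport error. A minimal hardening sketch (the timeout, retry count, and delay below are illustrative choices, not part of the original code):

import time

import requests


def get_data_safely(url, headers, retries=3, delay=1.0, timeout=10):
    # Fetch a page politely: per-request timeout, a few retries, and a pause between attempts
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # turn HTTP 4xx/5xx responses into exceptions
            return resp.content.decode()
        except requests.RequestException as e:
            print(f"request failed ({attempt + 1}/{retries}): {e}")
            time.sleep(delay)  # back off briefly before the next attempt
    return None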

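Note that because save_data appends one JSON array per page, qiushi.json ends up as a sequence of JSON lines rather than a single JSON document, so json.load on the whole file will fail. A small sketch for reading everything back (assuming the one-array-per-line format written above):

import json


def load_jokes(path="qiushi.json"):
    # Each non-empty line holds the list of jokes scraped from one page
    jokes = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                jokes.extend(json.loads(line))
    return jokes


# Example usage:
# print(len(load_jokes()))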