[Web Scraping from Scratch] Grabbing Data from Baidu Tieba

This post presents a Python script that uses the requests and lxml libraries to scrape post titles, links, and pagination info from Baidu Tieba for the keyword 李毅, appending the results to a txt file.
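Each scraped post becomes one tab-separated line in tieba.txt, recording the post title, its link, and the list page it was found on. The titles and post IDs below are made-up placeholders, shown only to illustrate the shape of the output:

title:example post title	link:https://tieba.baidu.com/p/1234567890	page_url:https://tieba.baidu.com/f?ie=utf-8&kw=李毅
title:another post title	link:https://tieba.baidu.com/p/1234567891	page_url:https://tieba.baidu.com/f?ie=utf-8&kw=李毅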

import requests
from lxml import etree

class Tieba(object):
    def __init__(self, keyword):
        self.keyword = keyword
        self.url = "https://tieba.baidu.com/f?ie=utf-8&kw={}".format(keyword)
        self.headers = {
            'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
        }
        print(self.url)

    def get_data(self, url):
        # Fetch the raw page bytes
        response = requests.get(url, headers=self.headers)
        return response.content

    def parse_list_data(self, data):
        # Tieba wraps the thread list in HTML comments, which hides it from XPath;
        # strip the comment markers so the anchors become real elements
        data = data.decode().replace('<!--', '').replace('-->', '')
        # Parse the cleaned markup into an Element tree
        html = etree.HTML(data)
        ele_list = html.xpath('//div[contains(@class,"threadlist_title")]/a[@rel="noopener"]')
        print(len(ele_list))  # debug: how many posts matched on this page
        data_list = []
        # Collect the title and absolute link of each post
        for ele in ele_list:
            temp_dict = {}
            temp_dict['title'] = ele.xpath('./text()')[0]
            temp_dict['link'] = 'https://tieba.baidu.com' + ele.xpath('./@href')[0]
            data_list.append(temp_dict)

        # Grab the next-page link; on the last page the XPath matches nothing,
        # so the index lookup raises IndexError and we return None instead
        try:
            next_page_url = 'https:' + html.xpath('//a[contains(text(),"下一页>")]/@href')[0]
        except IndexError:
            next_page_url = None

        # Return the post data and the next-page link
        return data_list, next_page_url

    def run(self):
        next_page_url = self.url
        while next_page_url:
            page_list_data = self.get_data(next_page_url)
            temp_page_url = next_page_url
            data_list, next_page_url = self.parse_list_data(page_list_data)
            print(next_page_url)
            # Append one tab-separated line per post; open the file once per page,
            # and avoid shadowing the built-in str
            with open('tieba.txt', 'a', encoding='utf8') as f:
                for data in data_list:
                    line = ('title:' + data['title'] + "\t" + 'link:' + data['link'] + "\t" + 'page_url:' + temp_page_url + "\n")
                    f.write(line)

if __name__ == '__main__':
    spider = Tieba("李毅")
    spider.run()
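The decode-and-replace step in parse_list_data is the non-obvious part: Tieba ships the thread list wrapped inside HTML comments, so lxml parses it as a comment node and XPath finds nothing. A minimal self-contained sketch of the effect, using a made-up HTML fragment rather than real Tieba markup:

from lxml import etree

raw = '<div><!--<div class="threadlist_title"><a rel="noopener" href="/p/123">demo title</a></div>--></div>'

# With the comment markers intact, the anchor is a comment node, invisible to XPath
html = etree.HTML(raw)
print(html.xpath('//a[@rel="noopener"]/text()'))  # []

# After stripping '<!--' and '-->', the same query succeeds
html = etree.HTML(raw.replace('<!--', '').replace('-->', ''))
print(html.xpath('//a[@rel="noopener"]/text()'))  # ['demo title']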

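One small detail: the keyword is interpolated straight into the query string with format(). requests percent-encodes non-ASCII characters in the URL for you, so this works as written, but letting requests build the query string is a bit more robust. A minimal variant (my suggestion, not part of the original script):

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
# Let requests build and encode the query string instead of format()
response = requests.get(
    'https://tieba.baidu.com/f',
    params={'ie': 'utf-8', 'kw': '李毅'},
    headers=headers,
)
print(response.url)  # https://tieba.baidu.com/f?ie=utf-8&kw=%E6%9D%8E%E6%AF%85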