Scraping Qiushibaike data with XPath

The core of the spider: parse the page with lxml's etree, then pull out the <div> node for each post with an XPath query.

from lxml import etree

# Parse the page and extract the list of post <div> nodes
html_data = etree.HTML(html)
list_data = html_data.xpath('.//div[@id="content-left"]/div')
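
The snippet above assumes `html` already holds the raw page source. As a minimal, self-contained sketch of how `etree.HTML` and relative XPath queries work together (the sample markup below is made up for illustration, not Qiushibaike's real page structure):

from lxml import etree

# Hypothetical markup that mimics the nesting the spider's XPath expects
sample = '''
<div id="content-left">
    <div><div><a><h2> user_a </h2></a></div></div>
    <div><div><a><h2> user_b </h2></a></div></div>
</div>
'''

root = etree.HTML(sample)  # parses the HTML string into an element tree
posts = root.xpath('.//div[@id="content-left"]/div')
for post in posts:
    # the leading './' keeps the query relative to the current post node
    names = post.xpath('./div/a/h2/text()')
    print(names[0].strip() if names else None)  # -> user_a, then user_b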

Full code

# -*- coding:utf-8 -*-
import requests
from lxml import etree



class Spider_QSBK(object):
    '''Qiushibaike spider'''
    def __init__(self):
        '''Set up the request headers and the start URL template'''
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
        }
        self.start_url = 'https://www.qiushibaike.com/text/page/{}/'


    def get_url(self):
        '''Build the list of page URLs (only page 1 here; widen the range to crawl more pages)'''
        self.list_url = [self.start_url.format(i) for i in range(1, 2)]


    def send_request(self):
        '''Send the requests and fetch the page content'''
        for url in self.list_url:
            # headers must be passed as a keyword argument; as a bare
            # positional argument it would be treated as `params`
            response = requests.get(url, headers=self.headers)
            html = response.content.decode()
            self.get_data(html)


    def get_data(self, html):
        '''Extract the author, content, and vote/comment counts from the page'''
        html_data = etree.HTML(html)

        # each post lives in a <div> under the #content-left container
        list_data = html_data.xpath('.//div[@id="content-left"]/div')

        for data in list_data:
            # xpath() returns a list; guard against empty results before indexing
            author_nodes = data.xpath('./div/a/h2/text()')
            author = author_nodes[0].strip() if author_nodes else None
            content_nodes = data.xpath('./a/div/span/text()')
            content = content_nodes[0].strip() if content_nodes else None
            stats_nodes = data.xpath('./div/span/i/text()')
            stats = stats_nodes[0] if stats_nodes else None
            count_nodes = data.xpath('./div/span/a/i/text()')
            count = count_nodes[0] if count_nodes else None
            data_dict = dict(
                author=author,
                content=content,
                stats=stats,
                count=count
            )
            self.save_data(data_dict)

    def save_data(self, data_dict):
        '''Append one record per line to a text file'''
        with open('qiubai.txt', 'a', encoding='utf8') as f:
            f.write(str(data_dict) + '\n')


    def run(self):
        # build the URL list
        self.get_url()
        # send the requests
        self.send_request()


if __name__ == '__main__':
    qiushi = Spider_QSBK()
    qiushi.run()
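
One optional refinement, a sketch rather than part of the original spider: `str(data_dict)` writes Python repr lines, which are awkward to parse back later. Using the standard-library `json` module, `save_data` could emit one JSON object per line (JSON Lines) instead:

import json

def save_data(self, data_dict):
    '''Sketch: write one JSON object per line instead of a Python repr'''
    with open('qiubai.txt', 'a', encoding='utf8') as f:
        # ensure_ascii=False keeps the Chinese text readable rather than \uXXXX-escaped
        f.write(json.dumps(data_dict, ensure_ascii=False) + '\n')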
