(27) Crawler: A Comprehensive Class-Based Example

import requests
from fake_useragent import UserAgent
from lxml import etree
# URL manager: tracks pending URLs and already-crawled URLs
class URLManager(object):
    def __init__(self):
        self.new_url = []   # URLs waiting to be crawled
        self.old_url = []   # URLs already crawled

    def get_new_url(self):
        # pop() takes the most recently added URL (LIFO order)
        url = self.new_url.pop()
        self.old_url.append(url)
        return url

    def add_new_url(self, url):
        # skip empty values and anything already queued or crawled
        if url and url not in self.new_url and url not in self.old_url:
            self.new_url.append(url)

    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return self.get_new_url_size() > 0

    def get_new_url_size(self):
        return len(self.new_url)

    def get_old_url_size(self):
        return len(self.old_url)
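
Before wiring everything together, the manager can be sanity-checked on its own. A minimal sketch (not part of the original post) showing the deduplication and the LIFO pop order:

manager = URLManager()
manager.add_new_urls(['/page/1', '/page/2', '/page/1'])  # duplicate is dropped
print(manager.get_new_url_size())   # 2
url = manager.get_new_url()         # '/page/2' -- pop() returns the newest entry
manager.add_new_url(url)            # rejected: already in old_url
print(manager.get_new_url_size())   # 1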

# Downloader

class Downloader:

    def download(self, url):
        # random User-Agent to reduce the chance of being blocked;
        # timeout added so a dead connection cannot hang the crawl
        response = requests.get(url,
                                headers={'User-Agent': UserAgent().random},
                                timeout=10)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        else:
            return None
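
If the site is flaky, a retry wrapper helps. This is a minimal sketch, not part of the original design; download_with_retry, retries, and delay are hypothetical names:

import time

def download_with_retry(url, retries=3, delay=2):
    # retry on network errors; requests.RequestException covers
    # timeouts and connection failures raised by requests.get
    for attempt in range(retries):
        try:
            html = Downloader().download(url)
            if html is not None:
                return html
        except requests.RequestException:
            pass
        time.sleep(delay)
    return None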


# Parser

class Parser:
    def parse(self, html):
        e = etree.HTML(html)
        data_s = self.parse_info(e)
        urls = self.parse_url(e)
        return data_s, urls

    def parse_info(self, e):
        # each joke lives in a <span> under <div class="content">
        spans = e.xpath('''//div[@class='content']/span''')
        data = []
        for span in spans:
            data.append(span.xpath('string(.)'))
        return data

    def parse_url(self, e):
        # pagination links are relative, so join them with the site domain
        url_s = []
        base_url = 'https://www.qiushibaike.com{}'
        for url in e.xpath('''//ul[@class='pagination']/li/a/@href'''):
            url_s.append(base_url.format(url))
        return url_s
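
To see both return values without a network request, the parser can be fed a hand-written fragment. The markup below is made up to mirror the two XPath expressions above:

sample = '''
<div class="content"><span>First joke</span></div>
<div class="content"><span>Second joke</span></div>
<ul class="pagination"><li><a href="/text/page/2/">next</a></li></ul>
'''
data, urls = Parser().parse(sample)
print(data)   # ['First joke', 'Second joke']
print(urls)   # ['https://www.qiushibaike.com/text/page/2/']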

# Data output
class DataOutput:
    def save(self, data_s):
        # append mode, so repeated runs keep accumulating records;
        # a newline separates entries that would otherwise run together
        with open('duanzi.txt', 'a', encoding='utf-8') as f:
            for data in data_s:
                f.write(data + '\n')
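
A quick check that save appends one record per line (note this writes to the same duanzi.txt the crawler uses):

saver = DataOutput()
saver.save(['first entry', 'second entry'])
with open('duanzi.txt', encoding='utf-8') as f:
    print(f.read())   # both entries, one per line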



# Scheduler: wires the four components together
class DiaoDu:
    def __init__(self):
        self.downloader = Downloader()
        self.url_manager = URLManager()
        self.parser = Parser()
        self.data_saver = DataOutput()

    def run(self, url):
        self.url_manager.add_new_url(url)
        # the loop ends on its own: every fetched URL moves into old_url,
        # so re-discovered pagination links are rejected and the queue drains
        while self.url_manager.has_new_url():
            url = self.url_manager.get_new_url()
            html = self.downloader.download(url)
            if html is None:
                # download failed; skip this URL and move on
                continue
            data, urls = self.parser.parse(html)
            self.data_saver.save(data)
            self.url_manager.add_new_urls(urls)


if __name__ == '__main__':
    diao_du = DiaoDu()
    # entry URL: the site's text listing page. The original post truncated this
    # string; the domain comes from base_url in Parser, the path is an assumption.
    diao_du.run('https://www.qiushibaike.com/text/')