miniScrapy: a mini crawler modeled on the Scrapy architecture, crawling the Douban Books Top250

The Scrapy architecture

[Figure: Scrapy architecture diagram]
Reference: https://www.cnblogs.com/miaoning/p/11626563.html
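
The data flow in the diagram (the engine pulls a request from the scheduler, hands it to the downloader, passes the response to the spider, and sends each parsed item to the item pipeline) can be boiled down to a few lines. The sketch below is purely illustrative; these names are stand-ins for Scrapy's components, not its real API:

# A minimal single-threaded sketch of the Scrapy data flow; the
# component names are illustrative, not Scrapy's actual API.
def engine(scheduler, downloader, spider, pipeline):
    while scheduler:                  # Scheduler: pending requests
        url = scheduler.pop()         # engine pulls the next request
        html = downloader(url)        # Downloader fetches the response
        for item in spider(html):     # Spider parses the response into items
            pipeline(item)            # Item Pipeline persists each item

# Tiny demo with stub components:
engine(
    scheduler=['https://example.com/1', 'https://example.com/2'],
    downloader=lambda url: f'<html>{url}</html>',
    spider=lambda html: [{'raw': html}],
    pipeline=print,
)

In the miniScrapy below, pageQueue plays the scheduler, requests.get inside CrawlThread is the downloader, ParserThread.parse_data is the spider, and the shared book.json file stands in for the item pipeline.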

miniScrapy example code:

import requests
import json
from lxml import etree
from queue import Queue, Empty
import threading

class CrawlThread(threading.Thread):
    """
    Crawler thread: pulls page numbers from the task queue, downloads
    each page, and puts the raw HTML into the data queue.
    """
    def __init__(self, thread_id, page_queue, data_queue):
        super().__init__()
        self.thread_id = thread_id
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 '
                          'Safari/537.36'
        }

    def run(self):
        # Override run(): this is the body executed by the thread
        print(f'Starting crawl thread: {self.thread_id}')
        self.scheduler()
        print(f'Exiting crawl thread: {self.thread_id}')

    # Simulates Scrapy's scheduler: keep taking tasks until the queue is empty
    def scheduler(self):
        while not self.page_queue.empty():
            page = self.page_queue.get()
            print(f'Thread {self.thread_id} downloading page {page}')
            # Douban Books Top250: 25 entries per page, offset via `start`
            url = f'https://book.douban.com/top250?start={page * 25}'

            try:
                # Downloader: fetch the page
                response = requests.get(url, headers=self.headers, timeout=10)
                self.data_queue.put(response.text)
            except Exception as e:
                print(f'Download error: {e}')

class ParserThread(threading.Thread):
    """
    Parser thread: takes downloaded pages from the data queue, extracts
    the book titles and links, and appends them to the output file.
    """
    def __init__(self, thread_id, queue, file, lock):
        super().__init__()
        self.thread_id = thread_id
        self.queue = queue
        self.file = file
        self.lock = lock

    def run(self):
        print(f'Starting parse thread: {self.thread_id}')
        # Keep running while the crawl threads are still producing (flag is
        # True), then drain whatever is left in the queue before exiting.
        while flag or not self.queue.empty():
            try:
                item = self.queue.get(timeout=0.1)
            except Empty:
                continue
            self.parse_data(item)
            self.queue.task_done()
        print(f'Exiting parse thread: {self.thread_id}')

    def parse_data(self, item):
        """
        Parse one downloaded page and write the extracted items.
        :param item: raw HTML of one result page
        """
        try:
            html = etree.HTML(item)
            books = html.xpath('//div[@class="pl2"]')
            for book in books:
                try:
                    title = book.xpath('./a/text()')
                    link = book.xpath('./a/@href')
                    response = {
                        'title': title,
                        'link': link
                    }
                    # Same parsing idea as Scrapy: serialize each item as
                    # JSON. The lock keeps the parser threads from
                    # interleaving their writes, and the trailing newline
                    # makes the output valid JSON Lines.
                    with self.lock:
                        json.dump(response, fp=self.file, ensure_ascii=False)
                        self.file.write('\n')
                except Exception as e:
                    print(f'book error: {e}')

        except Exception as e:
            print(f'page error: {e}')




if __name__ == '__main__':
    # Task queue of page numbers (Top250 = 10 pages of 25 books each)
    pageQueue = Queue(20)
    for page in range(10):
        pageQueue.put(page)

    # Queue of downloaded pages waiting to be parsed
    dataQueue = Queue()

    # Crawl threads
    crawl_threads = []
    crawl_name_list = ['crawl_1', 'crawl_2', 'crawl_3']
    for thread_id in crawl_name_list:
        thread = CrawlThread(thread_id, pageQueue, dataQueue)
        thread.start()
        crawl_threads.append(thread)

    # Write the results to a JSON Lines file
    with open('book.json', 'a', encoding='utf-8') as pipeline_f:

        # Parse threads: they share one file handle, guarded by a lock
        file_lock = threading.Lock()
        parse_threads = []
        parse_name_list = ['parse_1', 'parse_2', 'parse_3']
        flag = True
        for thread_id in parse_name_list:
            thread = ParserThread(thread_id, dataQueue, pipeline_f, file_lock)
            thread.start()
            parse_threads.append(thread)

        # Wait for the crawl threads to finish downloading
        for t in crawl_threads:
            t.join()

        # Signal the parse threads to stop once the data queue is drained
        flag = False
        for t in parse_threads:
            t.join()


    print('Exiting main thread')
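
Because each parser thread writes one JSON object per line, the output is JSON Lines rather than a single JSON document. A minimal sketch of reading it back, assuming the book.json produced above:

import json

# Read back the JSON Lines written by the parser threads
with open('book.json', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record['title'], record['link'])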

Reference on Python multithreading: https://www.runoob.com/python/python-multithreading.html
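
The thread pattern used throughout the example (subclass threading.Thread, override run(), then start() and join()) looks like this in its minimal form:

import threading

class Worker(threading.Thread):
    def __init__(self, name):
        super().__init__(name=name)

    def run(self):                 # run() is the thread body
        print(f'{self.name} is working')

t = Worker('w1')
t.start()                          # begins executing run() concurrently
t.join()                           # blocks until the thread finishes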
