
29. Python Multiprocessing Crawler Example

A comprehensive example that combines the earlier material:

1. Save the web crawler data from the previous sections into a MongoDB database.

2. Use multiple processes, organized as a producer-consumer model (a minimal sketch of the pattern follows, then the full crawler).
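Before the full crawler, here is a minimal, self-contained sketch of the producer-consumer pattern on a multiprocessing.Queue. The task strings and worker count are placeholders, not part of the crawler. The detail that matters is the sentinel handling: the producer must enqueue one None per consumer, otherwise all but one consumer block forever on q.get(). The crawler below applies the same fix.

from multiprocessing import Process, Queue

def producer(q, n_consumers):
    for i in range(10):           # stand-in for the real URL generator
        q.put('task-{0}'.format(i))
    for _ in range(n_consumers):  # one sentinel per consumer
        q.put(None)

def consumer(q):
    while True:
        task = q.get()            # blocks until an item is available
        if task is None:          # sentinel: no more work
            break
        print 'processing', task

if __name__ == '__main__':
    q = Queue()
    workers = [Process(target=consumer, args=(q,)) for _ in range(4)]
    p = Process(target=producer, args=(q, len(workers)))
    p.start()
    for w in workers:
        w.start()
    p.join()
    for w in workers:
        w.join()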

 

The full crawler code:

import requests
from lxml import etree
from pymongo import MongoClient
from multiprocessing import Process, Queue


class DoubanSpider(object):
    @staticmethod
    def getAllPageCount():
        # Read the last page number from the pagination bar and turn it into the final offset
        response = requests.get('https://read.douban.com/kind/1?start=0')
        selector = etree.HTML(response.text)
        allPageCount = selector.xpath('//div[@class="pagination"]/ul/li[9]/a/text()')[0]
        alls = int(allPageCount)
        return alls * 20 - 20   # 20 books per page, offsets count from 0

    # Crawl the Douban ebook listing page by page, e.g.
    # https://read.douban.com/kind/1?start=0&sort=hot&promotion_only=False&min_price=None&max_price=None&works_type=None
    @staticmethod
    def getBooksOfAllList(basurl='https://read.douban.com/kind/1?start={0}'):
        start = 20  # starting offset of the query
        end = DoubanSpider.getAllPageCount()   # offset of the last page
        print start, end
        while start <= end:
            url = basurl.format(start)
            print url
            response = requests.get(url)
            selector = etree.HTML(response.text)
            all = selector.xpath('//ul[@class="list-lined ebook-list column-list"]/li[@class="item store-item"]')

            for a in all:
                title = a.xpath('div[@class="info"]/div[@class="title"]/a/text()')[0]
                author = a.xpath('div[@class="info"]/p[1]/span/span[2]/a/text()')[0]
                # Not every book has a translator; check the node exists before taking its text
                translater = a.xpath('div[@class="info"]/p[1]/span[@class="meta-item"]/span[@class="labeled-text"]/a')
                if not translater:
                    translater = None
                else:
                    translater = a.xpath('div[@class="info"]/p[1]/span[@class="meta-item"]/span[@class="labeled-text"]/a/text()')[0]
                # The rating is optional as well
                rate = a.xpath('div[@class="info"]/div[@class="rating list-rating"]/span[@class="rating-average"]')
                if not rate:
                    rate = None
                else:
                    rate = a.xpath('div[@class="info"]/div[@class="rating list-rating"]/span[@class="rating-average"]/text()')[0]
                bref = a.xpath('div[@class="info"]/div[@class="article-desc-brief"]/text()')[0]
                yield title, author, translater, rate, bref
            start += 20
            print start

    # Generate the URL of every listing page
    @staticmethod
    def getURLList(basurl='https://read.douban.com/kind/1?start={0}'):
        start = 0  # starting offset of the query
        end = 56620  # hard-coded final offset; DoubanSpider.getAllPageCount() would compute it dynamically
        print start, end
        while start <= end:
            url = basurl.format(start)
            yield url
            start += 20

    # Scrape the book entries from a single listing page
    @staticmethod
    def getContentByUrl(url):
        print url
        response = requests.get(url)
        selector = etree.HTML(response.text)
        all = selector.xpath('//ul[@class="list-lined ebook-list column-list"]/li[@class="item store-item"]')
        for a in all:
            title = a.xpath('div[@class="info"]/div[@class="title"]/a/text()')[0]
            author = a.xpath('div[@class="info"]/p[1]/span/span[2]/a/text()')[0]
            # Not every book has a translator; check the node exists before taking its text
            translater = a.xpath('div[@class="info"]/p[1]/span[@class="meta-item"]/span[@class="labeled-text"]/a')
            if not translater:
                translater = None
            else:
                translater = \
                    a.xpath('div[@class="info"]/p[1]/span[@class="meta-item"]/span[@class="labeled-text"]/a/text()')[0]
            # The rating is optional as well
            rate = a.xpath('div[@class="info"]/div[@class="rating list-rating"]/span[@class="rating-average"]')
            if not rate:
                rate = None
            else:
                rate = \
                    a.xpath('div[@class="info"]/div[@class="rating list-rating"]/span[@class="rating-average"]/text()')[0]
            bref = a.xpath('div[@class="info"]/div[@class="article-desc-brief"]/text()')[0]
            yield title, author, translater, rate, bref


# Producer: pushes listing-page URLs onto the shared queue
class DouBanProducer(Process):
    def __init__(self, q, n_consumers):
        Process.__init__(self)
        self._q = q
        self._n_consumers = n_consumers

    def run(self):
        for url in DoubanSpider.getURLList():
            self._q.put(url)
            print url
        # A single None sentinel would stop only one consumer and leave the
        # others blocked on q.get() forever, so enqueue one per consumer.
        for _ in range(self._n_consumers):
            self._q.put(None)


# Consumer: takes URLs off the queue and stores the scraped books
class DouBanConsumer(Process):
    def __init__(self, q):
        Process.__init__(self)
        self._q = q

    def run(self):
        while True:
            url = self._q.get()  # blocks until an item is available
            if url is None:      # sentinel: the producer is done
                break
            self.saveBooks(url)

    # Save the books scraped from one page to MongoDB
    def saveBooks(self, url):
        conn = MongoClient('192.168.216.7')
        db = conn.qianfeng
        doubanbooks = db.doubanbooks
        lable = ['title', 'author', 'translater', 'rate', 'bref']
        books = []
        for value in DoubanSpider.getContentByUrl(url):
            book = dict(zip(lable, value))
            books.append(book)
            print book
        if books:  # insert_many raises InvalidOperation on an empty list
            doubanbooks.insert_many(books, ordered=False)
        conn.close()


if __name__ == "__main__":
    dbs = DoubanSpider()
    # Basic test 1: single process, crawl every page sequentially
    # for title, author, translater, rate, bref in DoubanSpider.getBooksOfAllList():
    #     print title, author, translater, rate, bref

    # Basic test 2: single process, iterate over the generated URLs
    # for url in dbs.getURLList():
    #     for title, author, translater, rate, bref in dbs.getContentByUrl(url):
    #         print title, author, translater, rate, bref

    # Multiprocess crawl: one producer, four consumers
    NUM_CONSUMERS = 4
    q = Queue()
    producer = DouBanProducer(q, NUM_CONSUMERS)
    consumers = [DouBanConsumer(q) for i in range(NUM_CONSUMERS)]
    producer.start()
    for c in consumers:
        c.start()

    producer.join()
    for c in consumers:
        c.join()
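Two design notes on the storage side. Creating the MongoClient inside saveBooks, i.e. inside each consumer process, is the right call: pymongo clients are not fork-safe and must not be shared across processes, though opening one client per process (for example in run()) rather than one per page would be cheaper. And insert_many(..., ordered=False) keeps inserting the remaining documents of a batch even if one of them fails, which suits a crawler where the odd bad record is acceptable.

After a run, the stored data can be spot-checked from an interactive session. A minimal sketch, assuming the same host (192.168.216.7), database (qianfeng), and collection (doubanbooks) as above; count() is the older driver call, newer pymongo versions use count_documents({}):

from pymongo import MongoClient

conn = MongoClient('192.168.216.7')
books = conn.qianfeng.doubanbooks
print books.count()                                   # total books stored
print books.find_one({'translater': {'$ne': None}})   # a sample translated book
conn.close()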

 

 

 
