二十九、Python 多进程爬虫案例
综合案例:
1、将前面的网页爬虫数据保存到MongoDB数据库
2、使用多进程(使用生产者消费者模型)
代码:
import requests
from lxml import etree
import time
from pymongo import MongoClient
from multiprocessing import Process, Queue
class DoubanSpider(object):
    """Scraper for Douban Read (read.douban.com) e-book listings.

    All methods are static; the class is only a namespace. Network access
    goes through ``requests`` and HTML parsing through ``lxml.etree``.
    Each book is yielded as a 5-tuple:
    ``(title, author, translater, rate, bref)``.
    """

    @staticmethod
    def getAllPageCount():
        """Return the ``start`` offset of the last listing page.

        Reads the pagination widget on the first page; the 9th <li> holds the
        total page count. Each page shows 20 books, so the last page starts
        at ``pages * 20 - 20``.
        """
        response = requests.get('https://read.douban.com/kind/1?start=0')
        selector = etree.HTML(response.text)
        page_count_text = selector.xpath('//div[@class="pagination"]/ul/li[9]/a/text()')[0]
        # int() parses the unicode digits directly; the original
        # (int)(text.encode('utf8')) round-trip through bytes was unnecessary.
        return int(page_count_text) * 20 - 20  # 20 books per page, 0-based offsets

    @staticmethod
    def _parseBooks(selector):
        """Yield (title, author, translater, rate, bref) tuples from one
        parsed listing page.

        Shared by getBooksOfAllList() and getContentByUrl(), which previously
        duplicated this parsing code line for line.
        """
        items = selector.xpath('//ul[@class="list-lined ebook-list column-list"]/li[@class="item store-item"]')
        for item in items:
            title = item.xpath('div[@class="info"]/div[@class="title"]/a/text()')[0]
            author = item.xpath('div[@class="info"]/p[1]/span/span[2]/a/text()')[0]
            # Translator and rating are optional on the page: probe for the
            # node first and fall back to None when it is absent.
            translater_nodes = item.xpath('div[@class="info"]/p[1]/span[@class="meta-item"]/span[@class="labeled-text"]/a')
            if translater_nodes:
                translater = item.xpath('div[@class="info"]/p[1]/span[@class="meta-item"]/span[@class="labeled-text"]/a/text()')[0]
            else:
                translater = None
            rate_nodes = item.xpath('div[@class="info"]/div[@class="rating list-rating"]/span[@class="rating-average"]')
            if rate_nodes:
                rate = item.xpath('div[@class="info"]/div[@class="rating list-rating"]/span[@class="rating-average"]/text()')[0]
            else:
                rate = None
            bref = item.xpath('div[@class="info"]/div[@class="article-desc-brief"]/text()')[0]
            yield title, author, translater, rate, bref

    # Listing URL pattern:
    # https://read.douban.com/kind/1?start=0&sort=hot&...
    @staticmethod
    def getBooksOfAllList(basurl='https://read.douban.com/kind/1?start={0}'):
        """Crawl every listing page (starting at offset 20, i.e. page 2) and
        yield each book tuple.

        The end offset is discovered dynamically via getAllPageCount().
        """
        start = 20  # first offset to fetch
        end = DoubanSpider.getAllPageCount()  # offset of the last page
        print(start, end)
        while start <= end:
            url = basurl.format(start)
            print(url)
            response = requests.get(url)
            selector = etree.HTML(response.text)
            for book in DoubanSpider._parseBooks(selector):
                yield book
            start += 20
            print(start)

    @staticmethod
    def getURLList(basurl='https://read.douban.com/kind/1?start={0}'):
        """Yield every listing-page URL from offset 0 through ``end``.

        NOTE(review): the end offset is hard-coded to 56620; the dynamic
        DoubanSpider.getAllPageCount() call is left commented out, presumably
        to save one extra request -- confirm the site's page count.
        """
        start = 0  # first offset
        end = 56620  # DoubanSpider.getAllPageCount()  # last-page offset
        print(start, end)
        while start <= end:
            url = basurl.format(start)
            yield url
            start += 20

    @staticmethod
    def getContentByUrl(url):
        """Fetch one listing page and yield its book tuples."""
        print(url)
        response = requests.get(url)
        selector = etree.HTML(response.text)
        for book in DoubanSpider._parseBooks(selector):
            yield book
#生产者类
class DouBanProducer(Process):
    """Producer process: feeds listing-page URLs into the shared queue."""

    def __init__(self, q):
        # Inter-process queue shared with the consumers.
        Process.__init__(self)
        self._q = q

    def run(self):
        """Enqueue every listing URL, then a single None end-of-work sentinel.

        NOTE(review): with several consumers only one of them will receive
        the lone sentinel; the consumer side must re-queue it (or the caller
        must enqueue one per consumer) for all of them to terminate.
        """
        enqueue = self._q.put
        for page_url in DoubanSpider.getURLList():
            enqueue(page_url)
            print(page_url)
        enqueue(None)
class DouBanConsumer(Process):
    """Consumer process: pulls listing-page URLs from the shared queue,
    scrapes each page and bulk-inserts the books into MongoDB."""

    def __init__(self, q):
        # Inter-process queue shared with the producer.
        Process.__init__(self)
        self._q = q

    def run(self):
        """Consume URLs until the None sentinel appears.

        The producer enqueues only ONE sentinel, so before exiting we put it
        back on the queue for the next consumer -- this answers the original
        comment's concern and lets any number of consumers terminate cleanly.
        """
        while True:
            # Queue.get() blocks until an item is available; the previous
            # bare `except: sleep` wrapper hid real errors for no benefit.
            url = self._q.get()
            if url is None:  # explicit sentinel check (a falsy '' is still a URL)
                self._q.put(None)  # re-queue the sentinel for sibling consumers
                break
            self.saveBooks(url)

    def saveBooks(self, url):
        """Scrape one listing page and insert its books into the
        ``qianfeng.doubanbooks`` collection on 192.168.216.7."""
        conn = MongoClient('192.168.216.7')
        try:
            doubanbooks = conn.qianfeng.doubanbooks
            fields = ['title', 'author', 'translater', 'rate', 'bref']
            books = [dict(zip(fields, values))
                     for values in DoubanSpider.getContentByUrl(url)]
            print(books)
            # insert_many() raises InvalidOperation on an empty list, so
            # skip pages that yielded no books.
            if books:
                doubanbooks.insert_many(books, ordered=False)
        finally:
            conn.close()  # release the connection even if scraping/insert fails
if __name__ == "__main__":
    # Producer/consumer crawl: one producer feeds page URLs into a shared
    # queue; four consumer processes scrape and persist the pages in parallel.
    q = Queue()
    producer = DouBanProducer(q)
    consumers = [DouBanConsumer(q) for _ in range(4)]
    producer.start()
    for consumer in consumers:
        consumer.start()
    # Wait for the producer too -- the original only joined the consumers,
    # leaving the producer's completion unobserved.
    producer.join()
    for consumer in consumers:
        consumer.join()