将单机爬虫jobbole修改为分布式爬虫
伯乐在线爬虫如下:
blog.py
-
# -*- coding: utf-8 -*-
-
import scrapy
-
from ..items import JobboleItem
-
from ..items import ArticleItemLoader
-
class BlogSpider(scrapy.Spider):
    """Crawl blog.jobbole.com list pages and scrape, for every article:
    title, cover-image URL, publish date, detail-page URL, up-votes,
    bookmarks and comment count.

    Field cleanup (stripping dates, extracting numbers) is delegated to
    the input processors declared on JobboleItem via ArticleItemLoader.
    """
    name = 'blog'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """Parse one list page and queue a detail request per article."""
        for post in response.xpath('//div[@class="post floated-thumb"]'):
            img = post.xpath('.//div[@class="post-thumb"]/a/img/@src').extract_first('')
            url = post.xpath('.//a[@class="archive-title"]/@href').extract_first('')
            # Fix: extract_first('') returns '' when the link is missing;
            # yielding a Request with an empty URL raises ValueError, so
            # skip such posts instead.
            if not url:
                continue
            # Pass the cover image through meta so the detail callback can
            # attach it to the item.
            yield scrapy.Request(url=url, meta={'img': img},
                                 callback=self.get_detail_with_url)

        # Pagination was disabled in the original; kept as a reference.
        # next_url = response.xpath('//a[@class="next page-numbers"]/@href').extract_first('')
        # if next_url:
        #     yield scrapy.Request(url=next_url, callback=self.parse)

    def get_detail_with_url(self, response):
        """Parse an article detail page.

        An ArticleItemLoader collects raw values per field; each field's
        input processor cleans them and the loader's TakeFirst output
        processor unwraps the single value. load_item() then builds the
        JobboleItem that is yielded to the pipelines.
        """
        loader = ArticleItemLoader(item=JobboleItem(), response=response)
        # add_xpath(field_name, xpath) queues raw values for a field.
        loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
        loader.add_xpath('date_time', '//div[@class="entry-meta"]/p/text()')
        loader.add_xpath('dian_zan', '//div[@class="post-adds"]//h10/text()')
        loader.add_xpath('book_mark', '//span[contains(@class,"bookmark-btn")]/text()')
        loader.add_xpath('comment', '//a[@href="#article-comment"]/span/text()')
        # add_value() injects literal values (image URL from meta, page URL).
        loader.add_value('img', [response.meta['img']])
        loader.add_value('detail_url', response.url)
        yield loader.load_item()
items.py
-
# -*- coding: utf-8 -*-
-
# Define here the models for your scraped items
-
#
-
# See documentation in:
-
# https://doc.scrapy.org/en/latest/topics/items.html
-
import scrapy
-
from scrapy.loader import ItemLoader
-
from scrapy.loader.processors import MapCompose ,TakeFirst
-
import re
-
# itemload是分离数据的另外一种方式 使用itemloader加载器
-
# 有这样一些优势:
-
# 1.默认使用xpath()/css()这种数据提取方式
-
# 是将数据的提取和数据的过滤等过程放在一个函数中
-
# 采用itemloader这种数据加载方式
-
# 可以将数据的提取和分离分成两部分
-
# 让代码更加清晰,代码更加整洁
-
# 2.可以将数据的处理函数,单独定义
-
# 也可以对一个数据使用多个处理函数
-
# 这样的话对代码的重用有着非常好的实现
-
def changeTitle(value):
    """Prefix a scraped title with the '标题:' label."""
    return '标题:' + value
-
def getNewTime(value):
    """Return the date portion before the '·' separator, whitespace-trimmed."""
    date_part, _, _ = value.partition('·')
    return date_part.strip()
-
def getNum(value):
    """Return the first run of digits in *value* as an int, or 0 if none."""
    match = re.search(r'\d+', value)
    return int(match.group()) if match else 0
-
# 使用itemloader的话 需要先继承itemloadder
-
class ArticleItemLoader(ItemLoader):
    """ItemLoader shared by all JobboleItem fields.

    Extracted values arrive as lists (e.g. ['hello world']); the
    TakeFirst output processor unwraps them so each field stores a
    single scalar instead of a one-element list.
    """
    default_output_processor = TakeFirst()
-
class JobboleItem(scrapy.Item):
    """One scraped Jobbole article.

    Each field's input_processor cleans the raw extracted values;
    MapCompose pipes a value through every listed function in order.
    The loader's default TakeFirst output processor then unwraps the
    resulting one-element list.
    """
    # Cover image URL (forwarded through request meta from the list page).
    img = scrapy.Field()
    # Title, labelled and suffixed by the chained processors.
    title = scrapy.Field(
        input_processor=MapCompose(changeTitle, lambda x: x + '------------------'),
    )
    # Publish date with the trailing '· ...' metadata stripped.
    date_time = scrapy.Field(
        input_processor=MapCompose(getNewTime),
    )
    # Detail-page URL.
    detail_url = scrapy.Field()
    # Up-vote count (raw text from the page).
    dian_zan = scrapy.Field()
    # Bookmark count reduced to a bare integer.
    book_mark = scrapy.Field(
        input_processor=MapCompose(getNum),
    )
    # Comment count reduced to a bare integer.
    comment = scrapy.Field(
        input_processor=MapCompose(getNum),
    )
pipelines.py
-
# -*- coding: utf-8 -*-
-
# Define your item pipelines here
-
#
-
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-
import pymysql
-
from scrapy.pipelines.images import ImagesPipeline
-
class JobbolePipeline(object):
    """Pass-through item pipeline.

    The MySQL persistence code (pymysql connection in __init__, INSERT in
    process_item, cleanup in close_spider) was commented out in the
    original and is omitted here.
    NOTE(review): if the INSERT is revived, use parameterized queries
    instead of str.format — the original built SQL by string formatting,
    which is injection-prone.
    """

    def process_item(self, item, spider):
        # No-op: hand the item to the next pipeline unchanged.
        return item
-
class jobboleDownImage(ImagesPipeline):
    """Stub image-download pipeline (currently disabled in ITEM_PIPELINES)."""

    def get_media_requests(self, item, info):
        # Not implemented: would yield one Request per image URL in the item.
        pass

    def file_path(self, request, response=None, info=None):
        # Storage path relative to IMAGES_STORE; empty until implemented.
        path = ''
        return path
-
# def test(a=1,b=2):
-
#
-
# print('123')
-
# test(1,2)
-
# test(b=2 ,a = 1)
settings.py
-
# -*- coding: utf-8 -*-
-
# Scrapy settings for jobbole project
-
#
-
# For simplicity, this file contains only settings considered important or
-
# commonly used. You can find more settings consulting the documentation:
-
#
-
# https://doc.scrapy.org/en/latest/topics/settings.html
-
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
-
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
-
# Scrapy settings for the jobbole project.
#
# Only commonly used settings are listed; see:
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jobbole'

SPIDER_MODULES = ['jobbole.spiders']
NEWSPIDER_MODULE = 'jobbole.spiders'

# Crawl responsibly by identifying yourself on the user-agent.
#USER_AGENT = 'jobbole (+http://www.yourdomain.com)'

# Do not honour robots.txt (site blocks crawlers otherwise).
ROBOTSTXT_OBEY = False

# --- scrapy-redis settings: make the crawler distributed -----------------
# All machines share one request queue and one seen-request set in redis.
# Use the redis-backed scheduler instead of scrapy's in-memory one.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Deduplicate requests across machines via redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Keep the redis queues when a spider finishes, enabling pause/resume.
SCHEDULER_PERSIST = True
# Address of the machine running redis-server; client machines must point
# this at the server's LAN IP.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# --------------------------------------------------------------------------

# Concurrency / politeness (defaults kept).
#CONCURRENT_REQUESTS = 32
#DOWNLOAD_DELAY = 3
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

#COOKIES_ENABLED = False
#TELNETCONSOLE_ENABLED = False

#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

#SPIDER_MIDDLEWARES = {
#    'jobbole.middlewares.JobboleSpiderMiddleware': 543,
#}

#DOWNLOADER_MIDDLEWARES = {
#    'jobbole.middlewares.JobboleDownloaderMiddleware': 543,
#}

#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines. Enable jobboleDownImage (plus IMAGES_STORE) to download
# cover images; enable RedisPipeline to store scraped items in redis.
ITEM_PIPELINES = {
    'jobbole.pipelines.JobbolePipeline': 300,
    # 'jobbole.pipelines.jobboleDownImage': 1,
    # 'scrapy_redis.pipelines.RedisPipeline': 400,
}
# IMAGES_STORE = ''

# Export example:
# scrapy crawl blog -o wenfeng.json -s FEED_EXPORT_ENCODING=utf-8

# AutoThrottle (disabled by default).
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default).
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
步骤:
1.作为服务端的电脑打开终端输入命令 redis-server 开启redis服务(这个终端在爬虫过程中需要一直开启,不能关闭)
2.服务端打开第二个终端输入命令 redis-cli 进入redis命令行客户端:
3.然后打开Redis的可视化工具RedisDesktopManager,点击左下角新建连接,然后输入name和host ,其他不用改
4.修改jobbole单机爬虫项目的代码(爬虫改为继承scrapy_redis的RedisSpider并设置redis_key,settings.py中加入scrapy-redis的调度器配置)
blog.py
settings.py
5.将修改好代码的项目压缩后发送给作为客户端的电脑
客户端电脑把项目解压后用pycharm打开,然后运行爬虫
服务端电脑也运行爬虫
6.在上面打开的第二个终端中输入:lpush blogspider:start_urls http://blog.jobbole.com/all-posts/