As the title says, while testing my crawler the terminal printed the following output:

```
2019-04-20 15:04:51 [scrapy.core.engine] DEBUG: Crawled (200) (referer: https://www.qidian.com/free/a... ['cached']
2019-04-20 15:04:52 [qd] DEBUG: Hi, this is an item page! https://www.qidian.com/free/a...
2019-04-20 15:05:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 4 pages/min), scraped 40 items (at 40 items/min)
2019-04-20 15:06:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 0 pages/min), scraped 40 items (at 0 items/min)
2019-04-20 15:07:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 0 pages/min), scraped 40 items (at 0 items/min)
2019-04-20 15:08:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 0 pages/min), scraped 40 items (at 0 items/min)
```
settings.py code:

```python
BOT_NAME = 'qidian'
SPIDER_MODULES = ['qidian.spiders']
NEWSPIDER_MODULE = 'qidian.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DOWNLOADER_MIDDLEWARES = {
    'qidian.middlewares.QidianDownloaderMiddleware': 543,
    'qidian.middlewares.RandomUserAgentMiddlware': 300,
    'qidian.middlewares.ProxyMiddleware': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
ITEM_PIPELINES = {
    'qidian.pipelines.QidianPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 100,
}
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'qidian'
MONGODB_DOCNAME = 'novel_free'
MONGODB_USER = 'root'
MONGODB_PSW = 'root'
#redis config
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = False
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
```
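(Not pasted above: the ProxyMiddleware further down reads a proxy list via `crawler.settings.get('PROXIES')`. A minimal sketch of what that setting looks like is below; the addresses are placeholders, not my real proxies.)

```python
# Sketch of the PROXIES setting read by ProxyMiddleware below.
# The addresses are placeholders; each entry must be a full URL
# (scheme + host + port) because it is assigned to request.meta['proxy'].
PROXIES = [
    'http://127.0.0.1:8888',
    'http://127.0.0.1:8889',
]
```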
dq.py code:

```python
import scrapy
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
from qidian.items import QidianItem
from scrapy_redis.spiders import RedisCrawlSpider
import time
class QdSpider(RedisCrawlSpider):
    name = 'qd'
    allowed_domains = ['qidian.com']
    # start_urls = ['https://www.qidian.com/free/all']
    redis_key = 'qidian:start_urls'
    rules = {
        Rule(LinkExtractor(restrict_css='li.lbf-pagination-item > a.lbf-pagination-next'),
             callback='parse_data', follow=True)
    }

    def parse_data(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        for book_info in response.xpath('//ul[@class="all-img-list cf"]/li'):
            yield {
                "title": book_info.xpath('./div[2]/h4/a/text()').extract_first().strip(),
                "author": book_info.xpath('./div[2]/p[1]/a[1]/text()').extract_first().strip(),
                "brief": book_info.xpath('./div[2]/p[2]/text()').extract_first().strip(),
                "url": 'http:' + book_info.xpath('./div[2]/h4/a/@href').extract_first()
            }
```
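Since start_urls is commented out and the spider reads its seed from redis_key, the start URL has to be pushed into Redis before any spider instance will do anything. A minimal sketch of that step with the redis-py client (host/port taken from the settings above; this is just an illustration of the step, not my exact command):

```python
# Seed the scrapy-redis start queue: push the listing page onto the
# list named by redis_key so idle RedisCrawlSpider instances pick it up.
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('qidian:start_urls', 'https://www.qidian.com/free/all')
```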
Downloader middleware (middlewares.py) code:

```python
from scrapy import signals
import random
from fake_useragent import UserAgent
class QidianSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class QidianDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class ProxyMiddleware(object):
    '''
    Set the proxy IP.
    '''
    def __init__(self, ip):
        self.ip = ip

    @classmethod
    def from_crawler(cls, crawler):
        return cls(ip=crawler.settings.get('PROXIES'))

    def process_request(self, request, spider):
        ip = random.choice(self.ip)
        request.meta['proxy'] = ip
class RandomUserAgentMiddlware(object):
    '''
    Randomly switch the User-Agent.
    Modeled on (and intended to replace) the UserAgentMiddleware class in
    useragent.py under site-packages/scrapy/downloadermiddlewares.
    '''
    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        # Read RANDOM_UA_TYPE from settings to decide which fake_useragent
        # attribute to use; defaults to "random", but can also be ie, Firefox, etc.
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    # The User-Agent replacement happens in this method
    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)
        print(get_ua())
        request.headers.setdefault('User-Agent', get_ua())
```
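For context on the user-agent middleware: `getattr(self.ua, self.ua_type)` simply resolves to one of fake_useragent's attributes, so RANDOM_UA_TYPE picks which kind of User-Agent string gets returned. A quick sketch of what that means:

```python
# What getattr(ua, ua_type) resolves to in RandomUserAgentMiddlware.
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)   # a random User-Agent string (the default, RANDOM_UA_TYPE = "random")
print(ua.chrome)   # a Chrome User-Agent string
```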
items.py code:

```python
import scrapy
class QidianItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    author = scrapy.Field()
    brief = scrapy.Field()
    url = scrapy.Field()
```

I'm using Redis here because I actually want to build a distributed crawler, but I keep running into problems. Even when I start multiple spider instances they don't crawl in parallel; only one spider is actually running. This has me really stuck. Can anyone help?
The two questions I'd like to ask are:

1. Why does the crawler block after crawling only 3 pages?
2. When I start multiple spider instances, why is only one of them actually crawling while the others just sit in a listening state?