Python crawler stalls after the first few pages — Scrapy crawl of Qidian free novels stops at page 4

As the title says: while testing my spider, the terminal showed the following log output:

```
2019-04-20 15:04:51 [scrapy.core.engine] DEBUG: Crawled (200) (referer: https://www.qidian.com/free/a... ['cached']
2019-04-20 15:04:52 [qd] DEBUG: Hi, this is an item page! https://www.qidian.com/free/a...
2019-04-20 15:05:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 4 pages/min), scraped 40 items (at 40 items/min)
2019-04-20 15:06:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 0 pages/min), scraped 40 items (at 0 items/min)
2019-04-20 15:07:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 0 pages/min), scraped 40 items (at 0 items/min)
2019-04-20 15:08:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 0 pages/min), scraped 40 items (at 0 items/min)
```

settings.py:

```python
BOT_NAME = 'qidian'

SPIDER_MODULES = ['qidian.spiders']
NEWSPIDER_MODULE = 'qidian.spiders'

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3

DOWNLOADER_MIDDLEWARES = {
    'qidian.middlewares.QidianDownloaderMiddleware': 543,
    'qidian.middlewares.RandomUserAgentMiddlware': 300,
    'qidian.middlewares.ProxyMiddleware': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

ITEM_PIPELINES = {
    'qidian.pipelines.QidianPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 100,
}

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'qidian'
MONGODB_DOCNAME = 'novel_free'
MONGODB_USER = 'root'
MONGODB_PSW = 'root'

# redis config
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = False
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
```
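Note that the ProxyMiddleware in middlewares.py (further down) reads its proxy list with `crawler.settings.get('PROXIES')`, which is not shown in the settings above. A minimal sketch of the shape that setting needs — the addresses here are placeholders, shown only for the format:

```python
# Hypothetical example only: ProxyMiddleware does
# random.choice(settings.get('PROXIES')) and puts the result into
# request.meta['proxy'], so PROXIES must be a list of proxy URLs
# that include the scheme. These addresses are placeholders.
PROXIES = [
    'http://127.0.0.1:8888',
    'http://127.0.0.1:8889',
]
```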

dq.py:

```python
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from qidian.items import QidianItem
from scrapy_redis.spiders import RedisCrawlSpider
import time


class QdSpider(RedisCrawlSpider):
    name = 'qd'
    allowed_domains = ['qidian.com']
    # start_urls = ['https://www.qidian.com/free/all']
    redis_key = 'qidian:start_urls'

    rules = (
        Rule(LinkExtractor(restrict_css='li.lbf-pagination-item > a.lbf-pagination-next'),
             callback='parse_data', follow=True),
    )

    def parse_data(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        for book_info in response.xpath('//ul[@class="all-img-list cf"]/li'):
            yield {
                "title": book_info.xpath('./div[2]/h4/a/text()').extract_first().strip(),
                "author": book_info.xpath('./div[2]/p[1]/a[1]/text()').extract_first().strip(),
                "brief": book_info.xpath('./div[2]/p[2]/text()').extract_first().strip(),
                "url": 'http:' + book_info.xpath('./div[2]/h4/a/@href').extract_first(),
            }
```
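Since `start_urls` is commented out and the spider takes its start URL from `redis_key = 'qidian:start_urls'`, the crawl only begins after a URL has been pushed onto that Redis list. A minimal sketch of how the queue can be seeded, assuming redis-py is installed and Redis is running on 127.0.0.1:6379 as in the settings:

```python
# Minimal sketch: push the start URL into the list that the
# RedisCrawlSpider polls. Assumes redis-py and a local Redis instance.
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('qidian:start_urls', 'https://www.qidian.com/free/all')
```

Running `LPUSH qidian:start_urls https://www.qidian.com/free/all` in redis-cli does the same thing.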

middlewares.py (downloader middleware):

```python
from scrapy import signals
import random
from fake_useragent import UserAgent


class QidianSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class QidianDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ProxyMiddleware(object):
    """
    Attach a proxy to each request.
    """

    def __init__(self, ip):
        self.ip = ip

    @classmethod
    def from_crawler(cls, crawler):
        return cls(ip=crawler.settings.get('PROXIES'))

    def process_request(self, request, spider):
        # Pick a random proxy from the PROXIES setting for this request.
        ip = random.choice(self.ip)
        request.meta['proxy'] = ip


class RandomUserAgentMiddlware(object):
    """
    Rotate the User-Agent at random.
    Mimics and replaces the UserAgentMiddleware class in
    site-packages/scrapy/downloadermiddlewares/useragent.py.
    """

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        # Read RANDOM_UA_TYPE from settings to choose which fake_useragent
        # attribute to use; defaults to 'random', but 'ie', 'firefox', etc.
        # also work.
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    # The User-Agent swap happens in this method.
    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        print(get_ua())
        request.headers.setdefault('User-Agent', get_ua())
```
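For context, the `ua_type` read from `RANDOM_UA_TYPE` simply names an attribute of fake_useragent's `UserAgent` object, so `getattr(self.ua, self.ua_type)` behaves roughly like the following minimal sketch (assuming fake_useragent is installed and can fetch its browser data):

```python
# Minimal sketch of what getattr(self.ua, self.ua_type) resolves to.
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)   # what RANDOM_UA_TYPE = 'random' (the default) uses
print(ua.firefox)  # what RANDOM_UA_TYPE = 'firefox' would use
```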

items.py:

```python
import scrapy


class QidianItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    author = scrapy.Field()
    brief = scrapy.Field()
    url = scrapy.Field()
```

I'm using Redis here because I actually want to build a distributed crawler, but the problems keep coming. When I launch several spider processes they don't crawl in parallel either: only one spider runs while the others just sit there. This one really has me stuck too. Could any expert help out?

The two questions I'd like to ask are:

1. Why does the spider block after crawling only the first few pages (the log above shows it stuck at 4 crawled pages)?
2. Why, when I start several spider instances, does only one of them run while the others stay in a listening state?

