from scrapy import signals
from fake_useragent import UserAgent
class RandomUserAgentMiddleware(object):
    """Downloader middleware: set a random User-Agent and a local proxy on every request.

    Also overwrites the response status code before the response reaches the
    spider, and logs any download exception.
    """

    def __init__(self, user_agent):
        # Fallback UA taken from settings (kept for compatibility);
        # the per-request random UA comes from fake_useragent below.
        self.user_agent = user_agent
        self.ua = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # Build the middleware from crawler settings and hook spider_opened.
        s = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def spider_opened(self, spider):
        # BUG FIX: from_crawler connects this handler, but the original class
        # never defined it, raising AttributeError when the crawler started.
        pass

    def process_request(self, request, spider):
        # Before each request: randomize the UA header.
        request.headers['User-Agent'] = self.ua.random
        # Route the request through the local proxy.
        request.meta['proxy'] = 'http://127.0.0.1:9743'

    def process_response(self, request, response, spider):
        # After each request: overwrite the status code, then return the
        # response so it continues on to the spider.
        response.status = 201
        return response

    def process_exception(self, request, exception, spider):
        # BUG FIX: the original signature was missing `self`.
        # Log the exception; returning None lets other middlewares handle it
        # (may also return a Response or Request).
        print('======>', repr(exception))
        return None
# middlewares.py
# Register the spider middlewares below under SPIDER_MIDDLEWARES in settings.py.
from scrapy.exceptions import CloseSpider
class Close_spider(object):
    """Spider middleware: report non-2xx responses, pass results through, log exceptions."""

    def process_spider_input(self, response, spider):
        # Runs before the spider processes the response; report failed fetches.
        ok = 200 <= response.status <= 300
        if not ok:
            print('失败 url(%s) stcode(%s)' % (response.url, response.status))
            # raise CloseSpider('%s爬虫异常,退出!' % response.url)
        return None

    def process_spider_output(self, response, result, spider):
        # Runs after the spider has handled the response; forward every
        # item/request unchanged (result is an iterable).
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Runs when the spider raises; log it and return None so other
        # middlewares may handle it (may also return a Response or iterable).
        print('======>', repr(exception))
        return None
from logging import getLogger

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
# BUG FIX: original read `from selenium webdriver.common.by import By`
# (missing dot), which is a SyntaxError.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class SeleniumMiddleware(object):
    """Downloader middleware that renders pages with a Chrome webdriver.

    Intercepts each request, loads the URL in a real browser, and returns an
    HtmlResponse built from the rendered page source, bypassing Scrapy's own
    downloader.
    """

    def __init__(self, timeout=None):
        self.logger = getLogger(__name__)
        # Page-load / explicit-wait timeout in seconds
        # (from the SELENIUM_TIMEOUT setting via from_crawler).
        self.timeout = timeout
        self.bw = webdriver.Chrome()
        self.bw.set_window_size(1400, 700)
        self.bw.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.bw, self.timeout)

    def __del__(self):
        # BUG FIX: the original defined `__def__`, which Python never calls,
        # so the browser window was leaked. `__del__` closes it on GC.
        self.bw.close()

    @classmethod
    def from_crawler(cls, crawler):
        # Build the middleware with the timeout configured in settings.py.
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))

    def process_request(self, request, spider):
        """Render request.url in Chrome and return the page as an HtmlResponse.

        On a page-load timeout, return a 500 HtmlResponse so the failure is
        visible downstream instead of propagating the exception.
        """
        self.logger.debug('Chorme is starting')
        try:
            # BUG FIX: original called self.get(...); the driver is self.bw.
            self.bw.get(request.url)
            # Wait until the element with id="q" is present before
            # snapshotting the page source.
            self.wait.until(EC.presence_of_element_located((By.ID, 'q')))
            return HtmlResponse(url=request.url, body=self.bw.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException as e:
            print(repr(e))
            # BUG FIX: original referenced the undefined name `reqest`.
            return HtmlResponse(url=request.url, status=500, request=request)