import asyncio
import json
import sys

import nest_asyncio
from curl_cffi.requests import AsyncSession
from scrapy.http import HtmlResponse
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

# On Windows, the default Proactor event loop is incompatible with some
# libraries; fall back to the selector-based loop.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Allow run_until_complete() to be called from inside Scrapy's
# already-running event loop (see process_request below).
nest_asyncio.apply()
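
# A minimal settings.py sketch for wiring this middleware up (the module path
# 'myproject.middlewares' is an assumption; adjust it to your project):
#
#     DOWNLOADER_MIDDLEWARES = {
#         'myproject.middlewares.CurlAsyncDownloaderMiddleware': 543,
#     }
#     DOWNLOAD_TIMEOUT = 30
#     CONCURRENT_REQUESTS = 10
#     # Optional proxy map read by get_proxy() below:
#     # PROXIES = {'http': 'http://user:pass@host:port',
#     #            'https': 'http://user:pass@host:port'}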
class CurlAsyncDownloaderMiddleware:
    """Fetches requests with curl_cffi (browser TLS-fingerprint
    impersonation) instead of Scrapy's default download handler."""

    def __init__(self, max_concurrency=10):
        self.max_concurrency = max_concurrency
        self.timeout = settings.getfloat('DOWNLOAD_TIMEOUT')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            max_concurrency=crawler.settings.getint('CONCURRENT_REQUESTS')
        )
    def get_proxy(self):
        # Prefer proxies configured in settings; the fallback below is a
        # placeholder and must be replaced with a real proxy endpoint.
        if settings.get('PROXIES'):
            proxies = settings['PROXIES']
        else:
            proxies = {'http': "http://host:port", 'https': "http://host:port"}
        return proxies
    async def _fetch(self, request, spider):
        # Headers are passed through request.meta; alternatively they could be
        # read from request.headers.to_unicode_dict() when set on the Request.
        headers = request.meta.get('headers')
        async with AsyncSession() as session:
            for attempt in range(10):
                try:
                    proxies = self.get_proxy()
                    # Load cookies from Redis when the spider provides a
                    # client and a key (the value is a JSON-encoded dict).
                    cookies = {}
                    if hasattr(spider, 'redis_server'):
                        raw = spider.redis_server.get(spider.redis_cookie_key)
                        cookies = json.loads(raw) if raw else {}
                    params = request.meta.get('params')
                    response = await session.get(
                        request.url,
                        impersonate="chrome110",
                        headers=headers,
                        params=params,
                        cookies=cookies or None,
                        proxies=proxies,
                        timeout=self.timeout,
                    )
                    spider.logger.debug('status_code: %s', response.status_code)
                    return HtmlResponse(
                        url=request.url,
                        status=response.status_code,
                        body=response.content,
                        request=request,
                        encoding=response.encoding,
                    )
                except Exception as e:
                    spider.logger.warning('fetch attempt %d failed: %s', attempt + 1, e)
        # All attempts failed: hand a synthetic 500 back to Scrapy.
        return HtmlResponse(url=request.url, status=500, request=request)
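
    # Spider-side usage sketch (the names below are illustrative, not part of
    # this module): headers/params travel through request.meta, and cookies
    # come from Redis when the spider defines `redis_server` (a redis.Redis
    # client) and `redis_cookie_key` (a key holding a JSON cookie dict):
    #
    #     yield scrapy.Request(
    #         url,
    #         meta={'headers': {'User-Agent': 'Mozilla/5.0 ...'},
    #               'params': {'page': 1}},
    #     )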
    def process_request(self, request, spider):
        # Drive the coroutine to completion on the (nested) asyncio loop and
        # return the HtmlResponse directly, short-circuiting the default
        # download handler.
        loop = asyncio.get_event_loop()
        future = asyncio.ensure_future(self._fetch(request, spider))
        loop.run_until_complete(future)
        return future.result()
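
    # Note: on Scrapy 2.x with the asyncio reactor enabled
    # (TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'),
    # process_request can be declared as a coroutine instead, which would make
    # the nest_asyncio workaround above unnecessary:
    #
    #     async def process_request(self, request, spider):
    #         return await self._fetch(request, spider)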
    def process_response(self, request, response, spider):
        spider.logger.debug('response_status: %s', response.status)
        return response

    def process_exception(self, request, exception, spider):
        # Returning None lets other middlewares and Scrapy handle the exception.
        return None

# Custom retry middleware built on Scrapy's RetryMiddleware.
import logging
import random
import time

from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.downloadermiddlewares.retry import RetryMiddleware, response_status_message
from twisted.internet import defer
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    DNSLookupError,
    TCPTimedOutError,
    TimeoutError,
)
from twisted.web.client import ResponseFailed
from urllib3.exceptions import ProtocolError, ProxyError, ProxySchemeUnknown
class MyRetryMiddleware(RetryMiddleware):
    logger = logging.getLogger(__name__)

    EXCEPTIONS_TO_RETRY = (
        defer.TimeoutError, TimeoutError, DNSLookupError,
        ConnectionRefusedError, ConnectionDone, ConnectError,
        ConnectionLost, TCPTimedOutError, ResponseFailed,
        IOError, TunnelError, ProtocolError, ProxyError, ProxySchemeUnknown,
    )

    def get_proxy(self):
        # Placeholder: replace with a call to your proxy pool.
        http_proxy = 'http://127.0.0.1:8090'
        return http_proxy

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):  # retries disabled: pass through
            return response
        proxy_ip = request.meta.get('proxy')
        if proxy_ip and not response.body:  # empty body: switch proxy and refetch
            request.meta['proxy'] = self.get_proxy()
            return request
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            # Note: time.sleep() blocks the Twisted reactor; keep the delay short.
            time.sleep(random.randint(3, 5))
            self.logger.info('Abnormal response status, switching proxy IP and retrying...')
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
            time.sleep(random.randint(3, 5))
            self.logger.warning('Connection error, retrying...')
            request.meta['proxy'] = self.get_proxy()
            return self._retry(request, exception, spider)
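
# Settings sketch for the retry middleware (priority values are assumptions;
# the built-in RetryMiddleware is disabled so this subclass replaces it):
#
#     DOWNLOADER_MIDDLEWARES = {
#         'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
#         'myproject.middlewares.MyRetryMiddleware': 550,
#     }
#     RETRY_HTTP_CODES = [429, 500, 502, 503, 504]
#     RETRY_TIMES = 3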