Scrapy downloader middleware for bypassing JA3/TLS fingerprint detection.
Dependencies:
    pip install curl_cffi
    pip install pyhttpx
File: downloaderMiddlewares.py
import requests, pyhttpx, random
from curl_cffi import requests as cffi_requests
# Browser TLS fingerprints that curl_cffi can impersonate; one is picked
# at random per request as the last-resort client in bypass_ja3_get.
impersonates = ["chrome99", "chrome101", "chrome107", "chrome110", "edge99", "edge101"]
# Default proxy credentials/endpoint ("xxx" / "ip:port" are placeholders —
# fill in real values or pass a config dict to get_proxys).
proxy = {
    'PROXY_USER': "xxx",
    'PROXY_PASS': "xxx",
    'PROXY_SERVER': "http://ip:port"
}
def get_proxys(proxy_conf=None):
    """Build a requests-style proxies mapping with embedded credentials.

    Parameters
    ----------
    proxy_conf : dict | None
        Optional mapping with keys ``PROXY_USER``, ``PROXY_PASS`` and
        ``PROXY_SERVER`` (e.g. ``"http://host:port"``).  Falls back to the
        module-level ``proxy`` defaults when omitted.  The optional
        parameter keeps call sites like ``get_proxys(self.proxy)`` (see the
        spider usage snippet) working — the old zero-argument form raised
        TypeError there.

    Returns
    -------
    dict
        ``{'http': url, 'https': url}`` where ``url`` is
        ``http://user:pass@host:port``.
    """
    conf = proxy if proxy_conf is None else proxy_conf
    server = conf.get('PROXY_SERVER')
    # Split off the port once; strip the scheme from the host part.
    host_part, _, proxy_port = server.rpartition(':')
    proxy_host = host_part.split('//')[-1]
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxy_host,
        "port": proxy_port,
        "user": conf.get('PROXY_USER'),
        "pass": conf.get('PROXY_PASS'),
    }
    return {
        'http': proxyMeta,
        'https': proxyMeta,
    }
def bypass_ja3_get(url, headers, proxies, timeout=15, data=None):
    """GET *url*, trying three HTTP clients in turn to evade JA3 blocking.

    Order of attempts:
      1. plain ``requests`` (standard TLS fingerprint),
      2. ``pyhttpx`` with a Chrome-like TLS/HTTP2 handshake,
      3. ``curl_cffi`` impersonating a random browser from ``impersonates``.

    A non-200 status from attempts 1-2 is treated as a failure and triggers
    the next fallback; attempt 3's response is returned regardless of status.

    Parameters
    ----------
    url, headers, proxies : forwarded to the underlying client.
    timeout : float, request timeout in seconds for every attempt.
    data : dict | None, optional body forwarded to the first attempt only.

    Returns
    -------
    Response-like object from whichever client succeeded, or ``None`` if
    all three attempts failed.
    """
    # `data=None` sentinel instead of a mutable `{}` default, which would be
    # shared across calls.
    if data is None:
        data = {}
    resp = None
    try:
        sess1 = requests.session()
        sess1.headers = headers
        sess1.proxies = proxies
        resp = sess1.get(url, data=data, allow_redirects=False, timeout=timeout)
        print('sess1: ', resp.status_code)
        if resp.status_code != 200:
            raise TimeoutError  # force fallback to the next client
    except Exception:
        # Narrowed from bare `except:` so Ctrl-C / SystemExit still propagate.
        try:
            sess2 = pyhttpx.HttpSession(browser_type='chrome', http2=True)
            resp = sess2.get(url, proxies=proxies, allow_redirects=False, timeout=timeout)
            print('sess2: ', resp.status_code)
            if resp.status_code != 200:
                raise TimeoutError
        except Exception:
            try:
                sess3 = cffi_requests.Session()
                resp = sess3.get(url, proxies=proxies, impersonate=random.choice(impersonates), timeout=timeout)
                print('sess3: ', resp.status_code)
            except Exception:
                pass  # all three failed; caller receives None
    return resp
class ByPassJa3RequestMiddleware(object):
    """Downloader middleware that fetches each request off the Twisted
    reactor thread through a JA3-evading client chain and hands Scrapy a
    ready-made Response, short-circuiting the built-in downloader.

    NOTE(review): ``defer``/``reactor`` (Twisted), ``Response`` (Scrapy) and
    ``utils`` are referenced but not imported in this snippet — presumably
    the full module has ``from twisted.internet import defer, reactor`` and
    the Scrapy response import; confirm against the real project.
    """
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    def __init__(self):
        # Default browser-like headers. These are replaced wholesale in
        # _get_res whenever the incoming Scrapy request carries headers.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'upgrade-insecure-requests': '1',
            # Empty UA — expected to be supplied by the spider's request headers.
            'user-agent': '',
            'Connection': 'close'
        }

    @defer.inlineCallbacks
    def process_request(self, request, spider):
        """Download the request in a reactor thread-pool thread.

        Yields an unfired Deferred that ``_get_res`` fires from the worker
        thread; once fired, the collected Response (if any) is returned to
        Scrapy, which then skips its own download of this request.
        """
        container=[]
        out = defer.Deferred()
        # Run the blocking HTTP call off the reactor thread.
        reactor.callInThread(self._get_res, request, container, out, spider)
        yield out  # resumes when _get_res fires `out` via callFromThread
        if len(container)>0: defer.returnValue(container[0])

    def _get_res(self,request,container,out,spider):
        """Worker body (runs in a reactor thread): perform the download and
        fire `out` with a Response on success, or an errback on failure."""
        try:
            url= request.url
            meta = request.meta
            # Proxy mapping attached by the spider via meta={'proxy': ...}.
            proxies= meta.get('proxy')
            # Prefer the request's own headers; Scrapy stores values as
            # lists of bytes. NOTE(review): this mutates shared middleware
            # state (self.headers) from a worker thread — verify intent.
            if request.headers: self.headers= {k.decode():v[0].decode() for k, v in request.headers.items()}
            # NOTE(review): `utils.bypass_ja3_get` — presumably the same
            # helper defined above, re-exported from a utils module; confirm.
            r = utils.bypass_ja3_get(url, self.headers, proxies)
            print('{} StatusCode: '.format(spider.name), r.status_code)
            # NOTE(review): base scrapy.http.Response takes no `encoding`
            # kwarg — HtmlResponse/TextResponse is likely intended; confirm.
            resp = Response(url=url,status=r.status_code,body=r.content,encoding=r.encoding,request=request)
            container.append(resp)
            # Deferred callbacks must be fired on the reactor thread.
            reactor.callFromThread(out.callback, resp)
        except Exception as e:
            print('{} Error: '.format(spider.name), e)
            err=str(type(e))+' '+str(e)
            reactor.callFromThread(out.errback, ValueError(err))
spider.py — usage example (separate file):
# Register the middleware in the spider's custom settings so it intercepts
# every request at priority 400.
custom_settings['DOWNLOADER_MIDDLEWARES'] = {
    'Material.middlewares.downloaderMiddlewares.ByPassJa3RequestMiddleware': 400,
}
# Inside a spider callback (enclosing method not shown here): attach the
# proxies mapping so _get_res can read request.meta['proxy'].
# NOTE(review): get_proxys is called WITH an argument here, while the inline
# definition above takes none — confirm the utils module's signature.
yield scrapy.Request(url, meta={'proxy': utils.get_proxys(self.proxy)}, callback=self.parse_data, errback=self.errback_response)