采集一个网站时,无论怎样都返回空数据(实际上是有数据的),但抓包下来又确实如此。请教了一些人后推测是指纹验证,拜读了网上其他大佬的博客文章并实验了一下,发现确实是这个问题!
第一次知道 TLS 握手还有这个东西(JA3 指纹),让我大受震撼,在此搬运一下。
参考链接及来源:
Python 爬虫进阶必备 | JA3 指纹在爬虫中的应用与定向突破
python爬虫 requests、httpx、aiohttp、scrapy突破ja3指纹识别
实例:
requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
import requests
import random
ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class DESAdapter(HTTPAdapter):
    """Transport adapter that randomizes the TLS cipher order to vary the JA3 fingerprint.

    The cipher string is re-shuffled every time a pool manager is built, so
    new connection pools do not all present the identical cipher order.
    (The original shuffled only once in ``__init__``, which made every request
    through the session carry the same JA3 fingerprint — exactly what this
    adapter is meant to avoid.)
    """

    def __init__(self, *args, **kwargs):
        # Seed an initial shuffle so the attribute exists even if a subclass
        # bypasses init_poolmanager; later calls re-shuffle again.
        self.CIPHERS = self._shuffled_ciphers()
        super().__init__(*args, **kwargs)

    def _shuffled_ciphers(self):
        """Return ORIGIN_CIPHERS in random order with weak suites excluded."""
        ciphers = ORIGIN_CIPHERS.split(':')
        random.shuffle(ciphers)
        return ':'.join(ciphers) + ':!aNULL:!eNULL:!MD5'

    def init_poolmanager(self, *args, **kwargs):
        # Fresh shuffle per pool manager -> different cipher order per pool.
        self.CIPHERS = self._shuffled_ciphers()
        kwargs['ssl_context'] = create_urllib3_context(ciphers=self.CIPHERS)
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        # Same treatment for proxied connections.
        self.CIPHERS = self._shuffled_ciphers()
        kwargs['ssl_context'] = create_urllib3_context(ciphers=self.CIPHERS)
        return super().proxy_manager_for(*args, **kwargs)
# Demo: mount the shuffling adapter for the JA3 echo service and print
# the fingerprint it reports back.
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'})
session.mount('https://ja3er.com', DESAdapter())
print(session.get('https://ja3er.com/json').json())
aiohttp
import random
import ssl
import asyncio
import aiohttp
# ssl._create_default_https_context = ssl._create_unverified_context
ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    """Callable that builds an SSLContext with a freshly shuffled cipher order."""

    def __init__(self):
        # Keep the suites as a list so each call can reshuffle them in place.
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        # A new random order every call => a different JA3 fingerprint.
        random.shuffle(self.ciphers)
        suite = ":".join(self.ciphers) + ":!aNULL:!eNULL:!MD5"
        ctx = ssl.create_default_context()
        ctx.set_ciphers(suite)
        return ctx
sslgen = SSLFactory()


async def main():
    """Fetch the JA3 echo endpoint through a freshly shuffled SSL context."""
    async with aiohttp.ClientSession() as session:
        # sslgen() builds a new context, so each run presents a new fingerprint.
        async with session.get("https://ja3er.com/json", headers={}, ssl=sslgen()) as resp:
            data = await resp.json()
            print(data)


# asyncio.run replaces the deprecated get_event_loop().run_until_complete pattern.
asyncio.run(main())
httpx:
异步模式:
import httpx
import asyncio
import random
import ssl
ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    """Produces ssl.SSLContext objects whose cipher order changes on every call."""

    def __init__(self):
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        # In-place shuffle: the stored list carries the new order forward.
        random.shuffle(self.ciphers)
        context = ssl.create_default_context()
        # Append the exclusions of anonymous / null / MD5 suites before applying.
        context.set_ciphers(":".join(self.ciphers + ["!aNULL", "!eNULL", "!MD5"]))
        return context
sslgen = SSLFactory()


async def main():
    """GET the JA3 echo endpoint using a randomized-cipher SSL context."""
    # Passing the context as `verify` makes httpx use it for the TLS handshake.
    async with httpx.AsyncClient(verify=sslgen()) as client:
        response = await client.get('https://ja3er.com/json')
        print(response.json())


asyncio.run(main())
同步模式:
import httpx
import asyncio
import random
import ssl
ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    """Factory of TLS contexts; every invocation reshuffles the cipher suites."""

    def __init__(self):
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        random.shuffle(self.ciphers)
        shuffled = ":".join(self.ciphers)
        ctx = ssl.create_default_context()
        # Exclude anonymous, null-encryption and MD5-based suites.
        ctx.set_ciphers(f"{shuffled}:!aNULL:!eNULL:!MD5")
        return ctx
sslgen = SSLFactory()

# http2=True lets the client negotiate HTTP/2 when the server supports it;
# the shuffled context is supplied via `verify`.
with httpx.Client(headers={}, http2=True, verify=sslgen()) as client:
    reply = client.get('https://ja3er.com/json')
    print(reply.text)
scrapy
class MyHTTPDownloadHandler(HTTPDownloadHandler):
    """Scrapy download handler that reshuffles TLS ciphers for each request
    so consecutive requests present different JA3 fingerprints."""

    def shuffle_ciphers(self):
        """Return the cipher string in random order with weak suites excluded."""
        self.ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                               'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')
        CIPHERS = self.ORIGIN_CIPHERS.split(':')
        random.shuffle(CIPHERS)
        # '!aNULL:!eNULL:!MD5' disables anonymous, null-encryption and MD5 suites.
        CIPHERS = ':'.join(CIPHERS) + ':!aNULL:!eNULL:!MD5'
        return CIPHERS

    def download_request(self, request, spider):
        # Rebuild the context factory with a fresh cipher order before every
        # request. NOTE(review): this overwrites the private `_contextFactory`
        # attribute of HTTPDownloadHandler — confirm it is still used this way
        # in the installed Scrapy version. `random` and
        # `ScrapyClientContextFactory` must be imported in the real module.
        tls_ciphers = self.shuffle_ciphers()
        self._contextFactory = ScrapyClientContextFactory(tls_ciphers=tls_ciphers)
        return super().download_request(request, spider)
爬虫配置文件
# Spider-level settings: route both schemes through the cipher-shuffling
# download handler so every download gets a fresh JA3 fingerprint.
custom_settings = {
    "CONCURRENT_REQUESTS": 5,   # limit parallel requests
    "DOWNLOAD_DELAY": 1,        # seconds between requests
    "DOWNLOAD_TIMEOUT": 10,     # per-download timeout in seconds
    "RETRY_TIMES": 3,
    "DOWNLOAD_HANDLERS": {
        # NOTE(review): dotted path assumes the handler lives in
        # scrapy_project/middlewares.py — adjust to the actual module path.
        'http': 'scrapy_project.middlewares.MyHTTPDownloadHandler',
        'https': 'scrapy_project.middlewares.MyHTTPDownloadHandler',
    }
}