很多网站使用了 Cloudflare 的反爬虫服务:第一次打开任何页面时,都需要等待约 5 秒才能进入真正的页面。这类页面需要解析其中的 JS 才能得到跳转参数,完成跳转之后才能获取有效的 cookie。
不喜欢bb,直接上代码了。
# -*- coding: utf-8 -*-
# @Time : 2019/8/21 20:48
# @Author : meng_zhihao
# @Email : 312141830@qq.com
# @File : five_seconds_redirect.py
'curl "https://steamdb.info/cdn-cgi/l/chk_jschl?s=a008fbe38534ed25da1fcfeee8818c71088155e2-1566391545-1800-AS6fBv4Md5hbFH5KuOu3rUO53K8YLifU6bByW039xKgE^%^2BB^%^2Fl3rJNXQjLqvAq^%^2FCNSWqfrbNCiBprNC4fTtXfmasWS20yWx2vBKGjya^%^2BTVhU8PsS8myK8ty1gUsqY7iuvZmw^%^3D^%^3D^&jschl_vc=bd263529a25342dfc2bf2d06ec6a32f9^&pass=1566391549.871-VHEZj8fTNM^&jschl_answer=16.6665600261" -H "authority: steamdb.info" -H "pragma: no-cache" -H "cache-control: no-cache" -H "upgrade-insecure-requests: 1" -H "user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36" -H "accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3" -H "referer: https://steamdb.info/" -H "accept-encoding: gzip, deflate, br" -H "accept-language: zh-CN,zh;q=0.9" -H "cookie: __cfduid=d1117a0185a634e26f5f076daff94a2c01566391545" --compressed'
import requests
from lxml import etree
import re
from js2py import eval_js
import time
import urllib
# Default request headers: a desktop Chrome User-Agent sent with every request
# made by this script.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/60.0.3112.101 Safari/537.36'
    ),
}
def getXpath(xpath, content):
    """Evaluate an XPath expression against an HTML document.

    Args:
        xpath: XPath expression to evaluate.
        content: HTML markup handed to ``etree.HTML``.

    Returns:
        A list with one entry per result. Element nodes are serialized with
        ``etree.tostring`` (bytes by default); every other result (text or
        attribute strings, numbers, booleans) is appended unchanged.
        An empty list is returned when the parser yields no document.

    Note:
        The original author observed that serialized elements show non-ASCII
        text (e.g. Chinese) as ``&#xxxx;`` character references, while
        ``/text()`` results come back as unicode strings.
    """
    tree = etree.HTML(content)
    if tree is None:
        # The HTML parser can return None for empty/blank input; without this
        # guard the next line fails with an opaque AttributeError.
        return []

    results = tree.xpath(xpath)
    if not isinstance(results, list):
        # Scalar expressions such as count(), string() or boolean() return a
        # bare value rather than a list; normalize so the loop below works.
        results = [results]

    out = []
    for result in results:
        # Check the node kind directly instead of matching on the type's
        # printed name, which silently breaks for plain str results.
        if etree.iselement(result):
            out.append(etree.tostring(result))
        else:
            out.append(result)
    return out
def get_js_return(content):
    """Extract the challenge form fields and compute the JavaScript answer.

    The challenge page is expected to contain a form along these lines::

        <form id="challenge-form" action="/cdn-cgi/l/chk_jschl" method="get">
          <input type="hidden" name="s" value="..."></input>
          <input type="hidden" name="jschl_vc" value="e94efa15a4a3..."/>
          <input type="hidden" name="pass" value="1566391403.392-qBtSP7u9nZ"/>
          <input type="hidden" id="jschl-answer" name="jschl_answer"/>

    together with an inline ``setTimeout(function(){ ... f.submit()`` script
    whose second-to-last non-blank line assigns ``a.value``.

    Args:
        content: HTML text of the challenge page.

    Returns:
        Tuple ``(jschl_vc, passwd, jschl_answer)``; for example
        ``("bd263529a253...", "1566391549.871-VHEZj8fTNM", "16.6665600261")``.

    Raises:
        ValueError: if a form field or the challenge script cannot be located
            (previously this surfaced as UnboundLocalError / IndexError).
    """
    vc_match = re.search(r'name="jschl_vc" value="(.*?)"', content)
    if vc_match is None:
        raise ValueError('challenge page has no "jschl_vc" field')
    jschl_vc = vc_match.group(1)

    pass_match = re.search(r'name="pass" value="(.*?)"', content)
    if pass_match is None:
        raise ValueError('challenge page has no "pass" field')
    passwd = pass_match.group(1)

    script = re.search(r'setTimeout\(function\(\)\{((?:.|\n)*?)f\.submit\(\)', content)
    if script is None:
        raise ValueError('challenge script (setTimeout ... f.submit()) not found')

    # Keep only non-blank lines of the script body.
    lines = [line for line in script.group(1).split("\n") if line.strip()]
    if len(lines) < 2:
        raise ValueError('challenge script is shorter than expected')

    # Presumably the first line is the variable declaration that seeds the
    # computation -- confirm against a live challenge page.
    first = lines[0]
    # The second-to-last line is expected to hold the `a.value = ...121'`
    # assignment; everything before `a.value` is kept as plain statements and
    # the assigned expression becomes the return value.
    assignment = re.search(r"(.*?)a\.value\s+=\s+((.*?)121')", lines[-2])
    if assignment is None:
        raise ValueError('could not find the "a.value = ..." assignment in the challenge script')
    statements = assignment.group(1)
    answer_expr = assignment.group(2)

    # Replaces the DOM lookups the browser would perform: `t` ends up as the
    # host name without scheme and trailing slash. The host is hard-coded.
    # Raw string so the regex backslashes reach JavaScript without Python
    # "invalid escape sequence" warnings.
    host_prelude = r"""
    t = 'https://steamdb.info/';
    r = t.match(/https?:\/\//)[0];
    t = t.substr(r.length); t = t.substr(0,t.length-1);
    """
    js = "function f(){ %s %s %s return %s }" % (
        first, host_prelude, statements, answer_expr)

    # SECURITY: this evaluates JavaScript taken from the remote page. js2py
    # runs it inside the Python process, so only use against hosts you trust.
    jschl_answer = eval_js(js)()
    print(jschl_answer)
    return jschl_vc, passwd, jschl_answer
def get_cookie():
    """Pass the steamdb.info JavaScript challenge and return the session cookies.

    Steps: fetch the front page, read the hidden ``s`` field, compute the
    remaining answer fields with ``get_js_return``, wait, submit them to
    ``/cdn-cgi/l/chk_jschl``, then load the front page again.

    Returns:
        dict mapping cookie names to values, taken from the session after the
        challenge request.

    Raises:
        RuntimeError: if the first response has no hidden ``s`` field.
    """
    # Imported here because the module only does `import urllib`, which does
    # not by itself guarantee that the `urllib.parse` submodule is loaded.
    from urllib.parse import urlencode

    # The context manager closes the session's connections when done.
    with requests.session() as se:
        page = se.get('https://steamdb.info/', timeout=10, headers=HEADERS).content.decode('utf8')
        print(page)

        s_match = re.search(r'name="s" value="(.*?)"', page)
        if s_match is None:
            raise RuntimeError('hidden "s" field not found; the response is not the expected challenge page')
        s = s_match.group(1)

        jschl_vc, passwd, jschl_answer = get_js_return(page)
        print(s, jschl_vc, passwd, jschl_answer)

        # Wait before submitting the answer (the page is described as needing
        # about five seconds before it lets the visitor through).
        time.sleep(4)

        # Percent-encode every parameter, not only `s`; urlencode also escapes
        # '/', '+' and '=' the same way the browser request does.
        query = urlencode([
            ('s', s),
            ('jschl_vc', jschl_vc),
            ('pass', passwd),
            ('jschl_answer', jschl_answer),
        ])
        url = 'https://steamdb.info/cdn-cgi/l/chk_jschl?' + query
        print(url)
        se.get(url, timeout=10, headers=HEADERS)

        time.sleep(2)
        new_page = se.get('https://steamdb.info/', timeout=10, headers=HEADERS).content.decode('utf8')
        print(new_page)  # the full page

        cookie = se.cookies.get_dict()  # the full cookie set
        print(cookie)
        return cookie
# Script entry point: run the challenge flow only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    get_cookie()