cloudflare 5s后跳转的网页怎么爬取

最新推荐文章于 2023-08-23 09:44:54 发布

Memory_and_Dream

最新推荐文章于 2023-08-23 09:44:54 发布

阅读量2.1k

点赞数 2

文章标签：爬虫

本文链接：https://blog.csdn.net/Memory_and_Dream/article/details/100004157

版权

很多网站使用了 Cloudflare 的反爬虫服务：第一次打开任何页面时，都需要等待 5 秒才会跳转到真正的页面。这种中间页需要解析并执行其中的 JS 才能得到跳转所需的参数，完成跳转之后才能获取有效的 cookie。

不喜欢bb，直接上代码了。

# -*- coding: utf-8 -*-
# @Time    : 2019/8/21 20:48
# @Author  : meng_zhihao
# @Email   : 312141830@qq.com
# @File    : five_seconds_redirect.py

# Reference only: a captured curl command for the challenge-answer request
# (query parameters s, jschl_vc, pass, jschl_answer) -- the same URL shape that
# get_cookie() builds. This bare string is evaluated and discarded when the
# module loads; nothing reads it.
# NOTE(review): the ^ characters look like Windows cmd escaping, presumably
# from a browser "copy as cURL" export -- confirm before reusing the command.
'curl "https://steamdb.info/cdn-cgi/l/chk_jschl?s=a008fbe38534ed25da1fcfeee8818c71088155e2-1566391545-1800-AS6fBv4Md5hbFH5KuOu3rUO53K8YLifU6bByW039xKgE^%^2BB^%^2Fl3rJNXQjLqvAq^%^2FCNSWqfrbNCiBprNC4fTtXfmasWS20yWx2vBKGjya^%^2BTVhU8PsS8myK8ty1gUsqY7iuvZmw^%^3D^%^3D^&jschl_vc=bd263529a25342dfc2bf2d06ec6a32f9^&pass=1566391549.871-VHEZj8fTNM^&jschl_answer=16.6665600261" -H "authority: steamdb.info" -H "pragma: no-cache" -H "cache-control: no-cache" -H "upgrade-insecure-requests: 1" -H "user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36" -H "accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3" -H "referer: https://steamdb.info/" -H "accept-encoding: gzip, deflate, br" -H "accept-language: zh-CN,zh;q=0.9" -H "cookie: __cfduid=d1117a0185a634e26f5f076daff94a2c01566391545" --compressed'

import re
import time
import urllib
import urllib.parse

import requests
from js2py import eval_js
from lxml import etree
# Request headers passed to every HTTP call made by get_cookie(); only a
# desktop-Chrome User-Agent string is set.
HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}

def getXpath(xpath, content):
    """Evaluate an XPath expression against an HTML document.

    Args:
        xpath: XPath expression to evaluate.
        content: HTML markup (str or bytes) parsed with ``etree.HTML``.

    Returns:
        A list with one entry per match. Element nodes are serialized with
        ``etree.tostring`` (bytes); every other result (text / attribute
        strings, numbers, booleans) is returned unchanged.

    Note (from the original author, unverified): serialized elements appear
    to render Chinese text as ``&#xxxx;`` character references, while
    ``/text()`` results come back as unicode strings.
    """
    tree = etree.HTML(content)
    results = tree.xpath(xpath)

    # Expressions such as count(...) or string(...) yield a single scalar
    # rather than a list; wrap it so the loop below handles both shapes.
    if not isinstance(results, list):
        results = [results]

    out = []
    for result in results:
        # Decide by node kind instead of matching on the type's repr, which
        # breaks as soon as the result class is named differently.
        if etree.iselement(result):
            out.append(etree.tostring(result))
        else:
            out.append(result)
    return out

def get_js_return(content):
    """Extract the challenge fields from the Cloudflare interstitial page.

    The page is expected to contain a form like::

        <form id="challenge-form" action="/cdn-cgi/l/chk_jschl" method="get">
        <input type="hidden" name="s" value="..."></input>
        <input type="hidden" name="jschl_vc" value="e94efa15a4a362916dbf1ba54d3d3813"/>
        <input type="hidden" name="pass" value="1566391403.392-qBtSP7u9nZ"/>
        <input type="hidden" id="jschl-answer" name="jschl_answer"/>

    ``jschl_vc`` and ``pass`` are read directly from the markup. The value
    for ``jschl_answer`` is obtained by rebuilding the page's
    ``setTimeout(function(){ ... f.submit()`` script as a standalone
    function and running it with js2py. The computed answer is also printed.

    Args:
        content: Decoded HTML of the challenge page.

    Returns:
        Tuple ``(jschl_vc, passwd, jschl_answer)``.

    Raises:
        ValueError: If a form field or the challenge script cannot be
            located in ``content`` (previously this surfaced as an
            UnboundLocalError / IndexError).
    """
    vc_match = re.search(r'name="jschl_vc" value="(.*?)"', content)
    if vc_match is None:
        raise ValueError('challenge field "jschl_vc" not found in page')
    jschl_vc = vc_match.group(1)

    pass_match = re.search(r'name="pass" value="(.*?)"', content)
    if pass_match is None:
        raise ValueError('challenge field "pass" not found in page')
    passwd = pass_match.group(1)

    script_match = re.search(
        r'setTimeout\(function\(\)\{((?:.|\n)*?)f\.submit\(\)', content)
    if script_match is None:
        raise ValueError('challenge script (setTimeout ... f.submit()) not found in page')

    # Keep only the non-blank lines of the script body.
    lines = [line for line in script_match.group(1).split("\n") if line.split()]
    if len(lines) < 2:
        raise ValueError('challenge script has fewer lines than expected')

    # The first line is taken as the variable setup and the second-to-last
    # line as the one holding the arithmetic plus the "a.value = ..." answer.
    first = lines[0]
    last = lines[-2]

    answer_match = re.search(r"(.*?)a\.value\s+=\s+((.*?)121')", last)
    if answer_match is None:
        raise ValueError('"a.value = ..." assignment not found in challenge script')
    arithmetic = answer_match.group(1)
    answer_expr = answer_match.group(2)

    # Stand-in for the DOM lookups of the real page: sets t to the bare host
    # name. Raw string so the JS regex escapes (\/) reach js2py untouched.
    host_setup = r"""
            t = 'https://steamdb.info/';
            r = t.match(/https?:\/\//)[0];
            t = t.substr(r.length); t = t.substr(0,t.length-1);
          """

    js = "function f(){ %s  %s  %s  return %s }" % (
        first, host_setup, arithmetic, answer_expr)

    # NOTE(security): this executes script text taken from the HTTP response.
    # Only use it against hosts you trust, or run it in an isolated process.
    jschl_answer = eval_js(js)()
    print(jschl_answer)

    return jschl_vc, passwd, jschl_answer

def get_cookie():
    """Pass the challenge on https://steamdb.info/ and return the cookies.

    Steps: fetch the landing page, read the ``s`` form field, compute the
    remaining fields with ``get_js_return``, wait, request the
    ``/cdn-cgi/l/chk_jschl`` URL with the answers, then fetch the landing
    page again. Both pages, the answer URL and the cookies are printed.

    Returns:
        dict: Cookies held by the session after the final request.

    Raises:
        RuntimeError: If the ``s`` field is missing from the first response.
        Errors from ``get_js_return`` and from ``requests`` propagate as-is.
    """
    # The context manager closes the session's pooled connections on every
    # exit path, including the error paths below.
    with requests.session() as se:
        page = se.get('https://steamdb.info/', timeout=10, headers=HEADERS).content.decode('utf8')
        print(page)

        s_match = re.search(r'name="s" value="(.*?)"', page)
        if s_match is None:
            raise RuntimeError(
                'challenge field "s" not found; the response does not look like the challenge page')
        # Percent-encode: the token can contain characters such as "+" and "="
        # that must not appear raw in a query string.
        s = urllib.parse.quote(s_match.group(1))

        jschl_vc, passwd, jschl_answer = get_js_return(page)
        print(s, jschl_vc, passwd, jschl_answer)

        # Wait before submitting the answer, as the challenge page itself does.
        time.sleep(4)
        url = 'https://steamdb.info/cdn-cgi/l/chk_jschl?s=%s&jschl_vc=%s&pass=%s&jschl_answer=%s' % (
            s, jschl_vc, passwd, jschl_answer)
        print(url)
        se.get(url, timeout=10, headers=HEADERS)

        time.sleep(2)
        new_page = se.get('https://steamdb.info/', timeout=10, headers=HEADERS).content.decode('utf8')
        print(new_page)  # the full page

        cookie = se.cookies.get_dict()  # the full cookie set
        print(cookie)
        return cookie

# Script entry point: run the challenge flow once. get_cookie() prints the
# fetched pages and the resulting cookies; its return value is not used here.
if __name__ == '__main__':
    get_cookie()

Memory_and_Dream

关注

2
点赞
踩
9

收藏

觉得还不错? 一键收藏
0
评论
cloudflare 5s后跳转的网页怎么爬取

有很多网站使用了cloudflare 的反爬虫服务，第一次打开任何页面都需要等待5s才能进入后面的页面。这种页面需要解析js才能获取到跳转参数。完成跳转后才能获取有效cookie。不喜欢bb，直接上代码了。# -*- coding: utf-8 -*-# @Time : 2019/8/21 20:48# @Author : meng_zhihao# @Email : 3121...
复制链接

扫一扫