响应码521与国家企业信用信息公示系统js解析

所谓的521，是指网络请求时服务器返回的状态码为521，并且返回一段js；js执行后会生成一段cookie，携带该cookie再次向服务器发送请求，才可以请求成功。而且ip和cookie绑定，切换ip需要重新获取cookie

原理还是比较简单的，难度在于js的执行，如果使用selenium这个问题还是比较容易处理，但是爬虫讲究的是速度和高效，本文深度剖析一下521中返回的js具体执行过程

以国家企业信用信息公示系统为例，在我们第一次请求网站主页的时候返回状态码521，并同时返回js，附图：

看起来还是比较乱的，复制出来，放到sublime中，自动调整一下格式

需要稍微有一点点js基础，然后结合经验，对这段代码也能看出个大概。这段js代码先是定义了两个参数和两个方法，最后有一个while循环，在while循环中有个eval()方法，js中的eval和python中的eval还是有点区别的，度娘是这么解释的：eval() 函数可计算某个字符串，并执行其中的 JavaScript 代码。

据此可以判断上面代码eval里面这一段"y.replace(/\b\w+\b/g, function(y) {return x[f(y, z) - 1] || ("_" + y)})"会生成一段新的js，eval会将这段新的js执行。

python中有个三方库PyExecJS可以执行js，只要js中没有用到浏览器中的对象就可以执行。在运行时需要指定函数入口，并且这个函数需要有返回内容，所以需要将上面的while循环修改成所需的函数，而函数返回的内容就是新生成的js，直接上代码

js_fun = re.match(  "<script>([\s\S]+)while\(z\+\+\)try", response.text)replace_str="z++;"js_script = js_fun.group(1) + replace_str+"""function fun_1(){return y.replace(/\\b\w+\\b/g, function(y){return x[f(y, z) - 1] || ('_' + y)})}"""entrance = "fun_1"ctx = execjs.compile(js_script)result = ctx.call(entrance)

定义了一个fun_1的函数，并将eval里面的内容用return返回，将result打印出来并经过sublime格式化，新生成的js代码就是下面这个样子

var _3 = function() {    setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')', 1500);    document.cookie = '__jsl_clearance=1563436679.494|0|' + (function() {          var _s = [function(_3) {              return _3            }, function(_s) {              return _s            }, (function() {              var _3 = document.createElement('div');              _3.innerHTML = '<a href=\'/\'>_D</a>';              _3 = _3.firstChild.href;              var _s = _3.match(/https?:\/\//)[0];              _3 = _3.substr(_s.length).toLowerCase();              return function(_s) {                for (var _D = 0; _D < _s.length; _D++) {                  _s[_D] = _3.charAt(_s[_D])                };                return _s.join('')              }            })(), function(_3) {              return eval('String.fromCharCode(' + _3 + ')')            }],            _D = ['p', [((+![][                []              ]) + []) + [-~{} + (-~[] + [-~-~{}] >> -~-~{})]],              [(-~[(-~!/!/ << -~!/!/)] + []) + [7]], (-~-~{} + []), 'F', [                [-~!/!/ - ~(((-~!/!/ << -~!/!/) << -~[]))] + (-~[-~[                  [(+![][                    []                  ]) + (+![][                    []                  ])] * (-~~~'' - ~-~{})                ]] + [] + [                  []                ][0]), (-~[-~[                  [(+![][                    []                  ]) + (+![][                    []                  ])] * (-~~~'' - ~-~{})                ]] + [] + [                  []                ][0]) + [-~{} + (-~[] + [-~-~{}] >> -~-~{})]              ], 'G', [                [7] + [-~{} + (-~[] + [-~-~{}] >> -~-~{})]              ], 'T', [((+![][                []              ]) + []) + (-~-~{} + []) + (-~-~{} + [])], (window['callP' + 'hantom'] + []).charAt(~~{}), 'B31OeL', [(-~[-~[                [(+![][                  []                ]) + (+![][                  []                ])] * (-~~~'' - ~-~{})       
       ]] + [] + [                []              ][0]) + (-~[(-~!/!/ << -~!/!/)] + [])],              [                [-~{} + (-~[] + [-~-~{}] >> -~-~{})]              ], '1', [-~{}                /(+![])+[]+[[]][0]][0].charAt(7),'cY',[[-~!/!/-~(((-~!/!/<<-~!/!/)<<-~[]))]+[7]],'T',[((+![][[]])+[])+[~~{}]],'7',[((+![][[]])+[])+[~~{}]+[7],(-~[(-~!/!/<<-~!/!/)]+[])+[7]],(-~[(-~!/!/<<-~!/!/)]+[]),'D'];for(var _3=0;_3<_D.length;_3++){_D[_3]=_s[[1,2,3,0,1,3,1,3,1,3,0,1,3,2,1,0,1,3,1,2,1,3,0,1][_3]](_D[_3])};return _D.join('')})()+';Expires=Thu, 18-Jul-19 08:57:59 GMT;Path=/;                '};if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('                DOMContentLoaded ',_3,false)}else{document.attachEvent('                onreadystatechange ',_3)}

注意看第三行代码

在网站中可以看到，请求成功后会有两个cookie，分别为_jsluid_h和__jsl_clearance，其中_jsluid_h是第一次请求的时候后端给set的，第二个cookie就是这段js生成的，而且这两段cookie是一对，只有互相对应才能使用。其中_jsluid_h和当前ip有关系，所以更换了ip之后也不能使用。

后面这段代码不能直接用execjs执行，因为代码里面有document和window等对象，这些对象都是浏览器对象，所以直接用execjs执行会报错，所以需要将里面会导致报错的参数一一替换，代码如下

flag = 0while 1:    flag+=1    if flag>=5:        break    js_fun = re.match(        "<script>([\s\S]+)while\(z\+\+\)try", response.text)    replace_str+="z++;"    js_script = js_fun.group(        1) + replace_str+"""function fun_1() {return y.replace(/\\b\w+\\b/g, function(y) {return x[f(y, z) - 1] || ('_' + y)})}"""    entrance = "fun_1"    ctx = execjs.compile(js_script)    result = ctx.call(entrance)    if "document.cookie" in result:        result = re.search(            "__jsl_clearance=([\s\S]+?)'\+\(function\(\)([\s\S]+)(return[\s\S]+?join\([\s\S]+?\))", result)        end_js = "function aaa()" + re.sub("\(window\[[\s\S]+?\]\+\[\]\)", "undefined", re.sub(            "\[window\[[\s\S]+?\]\+\[\]\]", "undefined", result.group(2).replace("window.headless", "undefined"))) + result.group(3) + "}"
        end_js = re.sub("!window\[[\s\S]+?\]","1",end_js)        r_end_js = re.search("(var([\s\S]{1,5}?)=document.createElement[\s\S]+firstChild.href[\s\S]+?\[0\])[\s\S]+?;return function\(([\s\S]{1,5})\)",end_js)                            if r_end_js:                end_js = end_js.replace(r_end_js.group(1),f"var {r_end_js.group(2)}='http://www.gsxt.gov.cn';var {r_end_js.group(3)}='http://'")        end_result = execjs.compile(end_js)        result1 = end_result.call("aaa")        s.cookies["__jsl_clearance"] = result.group(1) + result1        print(s.cookies)        break        # return s,proxies    elif flag>=5:        break

上面这段代码在请求的时候使用了代理ip，所以使用的时候需要自行替换。其中比较核心的地方是第23行代码，这行代码替换了原来js中的第12行代码，这个地方会验证你当前浏览器的url；如果不做这个替换，代码不会报错，但会生成和正确结果非常相似的cookie，导致请求不成功。最后将cookie打印出来

如下：

然后用这两段cookie，并且用第一次请求的ip，重新去请求主页，得到结果

可以看出主页已经请求成功啦

import sys
sys.path.append("..")

from _proxies import get_ip  #代理ip
import execjs
import re
import requests
url = "http://www.gsxt.gov.cn/index.htm"

def format_s(s):
    """Parse a block of ``Name: value`` lines into a dict.

    Each non-blank line of *s* is split on its first ``:``; the text before
    it becomes the key and the text after it the value, both stripped of
    surrounding whitespace.  Blank and whitespace-only lines (such as an
    indented empty line inside a triple-quoted string) are skipped.

    Args:
        s: Newline-separated text, one ``Name: value`` pair per line.

    Returns:
        dict mapping each name to its value.  If a name occurs more than
        once, the last occurrence wins.

    Raises:
        IndexError: If a non-blank line contains no ``:``.
    """
    parsed = {}
    for raw_line in s.split('\n'):
        line = raw_line.strip()
        if not line:
            # Nothing but whitespace: not a header line.
            continue
        # Split once so values may themselves contain ':' (URLs, q-values).
        parts = line.split(':', 1)
        parsed[parts[0].strip()] = parts[1].strip()
    return parsed

# Request headers passed to every s.get() call below, written as a
# "Name: value" listing and converted to a dict by format_s().
headers = format_s("""
	Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
	Accept-Encoding: gzip, deflate
	Accept-Language: zh-CN,zh;q=0.9
	Connection: keep-alive
	Host: www.gsxt.gov.cn
	Upgrade-Insecure-Requests: 1
	User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36""")
# Keep requesting the page through fresh proxies until the server answers
# with status 521.  On exit `ip`, `s` and `response` hold the proxy, the
# session and the 521 response of the successful attempt; the later steps
# reuse all three.
while True:
    s = None
    try:
        ip = get_ip()
        s = requests.session()
        response = s.get(url, headers=headers, proxies=ip, timeout=3)
        print(response.status_code)
        if response.status_code == 521:
            # Leave this session open: it is reused after the loop.
            break
    except Exception as e:
        # Best effort: report the failure and retry with another proxy.
        print(e)
    # This attempt did not yield a 521 response, so its session is about to
    # be replaced; close it instead of leaking its connection pool.
    if s is not None:
        s.close()
# Solve the JavaScript challenge contained in `response` and store the
# resulting `__jsl_clearance` cookie on the session `s`.
#
# Each attempt appends one more "z++;" to the extracted script and asks the
# JS runtime for the decoded script text (fun_1 returns what the page would
# otherwise pass to eval).  The attempt that yields text containing
# "document.cookie" is the one used to compute the cookie.
replace_str = ""
entrance = "fun_1"

# Group 1 is everything between "<script>" and the trailing "while(z++)try"
# loop.  The response body does not change between attempts, so match once.
js_fun = re.match(r"<script>([\s\S]+)while\(z\+\+\)try", response.text)
if js_fun is None:
    print("521 challenge script not found in the response body")
else:
    # Same budget as before: at most four attempts.
    for flag in range(1, 5):
        replace_str += "z++;"
        js_script = (
            js_fun.group(1)
            + replace_str
            + r"""function fun_1() {return y.replace(/\b\w+\b/g, function(y) {return x[f(y, z) - 1] || ('_' + y)})}"""
        )
        try:
            ctx = execjs.compile(js_script)
            result = ctx.call(entrance)
            if "document.cookie" not in result:
                continue

            # group(1): literal prefix of the cookie value,
            # group(2): body of the anonymous function computing the rest,
            # group(3): that function's final `return ....join(...)`.
            cookie_match = re.search(
                r"__jsl_clearance=([\s\S]+?)'\+\(function\(\)([\s\S]+)(return[\s\S]+?join\([\s\S]+?\))",
                result,
            )

            # `window` and `document` are browser objects that do not exist
            # in the execjs runtime, so every reference to them is replaced
            # with a constant before the code is executed.
            body = cookie_match.group(2).replace("window.headless", "undefined")
            body = re.sub(r"\[window\[[\s\S]+?\]\+\[\]\]", "undefined", body)
            body = re.sub(r"\(window\[[\s\S]+?\]\+\[\]\)", "undefined", body)
            end_js = "function aaa()" + body + cookie_match.group(3) + "}"
            end_js = re.sub(r"!window\[[\s\S]+?\]", "1", end_js)

            # The script reads the page URL through a temporary <a> element.
            # Substitute the site's URL and scheme directly; the cookie value
            # is derived from characters of that URL.
            r_end_js = re.search(
                r"(var([\s\S]{1,5}?)=document.createElement[\s\S]+firstChild.href[\s\S]+?\[0\])[\s\S]+?;return function\(([\s\S]{1,5})\)",
                end_js,
            )
            if r_end_js:
                end_js = end_js.replace(
                    r_end_js.group(1),
                    f"var {r_end_js.group(2)}='http://www.gsxt.gov.cn';var {r_end_js.group(3)}='http://'",
                )

            end_result = execjs.compile(end_js)
            result1 = end_result.call("aaa")
            s.cookies["__jsl_clearance"] = cookie_match.group(1) + result1
            print(s.cookies)
            # `s` now carries the cookie; later requests should reuse `s`
            # together with the same proxy `ip` and `headers`.
            break
        except Exception as e:
            # Best effort: a failing attempt is reported and the next value
            # of z is tried, instead of being silently swallowed.
            print(e)