最近在写gsxt的爬虫,最初分析请求的时候,老是遇到返回521的情况,基础的反反爬手段都用上了,还是没用。最后求助了咱们亲爱的“度娘”和谷歌,才知道他们升级到了加速乐最新的爬虫防护机制。
防护机制分析:
向gsxt发送请求,初次返回521,而且返回的response.text是一串令人脑阔痛的js代码。因为request.get并不支持执行js代码,所以拿不到合法的cookie值。
第一次不携带cookie访问返回的代码如下(js代码):
<script>var x="A@for@@@toLowerCase@@@replace@@chars@@JLYcv@join@@new@@RegExp@eval@1500@setTimeout@fromCharCode@toString@19@GMT@a@firstChild@@W@1@return@else@f@addEventListener@@@3@charAt@@length@document@captcha@2FP@0xFF@@@36@@@@@while@50@attachEvent@@window@DOMContentLoaded@onreadystatechange@div@pathname@location@Array@@@22@@09@zct@function@search@rOm9XFMtA3QKV7nYsPGT4lifyWwkq5vcjH2IdxUoCbhERLaz81DNB6@String@2@858@8@@href@@@@try@substr@g@@Jul@@charCodeAt@catch@@569@0xEDB88320@k@https@@challenge@Expires@@match@headless@@reverse@createElement@@@@var@Path@innerHTML@7@Mon@if@__jsl_clearance@cookie@false@@07@1563785407@e@split@@JgSe0upZ@@parseInt@@0@@6@d@".replace(/@*$/,"").split("@"),y="410 424=233(){40('220.301=220.214+220.234.13(/[\\?|&]131-334/,\\'\\')',34);130.422='421=431.324|444|'+(233(){410 222=[233(424){110 424},233(222){110 222},233(424){110 33('241.41('+424+')')}],21=[(-~[]+[]+[])+[1001],(242+(+!-[])+(-~{}<<-~{})+[]),[(+[])],(-~[]+[]+[]),(-~[]+[]+[])+(-~~~[]+((+!-[])|-~-~~~[])+[]+[[]][444]),(-~[]+[]+[])+[121],(-~[]+[]+[])+(-~-~~~[]+[]),(-~[]+[]+[])+(242+(+!-[])+(-~{}<<-~{})+[]),(-~[]+[]+[])+[-~[-~[]-~((-~{}<<(-~{}<<-~{})))]],(-~~~[]+((+!-[])|-~-~~~[])+[]+[[]][444]),[121],[-~[-~[]-~((-~{}<<(-~{}<<-~{})))]],(-~~~[]+[~~'']-(-~~~[])+[]+[[]][444]),(-~[-~-~~~[]]-~~~[]-~~~[]+((+!-[])|-~-~~~[])+[[]][444]),(-~-~~~[]+[]),[1001],(-~[]+[]+[])+[(+[])],(-~[]+[]+[])+(-~[]+[]+[])];2(410 
424=444;424<21.124;424++){21[424]=222[[104,444,104,444,104,444,104,242,104,242,104,444,104,242,104,242,104,242][424]](['%132',[((+!-[])+(-~{}<<-~{}))/~~{}+[]+[[]][444]][444].122((-~!{}-~!{}^-~!{})-~~~[]+((+!-[])|-~-~~~[])),'331','243',[[-~[-~[]-~((-~{}<<(-~{}<<-~{})))]]+[1001]],(-~~~[]+((+!-[])|-~-~~~[])+[]+[[]][444])+[210.343+[[]][444]][444].122(242),[[-~[-~[]-~((-~{}<<(-~{}<<-~{})))]]+(-~[]+[]+[])],[((+!-[])+(-~{}<<-~{}))/~~{}+[]+[[]][444]][444].122((-~!{}-~!{}^-~!{})-~~~[]+((+!-[])|-~-~~~[])),[(-~[]+[]+[])+(-~-~~~[]+[])+[(+[])]],'242','1%121',[[1001]+(-~[-~-~~~[]]-~~~[]-~~~[]+((+!-[])|-~-~~~[])+[[]][444])],'232',[(+[])],'413',[(-~[]+[]+[])+(-~[]+[]+[])+[121]],'103','22'][21[424]])};110 21.23('')})()+';340=414, 224-314-43 231:202:430 44;411=/;'};420((233(){310{110 !!210.113;}322(432){110 423;}})()){130.113('211',424,423)}111{130.203('212',424)}",f=function(x,y){var a=0,b=0,c=0;x=x.split("");y=y||99;while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;return c},z=f(y.match(/\w/g).sort(function(x,y){return f(x)-f(y)}).pop());while(z++)try{eval(y.replace(/\b\w+\b/g, function(y){return x[f(y,z)-1]||("_"+y)}));break}catch(_){}</script>
返回的响应头(headers)如下:
HTTP/1.1 521 | |
Server | nginx |
Date | Mon, 22 Jul 2019 08:50:07 GMT |
Transfer-Encoding | chunked |
X-Via-JSL | 0038d5f,- |
Set-Cookie | __jsluid_h=d483c025a673d00fc718f5748aa38451; max-age=31536000; path=/; HttpOnly |
Proxy-Connection | keep-alive |
response返回的状态码为521,并且其Set-Cookie只传递了__jsluid_h这一个值。经分析,还有一个__jsl_clearance是由上面那段js代码生成的,要同时携带这两个cookie值一起访问,才能访问得到。
反防护机制的破解:
返回这段js代码后,咱们要先把它整理一下,整理成可执行的js代码如下:
// function getJLS(){
// var x= "40@Ix6@1563781216@@try@D@for@div@search@innerHTML@@Jul@charAt@var@document@6@@rOm9XFMtA3QKV7nYsPGT4lifyWwkq5vcjH2IdxUoCbhERLaz81DNB6@join@split@@while@length@@captcha@addEventListener@@callP@@charCodeAt@Array@@@__jsl_clearance@@https@@2@replace@0xEDB88320@@onreadystatechange@Expires@f@d@String@if@@@a@@challenge@1500@@function@@0xFF@08@@U9xtQft@@DOMContentLoaded@@@eval@@0@JgSe0upZ@@attachEvent@fromCharCode@g@@window@@cookie@@substr@@@match@@c@href@@1@Path@GMT@@@hantom@22@setTimeout@else@@toString@SmHF@@toLowerCase@createElement@Mon@RegExp@@8@16@new@@return@@reverse@@@Grq@firstChild@09@19@@pathname@e@36@@chars@@catch@false@@parseInt@location".replace(/@*$/,"").split("@"),
// y="12 2b=47(){79('a8.70=a8.9a+a8.9.33(/[\\?|&]21-44/,\\'\\')',45);13.64='2a=3.97|57|'+(47(){12 6a=[47(2b){90 2b},47(6a){90 6a},47(2b){90 55('3a.5b('+2b+')')}],82=[(-~!!62['24'+'77']+[]+[[]][57])+[~~[]],[88],[-~-~~~{}-~-~~~{}],[14],[(-~[]+[~~[]])/[-~-~~~{}]],[~~[]],((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[]),[-~[]+[(+!{})]-(-~[])],(-~!!62['24'+'77']+[]+[[]][57])+(-~!!62['24'+'77']+[]+[[]][57]),[-~~~{}-~~~{}],(-~!!62['24'+'77']+[]+[[]][57]),((-~{}|(-~!!62['24'+'77']<<-~!!62['24'+'77']))+[])];7(12 2b=57;2b<82.1b;2b++){82[2b]=6a[[72,32,72,57,72,57,72,32,72,32,57,72][2b]]([[14],((-~{}|(-~!!62['24'+'77']<<-~!!62['24'+'77']))+[]),[((-~{}|(-~!!62['24'+'77']<<-~!!62['24'+'77']))+[])+((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[])],'6','50','95',({}+[[]][57]).11(-~~~{})+[!~~''+[]+[[]][57]][57].11((+!{})),'81',[(-~!!62['24'+'77']+[]+[[]][57])+(-~!!62['24'+'77']+[]+[[]][57])+[88],(-~!!62['24'+'77']+[]+[[]][57])+(-~!!62['24'+'77']+[]+[[]][57])+((-~{}|(-~!!62['24'+'77']<<-~!!62['24'+'77']))+[]),[88]+[~~[]],[88]+[-~-~~~{}-~-~~~{}],[88]+((-~{}|(-~!!62['24'+'77']<<-~!!62['24'+'77']))+[])],[((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[])+[-~~~{}-~~~{}]],'2','6b'][82[2b]])};90 82.17('')})()+';37=85, 78-10-98 4a:1:89 74;73=/;'};3b((47(){5{90 !!62.22;}a4(9b){90 a5;}})()){13.22('52',2b,a5)}7a{13.5a('36',2b)}",
// f=function(x,y){
// var a=0,
// b=0,
// c=0;
// x=x.split("");
// y=y||99;
// while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;
// return c
// },z=f(y.match(/\w/g).sort(function(x,y){
// return f(x)-f(y)
// }).pop());
// while(z++)
// try{
// eval(y.replace(/\b\w+\b/g,
// function(y){
// return x[f(y,z)-1]||("_"+y)}));
// break
// }catch(_){
// }
// }
构造过后,懂js的肯定一眼就能看到下面这段关键代码(原始代码中这里是eval,下面这段已经按后文的做法把eval替换成了return):
try{ return(y.replace(/\b\w+\b/g, function(y){ return x[f(y,z)-1]||("_"+y)})); break }catch(_){ }
eval就是执行解密后的JS代码,首先我们想办法把解密后的代码提取出来:
将上面那段构造好的js代码里的eval替换为return,用execjs来执行这段代码,于是正常代码就出来了(如下):
var _2b=function(){setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);document.cookie='__jsl_clearance=1563781216.09|0|'+(function(){var _6a=[function(_2b){return _2b},function(_6a){return _6a},function(_2b){return eval('String.fromCharCode('+_2b+')')}],_82=[(-~!!window['callP'+'hantom']+[]+[[]][0])+[~~[]],[8],[-~-~~~{}-~-~~~{}],[6],[(-~[]+[~~[]])/[-~-~~~{}]],[~~[]],((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[]),[-~[]+[(+!{})]-(-~[])],(-~!!window['callP'+'hantom']+[]+[[]][0])+(-~!!window['callP'+'hantom']+[]+[[]][0]),[-~~~{}-~~~{}],(-~!!window['callP'+'hantom']+[]+[[]][0]),((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[])];for(var _2b=0;_2b<_82.length;_2b++){_82[_2b]=_6a[[1,2,1,0,1,0,1,2,1,2,0,1][_2b]]([[6],((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[]),[((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[])+((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[])],'D','U9xtQft','Grq',({}+[[]][0]).charAt(-~~~{})+[!~~''+[]+[[]][0]][0].charAt((+!{})),'SmHF',[(-~!!window['callP'+'hantom']+[]+[[]][0])+(-~!!window['callP'+'hantom']+[]+[[]][0])+[8],(-~!!window['callP'+'hantom']+[]+[[]][0])+(-~!!window['callP'+'hantom']+[]+[[]][0])+((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[]),[8]+[~~[]],[8]+[-~-~~~{}-~-~~~{}],[8]+((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[])],[((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[])+[-~~~{}-~~~{}]],'Ix6','c'][_82[_2b]])};return _82.join('')})()+';Expires=Mon, 22-Jul-19 08:40:16 GMT;Path=/;'};if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',_2b,false)}else{document.attachEvent('onreadystatechange',_2b)}
然后 咱们从里面找到咱们所要的__jsl_clearance这个参数, 就是下面这段:
'__jsl_clearance=1563781216.09|0|'+(function(){ var _6a=[function(_2b){ return _2b },function(_6a){ return _6a },function(_2b){ return eval('String.fromCharCode('+_2b+')') }],_82=[(-~!!window['callP'+'hantom']+[]+[[]][0])+[~~[]],[8],[-~-~~~{}-~-~~~{}],[6],[(-~[]+[~~[]])/[-~-~~~{}]],[~~[]],((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[]),[-~[]+[(+!{})]-(-~[])],(-~!!window['callP'+'hantom']+[]+[[]][0])+(-~!!window['callP'+'hantom']+[]+[[]][0]),[-~~~{}-~~~{}],(-~!!window['callP'+'hantom']+[]+[[]][0]),((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[])]; for(var _2b=0;_2b<_82.length;_2b++){_82[_2b]=_6a[[1,2,1,0,1,0,1,2,1,2,0,1][_2b]]([[6],((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[]),[((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[])+((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[])],'D','U9xtQft','Grq',({}+[[]][0]).charAt(-~~~{})+[!~~''+[]+[[]][0]][0].charAt((+!{})),'SmHF',[(-~!!window['callP'+'hantom']+[]+[[]][0])+(-~!!window['callP'+'hantom']+[]+[[]][0])+[8],(-~!!window['callP'+'hantom']+[]+[[]][0])+(-~!!window['callP'+'hantom']+[]+[[]][0])+((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[]),[8]+[~~[]],[8]+[-~-~~~{}-~-~~~{}],[8]+((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[])],[((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[])+[-~~~{}-~~~{}]],'Ix6','c'][_82[_2b]])}; return _82.join('')}
由此可以看出,__jsl_clearance是由两部分组成的:前面一段固定的字符串,和后面那段js代码生成的字符串。咱们主要看后面这段,把它构造一下,构造成可执行的js代码如下:
/**
 * Computes the dynamic suffix of the `__jsl_clearance` cookie for one captured
 * challenge response.
 *
 * The body is the obfuscated generator extracted from the decoded challenge
 * script, kept token-for-token so it stays comparable with the server output.
 * It builds a 12-entry index list (`_82`), then replaces each entry with a
 * fragment looked up in a 12-entry table and passed through one of three
 * decoders (`_6a`), and finally joins the fragments.
 *
 * The challenge is regenerated per request, so this exact body is only valid
 * for the response it was extracted from.
 *
 * @returns {string} The joined fragments; for this captured challenge the
 *   result is "Ix6vqPTSU9xtQftotGrq6SmHFHc%3D".
 */
function _2b() {
  // Function-local stub for the browser `window` object. The challenge only
  // probes `window['callPhantom']` (a headless-browser check); with an empty
  // object the probe is undefined, so `-~!!window[...]` evaluates to 1.
  // Declared with `var` so no global is created or overwritten and the
  // function also runs in strict mode / ES modules.
  var window = {};

  // Decoders: two identity functions and one that turns a list of character
  // codes into a string.
  // NOTE(review): `eval` is inherited from the server-generated challenge.
  // Its argument is built only from the literals inside this function, but
  // never run an unreviewed challenge body outside a sandbox.
  var _6a = [
    function (_2b) { return _2b },
    function (_6a) { return _6a },
    function (_2b) { return eval('String.fromCharCode(' + _2b + ')') }
  ],
  // Obfuscated indices into the lookup table below; each entry coerces to a
  // decimal string such as "10" or "8" when used as a property key.
  _82 = [
    (-~!!window['callP'+'hantom']+[]+[[]][0])+[~~[]],
    [8],
    [-~-~~~{}-~-~~~{}],
    [6],
    [(-~[]+[~~[]])/[-~-~~~{}]],
    [~~[]],
    ((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[]),
    [-~[]+[(+!{})]-(-~[])],
    (-~!!window['callP'+'hantom']+[]+[[]][0])+(-~!!window['callP'+'hantom']+[]+[[]][0]),
    [-~~~{}-~~~{}],
    (-~!!window['callP'+'hantom']+[]+[[]][0]),
    ((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[])
  ];

  // The inner `var _2b` is the loop counter; it shadows the function name,
  // exactly as in the generated code.
  for (var _2b = 0; _2b < _82.length; _2b++) {
    // The first array picks the decoder for position `_2b`; the second array
    // is the fragment table, indexed by the not-yet-replaced `_82[_2b]`.
    _82[_2b] = _6a[[1,2,1,0,1,0,1,2,1,2,0,1][_2b]]([
      [6],
      ((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[]),
      [((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[])+((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[])],
      'D',
      'U9xtQft',
      'Grq',
      ({}+[[]][0]).charAt(-~~~{})+[!~~''+[]+[[]][0]][0].charAt((+!{})),
      'SmHF',
      [(-~!!window['callP'+'hantom']+[]+[[]][0])+(-~!!window['callP'+'hantom']+[]+[[]][0])+[8],(-~!!window['callP'+'hantom']+[]+[[]][0])+(-~!!window['callP'+'hantom']+[]+[[]][0])+((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[]),[8]+[~~[]],[8]+[-~-~~~{}-~-~~~{}],[8]+((-~{}|(-~!!window['callP'+'hantom']<<-~!!window['callP'+'hantom']))+[])],
      [((-~{}+[(-~~~{}-~~~{})*[-~~~{}-~~~{}]]>>-~{})+[]+[])+[-~~~{}-~~~{}]],
      'Ix6',
      'c'
    ][_82[_2b]]);
  }

  return _82.join('');
}
由于这段js代码里用到了window对象,而execjs的执行环境里没有window,所以咱们在构造的时候先给它定义一个window对象,然后再用execjs执行一下,OK,__jsl_clearance就出来了。
到此加速乐的防护就顺利破解了,有不懂的地方,欢迎提问。