欢迎访问github
最近做爬虫时碰到了521错误。5开头的状态码(5xx)都是服务器端错误,而521错误有很大可能是请求头参数不对,比如下面这个
这是错误的
这是正确的
就是这一堆东西可能有哪些少了或是错误,而在爬虫中遇到这个错误,又极大可能是少了Cookie参数,比如在这里就是少了Cookie参数。
展开那个Cookie
Cookie: yd_cookie=a26be905-40bb-4e4b52bf7b24f2580a068ce65463cbf5a91d; _ydclearance=36aa3e07d8ff72cc2c52b076-2862-4bac-b1b0-9850d839acc0-1544419245
可以看到就是两个字段,一个yd_cookie,还有一个_ydclearance。那不就是少这两个参数嘛,yd_cookie可以看到在第一次错误的响应头中已经有了
Set-Cookie: yd_cookie=a26be905-40bb-4e4b52bf7b24f2580a068ce65463cbf5a91d; Expires=1544419245; Path=/; HttpOnly
那只要用正则截取一下就好了,这样就只剩下一个_ydclearance。这个怎么解决呢?首先用各种拦截请求的工具抓包(我用的是Firefox),直接看请求
可以看到实际上请求了两次,第一次就是521错误,第二次才是200的正确返回。
看下521错误的返回
<html><body><script language = "javascript">
window.onload = setTimeout("fp(38)", 200);
// NOTE: the // comments in this listing are annotations added for readability;
// the executable statements are unchanged.
//
// fp(PD): decodes the byte table `oo` into a string of JavaScript source and
// evaluates it. PD is the key XORed into every emitted byte.
// `mo` and `no` are declared but never used in this function.
function fp(PD) {
var qo, mo = "",
no = "",
oo = [0x9a, 0x6f, 0x28, 0xea, 0xe9, 0xeb, 0x70, 0x71, 0xd3, 0xf4, 0x5d, 0x20, 0x1e, 0x60, 0xa2, 0x64, 0xe5, 0xe8, 0xca, 0xd2, 0x1c, 0x58, 0xd7, 0xfa, 0x19, 0xdb, 0xfd, 0x00, 0x81, 0x62, 0x7f, 0x02, 0x64, 0x25, 0x28, 0x30, 0x50, 0x17, 0x80, 0x01, 0x83, 0x4c, 0xbd, 0xc0, 0x67, 0xb0, 0xd1, 0x79, 0xbc, 0x1e, 0x7f, 0xc8, 0xaf, 0x72, 0x80, 0x41, 0x2a, 0xec, 0xf3, 0xdc, 0xe3, 0xc6, 0x6d, 0xb6, 0x1e, 0x28, 0x14, 0xfc, 0xa4, 0x0c, 0xf4, 0xfe, 0x24, 0x4b, 0x2e, 0xb0, 0x71, 0x7b, 0x7e, 0x5f, 0xe7, 0xca, 0x71, 0x7b, 0xb2, 0x3a, 0xe1, 0xea, 0x91, 0xb4, 0xd4, 0x7c, 0x43, 0xcc, 0x4d, 0x10, 0x88, 0x49, 0xf1, 0xfb, 0x84, 0x8b, 0x6f, 0x98, 0xbf, 0xe8, 0x6f, 0xf8, 0x69, 0x52, 0x79, 0x82, 0x4a, 0xf4, 0x95, 0x98, 0x38, 0xd8, 0x5a, 0x39, 0xeb, 0xee, 0xad, 0xb5, 0xb3, 0xf5, 0x5a, 0xbc, 0xe6, 0x8f, 0x18, 0xc0, 0x58, 0x62, 0x7f, 0x82, 0x43, 0x4d, 0x2d, 0xb5, 0x5d, 0x08, 0xb0, 0xb7, 0x8a, 0x72, 0x59, 0x02, 0xe9, 0x11, 0x55, 0x5e, 0x07, 0x46, 0x44, 0x5f, 0xbf, 0x87, 0x32, 0x53, 0x95, 0x97, 0xa5, 0x28, 0xa9, 0x0c, 0x14, 0x7e, 0x87, 0xf0, 0x58, 0xd9, 0x7a, 0xe4, 0xa4, 0x65, 0xc7, 0x8f, 0x3a, 0xda, 0x00, 0x81, 0xa2, 0x44, 0x4c, 0x96, 0x70, 0xb9, 0x81, 0x2c, 0x6b, 0xee, 0x8e, 0xf0, 0x12, 0x54, 0x93, 0xfd, 0x3f, 0x62, 0xa4, 0x65, 0x66, 0x68, 0xbb, 0xbe, 0x1f, 0x40, 0xaa, 0xcc, 0x1a, 0x5c, 0x1d, 0x9f, 0xc0, 0x42, 0x92, 0xd4, 0x35, 0x3d, 0x60, 0xa2, 0xac, 0x6d, 0x6e, 0x70, 0x71, 0xd3, 0xc6, 0xe5, 0x50, 0x4b, 0x28, 0x46, 0xfb, 0x3b];
// Pass 1 (hidden inside an eval'd string): for indices 234 down to 2, negate
// the byte modulo 256, rotate it right by one bit, then subtract 141 (mod 256).
qo = "qo=234; do{oo[qo]=(-oo[qo])&0xff; oo[qo]=(((oo[qo]>>1)|((oo[qo]<<7)&0xff))-141)&0xff;} while(--qo>=2);";
eval(qo);
// Pass 2: for indices 233 down to 3, subtract the preceding element (mod 256).
// Walking downwards means oo[qo - 1] has not yet been touched by this pass.
qo = 233;
do {
oo[qo] = (oo[qo] - oo[qo - 1]) & 0xff;
} while (--qo >= 3);
// Pass 3: for indices 1..233, add 197 and then 240 (mod 256) and swap the
// high and low nibbles of the result.
qo = 1;
for(;;) {
if(qo > 233) break;
oo[qo] = ((((((oo[qo] + 197) & 0xff) + 240) & 0xff) << 4) & 0xff) | (((((oo[qo] + 197) & 0xff) + 240) & 0xff) >> 4);
qo++;
}
// `po` is assigned without a declaration, so in non-strict code it becomes a
// global variable.
po = "";
// Emit every element except index 0, the last index and the multiples of 6,
// XORed with PD.
for(qo = 1; qo < oo.length - 1; qo++)
if(qo % 6) po += String.fromCharCode(oo[qo] ^ PD);
// Aliases eval to qo and calls it on po, i.e. runs the decoded source string.
eval("qo=eval;qo(po);");
}
</script> </body></html>
一堆和密码一样的js代码,先不管它,看最后一句
eval("qo=eval;qo(po);");
这句绕来绕去实际上就是eval(po),提取出这个方法
/**
 * Decodes the obfuscated byte table from the 521 challenge page and returns
 * the JavaScript source it hides, instead of evaluating it.
 *
 * Compared with the function as served, this version declares every variable
 * locally (the served code assigns `po` without declaring it, which throws a
 * ReferenceError in strict mode / ES modules), runs the first pass as plain
 * code rather than through `eval`, and drops the unused `mo` / `no` locals.
 *
 * @param {number} PD - XOR key applied to every emitted byte (the page calls fp(38)).
 * @returns {string} The decoded source, e.g. a `document.cookie='_ydclearance=...'` statement.
 */
function fp(PD) {
  const oo = [0x9a, 0x6f, 0x28, 0xea, 0xe9, 0xeb, 0x70, 0x71, 0xd3, 0xf4, 0x5d, 0x20, 0x1e, 0x60, 0xa2, 0x64, 0xe5, 0xe8, 0xca, 0xd2, 0x1c, 0x58, 0xd7, 0xfa, 0x19, 0xdb, 0xfd, 0x00, 0x81, 0x62, 0x7f, 0x02, 0x64, 0x25, 0x28, 0x30, 0x50, 0x17, 0x80, 0x01, 0x83, 0x4c, 0xbd, 0xc0, 0x67, 0xb0, 0xd1, 0x79, 0xbc, 0x1e, 0x7f, 0xc8, 0xaf, 0x72, 0x80, 0x41, 0x2a, 0xec, 0xf3, 0xdc, 0xe3, 0xc6, 0x6d, 0xb6, 0x1e, 0x28, 0x14, 0xfc, 0xa4, 0x0c, 0xf4, 0xfe, 0x24, 0x4b, 0x2e, 0xb0, 0x71, 0x7b, 0x7e, 0x5f, 0xe7, 0xca, 0x71, 0x7b, 0xb2, 0x3a, 0xe1, 0xea, 0x91, 0xb4, 0xd4, 0x7c, 0x43, 0xcc, 0x4d, 0x10, 0x88, 0x49, 0xf1, 0xfb, 0x84, 0x8b, 0x6f, 0x98, 0xbf, 0xe8, 0x6f, 0xf8, 0x69, 0x52, 0x79, 0x82, 0x4a, 0xf4, 0x95, 0x98, 0x38, 0xd8, 0x5a, 0x39, 0xeb, 0xee, 0xad, 0xb5, 0xb3, 0xf5, 0x5a, 0xbc, 0xe6, 0x8f, 0x18, 0xc0, 0x58, 0x62, 0x7f, 0x82, 0x43, 0x4d, 0x2d, 0xb5, 0x5d, 0x08, 0xb0, 0xb7, 0x8a, 0x72, 0x59, 0x02, 0xe9, 0x11, 0x55, 0x5e, 0x07, 0x46, 0x44, 0x5f, 0xbf, 0x87, 0x32, 0x53, 0x95, 0x97, 0xa5, 0x28, 0xa9, 0x0c, 0x14, 0x7e, 0x87, 0xf0, 0x58, 0xd9, 0x7a, 0xe4, 0xa4, 0x65, 0xc7, 0x8f, 0x3a, 0xda, 0x00, 0x81, 0xa2, 0x44, 0x4c, 0x96, 0x70, 0xb9, 0x81, 0x2c, 0x6b, 0xee, 0x8e, 0xf0, 0x12, 0x54, 0x93, 0xfd, 0x3f, 0x62, 0xa4, 0x65, 0x66, 0x68, 0xbb, 0xbe, 0x1f, 0x40, 0xaa, 0xcc, 0x1a, 0x5c, 0x1d, 0x9f, 0xc0, 0x42, 0x92, 0xd4, 0x35, 0x3d, 0x60, 0xa2, 0xac, 0x6d, 0x6e, 0x70, 0x71, 0xd3, 0xc6, 0xe5, 0x50, 0x4b, 0x28, 0x46, 0xfb, 0x3b];

  // Pass 1: indices 234..2 — negate (mod 256), rotate right by one bit,
  // subtract 141 (mod 256). The served page hides this loop in an eval'd string.
  for (let i = 234; i >= 2; i--) {
    const negated = (-oo[i]) & 0xff;
    oo[i] = (((negated >> 1) | ((negated << 7) & 0xff)) - 141) & 0xff;
  }

  // Pass 2: indices 233..3 — subtract the preceding element (mod 256).
  // Must run downwards so oo[i - 1] is still the pass-1 value.
  for (let i = 233; i >= 3; i--) {
    oo[i] = (oo[i] - oo[i - 1]) & 0xff;
  }

  // Pass 3: indices 1..233 — add 197 then 240 (mod 256), then swap nibbles.
  for (let i = 1; i <= 233; i++) {
    const shifted = (((oo[i] + 197) & 0xff) + 240) & 0xff;
    oo[i] = ((shifted << 4) & 0xff) | (shifted >> 4);
  }

  // Emit everything except index 0, the last index and multiples of 6,
  // XORed with the key.
  let po = "";
  for (let i = 1; i < oo.length - 1; i++) {
    if (i % 6 !== 0) {
      po += String.fromCharCode(oo[i] ^ PD);
    }
  }
  return po;
}
fp(38);
把它放到浏览器中执行下
"document.cookie='_ydclearance=36aa3e07d8ff72cc2c52b076-2862-4bac-b1b0-9850d839acc0-1544419245; expires=Mon, 10-Dec-18 05:20:45 GMT; domain=.66ip.cn; path=/'; window.document.location=document.URL"
_ydclearance终于出来了,现在只要把_ydclearance和yd_cookie拼起来set到Cookie中就可以了。
JAVA代码
/**
 * Solves the JavaScript cookie challenge that the target site answers with
 * HTTP status 521, and stores the resulting cookie pair in {@code Constant.COOKIE}.
 *
 * The challenge response carries {@code yd_cookie} in a Set-Cookie header and an
 * obfuscated script which, once run, yields a {@code document.cookie='_ydclearance=...'}
 * statement. Both values are needed on the follow-up request.
 */
public class HandleCrawler {

    /** Captures the yd_cookie value up to the next ';'. */
    private static final Pattern YD_COOKIE = Pattern.compile("yd_cookie=([^;]+)");

    /** Captures the _ydclearance value up to the next ';'. */
    private static final Pattern YD_CLEARANCE = Pattern.compile("_ydclearance=([^;]+)");

    /**
     * Captures the call passed to setTimeout, e.g. {@code fp(38)}. Whitespace around
     * '=' and ',' is optional so both the raw and a pretty-printed page match.
     */
    private static final Pattern RUN_STRING =
            Pattern.compile("window\\.onload\\s*=\\s*setTimeout\\(\\s*\"(.+?)\"\\s*,\\s*\\d+\\s*\\)");

    /**
     * Requests {@code Constant.proxyUrl}; if the answer is a 521 challenge, solves it
     * and writes {@code "yd_cookie=...; _ydclearance=..."} to {@code Constant.COOKIE}.
     * Any other status code leaves {@code Constant.COOKIE} untouched.
     *
     * @throws IOException     if the request fails or the challenge page does not have
     *                         the expected shape
     * @throws ScriptException if no JavaScript engine is available or the challenge
     *                         script fails to evaluate
     */
    public static void setCookie() throws IOException, ScriptException {
        // try-with-resources: the response must be closed to release the connection.
        try (CloseableHttpResponse response = ApacheHttpUtil.sendGet(Constant.proxyUrl)) {
            if (response.getStatusLine().getStatusCode() != 521) {
                return;
            }

            String yd_cookie = getYdCookie(response.getAllHeaders());
            if (yd_cookie == null) {
                throw new IOException("521 response carries no yd_cookie in its Set-Cookie headers");
            }
            ProxyRequest.logger.info("yd_cookie is :"+yd_cookie);

            String html = EntityUtils.toString(response.getEntity(), "utf-8");
            String script = buildScript(html);
            ProxyRequest.logger.info("fuction is :"+script);

            // Obtain a JavaScript engine; getEngineByName returns null when none is
            // registered (e.g. JDK 15+ without a standalone Nashorn/Graal dependency).
            ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
            if (engine == null) {
                throw new ScriptException("No JavaScript ScriptEngine is available on this JVM");
            }

            // SECURITY NOTE(review): this evaluates script text supplied by the remote
            // server inside this JVM. Depending on the engine, such a script may be able
            // to reach Java classes; consider a sandboxed engine or a class filter.
            Object result = engine.eval(script);
            if (result == null) {
                throw new ScriptException("Challenge script produced no value");
            }
            String origin = result.toString();
            ProxyRequest.logger.info("origin ydclearance is :"+origin);

            String ydclearance = getYdclearance(origin);
            if (ydclearance == null) {
                throw new IOException("Decoded challenge script contains no _ydclearance value");
            }
            ProxyRequest.logger.info("ydclearance is :"+ydclearance);

            Constant.COOKIE = "yd_cookie="+yd_cookie+"; _ydclearance="+ydclearance;
        }
    }

    /**
     * Turns the challenge page into a script whose completion value is the decoded
     * source: everything from the first "function" up to the closing script tag, with
     * the final eval replaced by {@code return po}, followed by the original call.
     */
    private static String buildScript(String html) throws IOException {
        String runString = getRunString(html);
        int start = html.indexOf("function");
        int end = start < 0 ? -1 : html.indexOf("</script>", start);
        if (runString == null || end < 0) {
            throw new IOException("Unrecognised 521 challenge page");
        }
        return html.substring(start, end)
                .replace("eval(\"qo=eval;qo(po);\")", "return po")
                + runString + ";";
    }

    /**
     * Returns the yd_cookie value found in any Set-Cookie header, or {@code null}
     * when no such header carries it.
     */
    private static String getYdCookie(Header[] headers){
        if (headers == null) {
            return null;
        }
        for (Header header : headers) {
            // Header names are case-insensitive; the value may be absent.
            if (!"Set-Cookie".equalsIgnoreCase(header.getName()) || header.getValue() == null) {
                continue;
            }
            Matcher matcher = YD_COOKIE.matcher(header.getValue());
            if (matcher.find()) {
                return matcher.group(1);
            }
        }
        return null;
    }

    /**
     * Returns the _ydclearance value contained in the decoded script text, or
     * {@code null} when it is missing.
     */
    private static String getYdclearance(String origin){
        if (origin == null) {
            return null;
        }
        Matcher matcher = YD_CLEARANCE.matcher(origin);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Returns the expression the page schedules through setTimeout (for example
     * {@code fp(38)}), or {@code null} when the page has no such call.
     */
    private static String getRunString(String html){
        if (html == null) {
            return null;
        }
        Matcher matcher = RUN_STRING.matcher(html);
        return matcher.find() ? matcher.group(1) : null;
    }
}