java做爬虫解决521错误

欢迎访问github

 

最近做爬虫时碰到了521错误。5开头(5xx)的状态码都表示服务器端错误,而521错误有很大可能是请求头参数不对,比如下面这个

这是错误的

这是正确的

就是这一堆东西可能有哪些少了或是错误,而在爬虫中遇到这个错误,又极大可能是少了Cookie参数,比如在这里就是少了Cookie参数。

展开那个Cookie

Cookie: yd_cookie=a26be905-40bb-4e4b52bf7b24f2580a068ce65463cbf5a91d; _ydclearance=36aa3e07d8ff72cc2c52b076-2862-4bac-b1b0-9850d839acc0-1544419245

可以看到就是两个字段,一个yd_cookie,还有一个_ydclearance。那不就是少这两个参数嘛,yd_cookie可以看到在第一次错误的响应头中已经有了

Set-Cookie: yd_cookie=a26be905-40bb-4e4b52bf7b24f2580a068ce65463cbf5a91d; Expires=1544419245; Path=/; HttpOnly

那只要用正则截取一下就好了,这样就只剩下一个_ydclearance。这个怎么解决呢?首先用各种拦截请求的工具(我用的是Firefox),直接看请求

可以看到实际上请求了两次,第一次就是521错误,第二次才是200的正确返回。

看下521错误的返回

<html><body><script language = "javascript">
window.onload = setTimeout("fp(38)", 200);
/**
 * Decodes the byte table `oo` into a string of JavaScript and executes it.
 *
 * @param {number} PD - XOR key applied to every byte that is emitted into the decoded string.
 */
function fp(PD) {
	// `mo` and `no` are declared but never read inside this function.
	var qo, mo = "",
		no = "",
		// Encoded payload; decoded in place by the three passes below.
		oo = [0x9a, 0x6f, 0x28, 0xea, 0xe9, 0xeb, 0x70, 0x71, 0xd3, 0xf4, 0x5d, 0x20, 0x1e, 0x60, 0xa2, 0x64, 0xe5, 0xe8, 0xca, 0xd2, 0x1c, 0x58, 0xd7, 0xfa, 0x19, 0xdb, 0xfd, 0x00, 0x81, 0x62, 0x7f, 0x02, 0x64, 0x25, 0x28, 0x30, 0x50, 0x17, 0x80, 0x01, 0x83, 0x4c, 0xbd, 0xc0, 0x67, 0xb0, 0xd1, 0x79, 0xbc, 0x1e, 0x7f, 0xc8, 0xaf, 0x72, 0x80, 0x41, 0x2a, 0xec, 0xf3, 0xdc, 0xe3, 0xc6, 0x6d, 0xb6, 0x1e, 0x28, 0x14, 0xfc, 0xa4, 0x0c, 0xf4, 0xfe, 0x24, 0x4b, 0x2e, 0xb0, 0x71, 0x7b, 0x7e, 0x5f, 0xe7, 0xca, 0x71, 0x7b, 0xb2, 0x3a, 0xe1, 0xea, 0x91, 0xb4, 0xd4, 0x7c, 0x43, 0xcc, 0x4d, 0x10, 0x88, 0x49, 0xf1, 0xfb, 0x84, 0x8b, 0x6f, 0x98, 0xbf, 0xe8, 0x6f, 0xf8, 0x69, 0x52, 0x79, 0x82, 0x4a, 0xf4, 0x95, 0x98, 0x38, 0xd8, 0x5a, 0x39, 0xeb, 0xee, 0xad, 0xb5, 0xb3, 0xf5, 0x5a, 0xbc, 0xe6, 0x8f, 0x18, 0xc0, 0x58, 0x62, 0x7f, 0x82, 0x43, 0x4d, 0x2d, 0xb5, 0x5d, 0x08, 0xb0, 0xb7, 0x8a, 0x72, 0x59, 0x02, 0xe9, 0x11, 0x55, 0x5e, 0x07, 0x46, 0x44, 0x5f, 0xbf, 0x87, 0x32, 0x53, 0x95, 0x97, 0xa5, 0x28, 0xa9, 0x0c, 0x14, 0x7e, 0x87, 0xf0, 0x58, 0xd9, 0x7a, 0xe4, 0xa4, 0x65, 0xc7, 0x8f, 0x3a, 0xda, 0x00, 0x81, 0xa2, 0x44, 0x4c, 0x96, 0x70, 0xb9, 0x81, 0x2c, 0x6b, 0xee, 0x8e, 0xf0, 0x12, 0x54, 0x93, 0xfd, 0x3f, 0x62, 0xa4, 0x65, 0x66, 0x68, 0xbb, 0xbe, 0x1f, 0x40, 0xaa, 0xcc, 0x1a, 0x5c, 0x1d, 0x9f, 0xc0, 0x42, 0x92, 0xd4, 0x35, 0x3d, 0x60, 0xa2, 0xac, 0x6d, 0x6e, 0x70, 0x71, 0xd3, 0xc6, 0xe5, 0x50, 0x4b, 0x28, 0x46, 0xfb, 0x3b];
	// Pass 1, hidden in a string and run through a direct eval so it operates on the local
	// `oo`/`qo`: for indices 234 down to 2, negate the byte (mod 256), rotate it right by
	// one bit within 8 bits, then subtract 141 (mod 256).
	qo = "qo=234; do{oo[qo]=(-oo[qo])&0xff; oo[qo]=(((oo[qo]>>1)|((oo[qo]<<7)&0xff))-141)&0xff;} while(--qo>=2);";
	eval(qo);
	// Pass 2: for indices 233 down to 3, subtract the preceding element (mod 256). The loop
	// runs downwards, so oo[qo - 1] has not yet been modified by this pass when it is read.
	qo = 233;
	do {
		oo[qo] = (oo[qo] - oo[qo - 1]) & 0xff;
	} while (--qo >= 3);
	// Pass 3: for indices 1..233, add 197 and then 240 (each mod 256) and swap the two nibbles.
	qo = 1;
	for(;;) {
		if(qo > 233) break;
		oo[qo] = ((((((oo[qo] + 197) & 0xff) + 240) & 0xff) << 4) & 0xff) | (((((oo[qo] + 197) & 0xff) + 240) & 0xff) >> 4);
		qo++;
	}
	// `po` is assigned without a declaration in this function.
	po = "";
	// Emit every byte from index 1 to oo.length - 2 whose index is not a multiple of 6,
	// XOR-ed with PD, as one character of the decoded string.
	for(qo = 1; qo < oo.length - 1; qo++)
		if(qo % 6) po += String.fromCharCode(oo[qo] ^ PD);
	// Stores `eval` in `qo` and calls it with `po`, i.e. executes the decoded string.
	eval("qo=eval;qo(po);");
} 
</script> </body></html>

一堆和密码一样的js代码,不管他,看最后一句

eval("qo=eval;qo(po);");

这句绕来绕去实际上就是eval(po),提取出这个方法

/**
 * Decodes the byte table `oo` into a string and returns it instead of executing it.
 *
 * @param {number} PD - XOR key applied to every byte that is emitted into the decoded string.
 * @returns {string} The decoded string `po`.
 */
function fp(PD) {
	// `mo` and `no` are declared but never read inside this function.
	var qo, mo = "",
		no = "",
		// Encoded payload; decoded in place by the three passes below.
		oo = [0x9a, 0x6f, 0x28, 0xea, 0xe9, 0xeb, 0x70, 0x71, 0xd3, 0xf4, 0x5d, 0x20, 0x1e, 0x60, 0xa2, 0x64, 0xe5, 0xe8, 0xca, 0xd2, 0x1c, 0x58, 0xd7, 0xfa, 0x19, 0xdb, 0xfd, 0x00, 0x81, 0x62, 0x7f, 0x02, 0x64, 0x25, 0x28, 0x30, 0x50, 0x17, 0x80, 0x01, 0x83, 0x4c, 0xbd, 0xc0, 0x67, 0xb0, 0xd1, 0x79, 0xbc, 0x1e, 0x7f, 0xc8, 0xaf, 0x72, 0x80, 0x41, 0x2a, 0xec, 0xf3, 0xdc, 0xe3, 0xc6, 0x6d, 0xb6, 0x1e, 0x28, 0x14, 0xfc, 0xa4, 0x0c, 0xf4, 0xfe, 0x24, 0x4b, 0x2e, 0xb0, 0x71, 0x7b, 0x7e, 0x5f, 0xe7, 0xca, 0x71, 0x7b, 0xb2, 0x3a, 0xe1, 0xea, 0x91, 0xb4, 0xd4, 0x7c, 0x43, 0xcc, 0x4d, 0x10, 0x88, 0x49, 0xf1, 0xfb, 0x84, 0x8b, 0x6f, 0x98, 0xbf, 0xe8, 0x6f, 0xf8, 0x69, 0x52, 0x79, 0x82, 0x4a, 0xf4, 0x95, 0x98, 0x38, 0xd8, 0x5a, 0x39, 0xeb, 0xee, 0xad, 0xb5, 0xb3, 0xf5, 0x5a, 0xbc, 0xe6, 0x8f, 0x18, 0xc0, 0x58, 0x62, 0x7f, 0x82, 0x43, 0x4d, 0x2d, 0xb5, 0x5d, 0x08, 0xb0, 0xb7, 0x8a, 0x72, 0x59, 0x02, 0xe9, 0x11, 0x55, 0x5e, 0x07, 0x46, 0x44, 0x5f, 0xbf, 0x87, 0x32, 0x53, 0x95, 0x97, 0xa5, 0x28, 0xa9, 0x0c, 0x14, 0x7e, 0x87, 0xf0, 0x58, 0xd9, 0x7a, 0xe4, 0xa4, 0x65, 0xc7, 0x8f, 0x3a, 0xda, 0x00, 0x81, 0xa2, 0x44, 0x4c, 0x96, 0x70, 0xb9, 0x81, 0x2c, 0x6b, 0xee, 0x8e, 0xf0, 0x12, 0x54, 0x93, 0xfd, 0x3f, 0x62, 0xa4, 0x65, 0x66, 0x68, 0xbb, 0xbe, 0x1f, 0x40, 0xaa, 0xcc, 0x1a, 0x5c, 0x1d, 0x9f, 0xc0, 0x42, 0x92, 0xd4, 0x35, 0x3d, 0x60, 0xa2, 0xac, 0x6d, 0x6e, 0x70, 0x71, 0xd3, 0xc6, 0xe5, 0x50, 0x4b, 0x28, 0x46, 0xfb, 0x3b];
	// Pass 1, hidden in a string and run through a direct eval so it operates on the local
	// `oo`/`qo`: for indices 234 down to 2, negate the byte (mod 256), rotate it right by
	// one bit within 8 bits, then subtract 141 (mod 256).
	qo = "qo=234; do{oo[qo]=(-oo[qo])&0xff; oo[qo]=(((oo[qo]>>1)|((oo[qo]<<7)&0xff))-141)&0xff;} while(--qo>=2);";
	eval(qo);
	// Pass 2: for indices 233 down to 3, subtract the preceding element (mod 256). The loop
	// runs downwards, so oo[qo - 1] has not yet been modified by this pass when it is read.
	qo = 233;
	do {
		oo[qo] = (oo[qo] - oo[qo - 1]) & 0xff;
	} while (--qo >= 3);
	// Pass 3: for indices 1..233, add 197 and then 240 (each mod 256) and swap the two nibbles.
	qo = 1;
	for(;;) {
		if(qo > 233) break;
		oo[qo] = ((((((oo[qo] + 197) & 0xff) + 240) & 0xff) << 4) & 0xff) | (((((oo[qo] + 197) & 0xff) + 240) & 0xff) >> 4);
		qo++;
	}
	// `po` is assigned without a declaration in this function.
	po = "";
	// Emit every byte from index 1 to oo.length - 2 whose index is not a multiple of 6,
	// XOR-ed with PD, as one character of the decoded string.
	for(qo = 1; qo < oo.length - 1; qo++)
		if(qo % 6) po += String.fromCharCode(oo[qo] ^ PD);
	// Hand the decoded string back to the caller rather than running it.
	return po;
}
// Invoke the decoder with the key 38.
fp(38);

把它放到浏览器中执行下

"document.cookie='_ydclearance=36aa3e07d8ff72cc2c52b076-2862-4bac-b1b0-9850d839acc0-1544419245; expires=Mon, 10-Dec-18 05:20:45 GMT; domain=.66ip.cn; path=/'; window.document.location=document.URL"

_ydclearance终于出来了,现在只要把_ydclearance和yd_cookie拼起来set到Cookie中就可以了。

 

JAVA代码

/**
 * Solves the JavaScript cookie challenge that is answered with HTTP status 521 and stores
 * the resulting cookie string in {@code Constant.COOKIE}.
 */
public class HandleCrawler {

    /** Captures the yd_cookie value up to the next ';' (or the end of the header value). */
    private static final Pattern YD_COOKIE_PATTERN = Pattern.compile("\\byd_cookie=([^;]+)");

    /** Captures the _ydclearance value up to the next ';' inside the decoded script. */
    private static final Pattern YDCLEARANCE_PATTERN = Pattern.compile("_ydclearance=([^;]+)");

    /**
     * Captures the call expression scheduled through window.onload, e.g. {@code fp(38)}.
     * Whitespace around the tokens is optional so that both the minified and the
     * pretty-printed form of the challenge page are recognised.
     */
    private static final Pattern RUN_STRING_PATTERN =
            Pattern.compile("window\\.onload\\s*=\\s*setTimeout\\(\\s*\"(.+?)\"\\s*,\\s*\\d+\\s*\\)");

    /** The statement in the challenge script that executes the decoded string. */
    private static final String EXECUTE_STATEMENT = "eval(\"qo=eval;qo(po);\")";

    /**
     * Requests {@code Constant.proxyUrl}; when the answer is a 521 challenge, decodes it and
     * writes "yd_cookie=...; _ydclearance=..." to {@code Constant.COOKIE}. Any other status
     * code leaves {@code Constant.COOKIE} untouched.
     *
     * @throws IOException     if the request fails or the challenge page does not have the
     *                         expected shape (missing cookie, script or clearance value)
     * @throws ScriptException if no JavaScript engine is available or the script fails to run
     */
    public static void setCookie() throws IOException, ScriptException {
        // try-with-resources: the response must be closed to release the connection.
        try (CloseableHttpResponse response = ApacheHttpUtil.sendGet(Constant.proxyUrl)) {
            if (response.getStatusLine().getStatusCode() != 521) {
                return;
            }

            String yd_cookie = getYdCookie(response.getAllHeaders());
            if (yd_cookie == null) {
                throw new IOException("521 response carries no yd_cookie in its Set-Cookie headers");
            }
            ProxyRequest.logger.info("yd_cookie is :"+yd_cookie);

            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("521 response has no body to decode");
            }
            String html = EntityUtils.toString(entity, "utf-8");
            String fuction = buildScript(html);
            ProxyRequest.logger.info("fuction is :"+fuction);

            // NOTE(security): this executes JavaScript supplied by the remote server inside
            // this JVM. Only use it against hosts you trust, or sandbox the engine.
            ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
            if (engine == null) {
                // getEngineByName returns null when the runtime ships no JavaScript engine.
                throw new ScriptException("No JavaScript script engine is available");
            }
            Object result = engine.eval(fuction);
            if (result == null) {
                throw new IOException("Challenge script did not return a value");
            }
            String origin = result.toString();
            ProxyRequest.logger.info("origin ydclearance is :"+origin);

            String ydclearance = getYdclearance(origin);
            if (ydclearance == null) {
                throw new IOException("Decoded challenge script contains no _ydclearance value");
            }
            ProxyRequest.logger.info("ydclearance is :"+ydclearance);

            Constant.COOKIE = "yd_cookie="+yd_cookie+"; _ydclearance="+ydclearance;
        }
    }

    /**
     * Turns the challenge page into a script whose last expression yields the decoded string:
     * the function definition with its final eval replaced by {@code return po}, followed by
     * the call the page would have scheduled through window.onload.
     */
    private static String buildScript(String html) throws IOException {
        String runString = getRunString(html);
        int start = html.indexOf("function");
        int end = start < 0 ? -1 : html.indexOf("</script>", start);
        if (runString == null || end < 0) {
            throw new IOException("521 response does not contain the expected challenge script");
        }
        String body = html.substring(start, end);
        if (!body.contains(EXECUTE_STATEMENT)) {
            throw new IOException("Challenge script does not end with the expected eval statement");
        }
        return body.replace(EXECUTE_STATEMENT, "return po") + runString + ";";
    }

    /**
     * Returns the yd_cookie value from the first Set-Cookie header that carries one,
     * or {@code null} when no such header exists.
     */
    private static String getYdCookie(Header[] headers){
        if (headers == null) {
            return null;
        }
        for (Header header : headers) {
            // Header names are case-insensitive.
            if (!"Set-Cookie".equalsIgnoreCase(header.getName()) || header.getValue() == null) {
                continue;
            }
            Matcher matcher = YD_COOKIE_PATTERN.matcher(header.getValue());
            if (matcher.find()) {
                return matcher.group(1);
            }
        }
        return null;
    }

    /** Returns the _ydclearance value found in the decoded script, or {@code null} if absent. */
    private static String getYdclearance(String origin){
        if (origin == null) {
            return null;
        }
        Matcher matcher = YDCLEARANCE_PATTERN.matcher(origin);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Returns the expression passed to setTimeout in the window.onload assignment
     * (for example {@code fp(38)}), or {@code null} when the page has none.
     */
    private static String getRunString(String html){
        if (html == null) {
            return null;
        }
        Matcher matcher = RUN_STRING_PATTERN.matcher(html);
        return matcher.find() ? matcher.group(1) : null;
    }
}


 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值