工作任务:
今天老大让我爬取一个新闻网站:https://www.yidaiyilu.gov.cn/
踩坑记录:
- 该网站使用https协议,如果按普通http请求的方式去访问,会报出如下信息:
错误:SSLHandshake错误。看到这个错误就知道,客户端与服务端建立连接时,需要通过SSL协议进行握手
(坑)改用:继承DefaultHttpClient并在构造方法中注册https方案,使其支持SSL协议
package httpsParse;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
/**
 * HttpClient used for issuing HTTPS requests.
 *
 * <p><b>Security warning:</b> the trust manager installed by the constructor
 * accepts every client and server certificate chain without inspection, and
 * the socket factory is created with {@code ALLOW_ALL_HOSTNAME_VERIFIER}.
 * The server is therefore never authenticated; do not use this client for
 * traffic that must be protected against man-in-the-middle attacks.
 */
public class SSLClient extends DefaultHttpClient {

    /**
     * Builds the client and registers an "https" scheme on port 443 that is
     * backed by a TLSv1.2 context with a trust-everything trust manager.
     *
     * @throws Exception if the SSL context cannot be obtained or initialised
     */
    public SSLClient() throws Exception {
        super();
        // The transport protocol version has to be chosen to suit the target server.
        SSLContext ctx = SSLContext.getInstance("TLSv1.2");
        X509TrustManager tm = new X509TrustManager() {
            @Override
            public void checkClientTrusted(X509Certificate[] chain,
                    String authType) throws CertificateException {
                // Intentionally empty: every client chain is accepted.
            }

            @Override
            public void checkServerTrusted(X509Certificate[] chain,
                    String authType) throws CertificateException {
                // Intentionally empty: every server chain is accepted.
            }

            @Override
            public X509Certificate[] getAcceptedIssuers() {
                // The X509TrustManager contract requires a non-null (possibly
                // empty) array; returning null can cause a NullPointerException
                // in code that iterates over the accepted issuers.
                return new X509Certificate[0];
            }
        };
        // No key managers and the default SecureRandom; only the trust manager is customised.
        ctx.init(null, new TrustManager[]{tm}, null);
        SSLSocketFactory ssf = new SSLSocketFactory(ctx, SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
        // Register the factory on this client's own connection manager.
        ClientConnectionManager ccm = this.getConnectionManager();
        SchemeRegistry sr = ccm.getSchemeRegistry();
        sr.register(new Scheme("https", 443, ssf));
    }
}
(坑)然后再利用HttpClient去请求获取网页源代码:
/**
 * Demo entry point: requests a single page through {@code HttpClientUtil}
 * (declared elsewhere) and prints whatever text comes back to standard output.
 *
 * @param args unused
 * @throws Exception propagated from the request helper
 */
public static void main(String[] args) throws Exception {
    final String targetUrl = "https://www.yidaiyilu.gov.cn/zchj.htm";
    final HttpClientUtil client = new HttpClientUtil();
    final String pageHtml = client.doGet(targetUrl);
    System.out.println(pageHtml);
}
最后发现得到的结果并不是网页源码,而是一段js代码:
<script>var x="@catch@@@d@@toString@@String@@36@pathname@if@@toLowerCase@var@855@captcha@@Array@@@1@@for@1500@@document@@@@chars@attachEvent@addEventListener@substr@Expires@@false@f@0@fromCharCode@innerHTML@@@@8@@@@@@@split@parseInt@createElement@g@new@16@search@May@@https@@reverse@@RegExp@@while@@@charCodeAt@rOm9XFMtA3QKV7nYsPGT4lifyWwkq5vcjH2IdxUoCbhERLaz81DNB6@@10@JgSe0upZ@else@match@0xFF@@@07@length@@e@eval@@@19@@@Path@a@div@setTimeout@cookie@3@5@@0xEDB88320@@GMT@challenge@@@Tue@@@window@@href@return@try@@@@@location@onreadystatechange@function@1557242170@DOMContentLoaded@@firstChild@replace@__jsl_clearance@charAt@join@".replace(/@*$/,"").split("@"),y="g 3b=3q(){31('3o.3h=3o.c+3o.1s.40(/[\\?|&]i-39/,\\'\\')',q);s.32='41=3r.h|19|'+(3q(){g 1i=[3q(3b){3i 2n('9.1a('+3b+')')},(3q(){g 3b=s.1o('30');3b.1b='<2u 3h=\\'/\\'>3l</2u>';3b=3b.3u.3h;g 1i=3b.2f(/20?:\\/\\//)[19];3b=3b.14(1i.2k).f();3i 3q(1i){p(g 3l=19;3l<1i.2k;3l++){1i[3l]=3b.42(1i[3l])};3i 1i.43('')}})()],3l=[[([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+[-~~~!{}+[~~[]]-(-~~~!{})],(-~{}+[]+[[]][19])+[~~'']+[-~(+!+{})],[34]+(-~[-~{}-~{}]+[[]][19]),[-~{}-~[-~{}-~{}]]+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),(-~{}+[]+[[]][19])+(-~{}+[]+[[]][19])+[-~(+!+{})],(-~{}+[]+[[]][19])+(-~{}+[]+[[]][19])+[-~{}-~[-~{}-~{}]],[33-~(+!+{})-~(+!+{})]+(-~[-~{}-~{}]+[[]][19]),[34]+[-~(+!+{})],[-~~~!{}+[~~[]]-(-~~~!{})]+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),[33-~(+!+{})-~(+!+{})]+(-~[-~{}-~{}]+[[]][19]),(-~{}+[]+[[]][19])+[~~'']+[33-~(+!+{})-~(+!+{})]],[(-~{}+[]+[[]][19])+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),[33-~(+!+{})-~(+!+{})]],[[34]+[-~{}-~[-~{}-~{}]],(-~[-~{}-~{}]+[[]][19])+[33-~(+!+{})-~(+!+{})],[34]+[~~''],([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[]),([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),(-~{}+[]+[[]][19])+(-~{}+[]+[[]][19])+[34]],[(-~{}+[]+[[]][19])+[-~(+!+{})],([(
-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])],[[34]+(-~{}+[]+[[]][19]),(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19])+[~~''],[34]+[34],[34]+([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[]),[-~{}-~[-~{}-~{}]]+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19])],[(-~{}+[]+[[]][19])+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),(-~{}+[]+[[]][19])+[-~(+!+{})]],[([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+[-~~~!{}+[~~[]]-(-~~~!{})],(-~[-~{}-~{}]+[[]][19])+[33-~(+!+{})-~(+!+{})],[34]+(-~{}+[]+[[]][19]),([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19])]];p(g 3b=19;3b<3l.2k;3b++){3l[3b]=1i.22()[(-~{}+[]+[[]][19])](3l[3b])};3i 3l.43('')})()+';15=3c, 2j-1t-2q 1r:1r:2c 38;2t=/;'};d((3q(){3j{3i !!3f.13;}2(2m){3i 17;}})()){s.13('3s',3b,17)}2e{s.12('3p',3b)}",f=function(x,y){var a=0,b=0,c=0;x=x.split("");y=y||99;while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;return c},z=f(y.match(/\w/g).sort(function(x,y){return f(x)-f(y)}).pop());while(z++)try{eval(y.replace(/\b\w+\b/g, function(y){return x[f(y,z)-1]||("_"+y)}));break}catch(_){}</script>
- 开始怀疑是cookie的原因,于是把浏览器中的cookie带上再去请求,最后能请求出结果;但是cookie是有有效期的,隔一段时间就会失效,因此这种方法行不通
- 后来分析发现,浏览器访问该网站时首先会加载js并生成cookie,再将这次生成的cookie带在请求头里再次请求,这就是为什么上面第一次请求得到的是js代码;而js是动态加载的,因此需要利用java模拟浏览器的方式去实现
- 最终通过htmlunit实现的代码:
package cn.server;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
/**
 * Fetches page source through a shared {@link HtmlUnitDriver}.
 *
 * <p>The first fetch loads the URL once with JavaScript enabled (intended to
 * let the site's script set its cookie on the driver session) and then loads
 * it again with JavaScript disabled to read the page source. Later fetches
 * skip the JavaScript pass until {@link #renewIsGetCookie()} is called.
 *
 * <p>All state is static and unsynchronised.
 */
public class GFDynamicWeb {
    /** Driver shared by every call; its session carries state between loads. */
    public static HtmlUnitDriver driver = new HtmlUnitDriver();

    /** {@code true} once a fetch has completed, so the JavaScript pass is skipped. */
    public static boolean isGetCookie = false;

    /**
     * Returns the page source of {@code url}.
     *
     * @param url address to load
     * @return the page source reported by the driver after the JavaScript-disabled load
     */
    public static String GetContent(String url) {
        if (!isGetCookie) {
            driver.setJavascriptEnabled(true);
            // First load: run the page's JavaScript to obtain the cookie.
            driver.get(url);
        }
        driver.setJavascriptEnabled(false);
        // Second load: fetch the page source itself.
        driver.get(url);
        String pageSource = driver.getPageSource();
        // Only flagged after a successful fetch, so a failed attempt retries the JavaScript pass.
        isGetCookie = true;
        return pageSource;
    }

    /** Forces the next {@link #GetContent(String)} call to repeat the JavaScript pass. */
    public static void renewIsGetCookie() {
        isGetCookie = false;
    }

    /** Closes the shared driver. */
    public static void closeDriver() {
        driver.close();
    }

    /**
     * Fetches the home page 100 times, printing each result and the total
     * elapsed time in seconds.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://www.yidaiyilu.gov.cn/";
        long s = System.currentTimeMillis();
        try {
            for (int i = 0; i < 100; i++) {
                String content = GetContent(url);
                System.out.println(content);
            }
            long e = System.currentTimeMillis();
            System.out.println((e - s) / 1000 + "秒");
        } finally {
            // Always reset the flag and release the driver, even if a fetch throws.
            renewIsGetCookie();
            closeDriver();
        }
    }
}
期间利用的网址: