案例链接:http://ggzy.ah.gov.cn/login.do;jsessionid=98f09afb1805eeeb792d6c613213?method=beginlogin
如上图,可以看出该网站响应的数据为 DWR 格式的数据。
将爬取的步骤分成两步:
第一步:获取响应的DWR数据
/**
 * Step 1: sends a DWR "plaincall" POST request for the bulletin list and hands the
 * response body to DwrToJsonByJs (step 2).
 *
 * NOTE(review): paras, cookie, tweb, proxy and getListParseConnection are not declared
 * anywhere in this snippet; presumably they are members of the enclosing class - confirm.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
// Request body in DWR plaincall form: a single call (callCount=1) to
// bulletinInfoDWR.getPackListForDwr1 whose one argument (c0-param0) is an object
// assembled from the c0-e1 .. c0-e12 entries below.
paras= "callCount=1\n" + // the parameters change in a regular pattern; only the first page is written here
"page=/bulletininfo.do?method=showList&fileType=1&hySort=&bulletinclass=jy&num=1\n" +
// NOTE(review): both session ids are hard-coded; presumably they were captured from a
// browser session and have to be refreshed for a real run - confirm.
"httpSessionId=21892ef6b566a6ef13de8a1af762\n" +
"scriptSessionId=7AD45C0E3AC509B852649968C03C9FB6687\n" +
"c0-scriptName=bulletinInfoDWR\n" +
"c0-methodName=getPackListForDwr1\n" +
"c0-id=0\n" +
"c0-e1=string:1\n" +
"c0-e2=string:\n" +
"c0-e3=string:jy\n" +
"c0-e4=string:1\n" +
"c0-e5=string:\n" +
"c0-e6=string:\n" +
"c0-e7=string:\n" +
// c0-e8 is referenced as currentPage by c0-param0.
// NOTE(review): the value is 2 although the comment above says "first page" - confirm.
"c0-e8=number:2\n" +
// c0-e9 is referenced as pageSize, c0-e12 as totalRows.
"c0-e9=string:10\n" +
"c0-e10=string:true\n" +
"c0-e11=string:packTable\n" +
"c0-e12=string:71763\n" +
"c0-param0=Object_Object:{id:reference:c0-e1, hySort:reference:c0-e2, bulletinclass:reference:c0-e3, fileType:reference:c0-e4, bulletinType:reference:c0-e5, district:reference:c0-e6, srcdistrict:reference:c0-e7, currentPage:reference:c0-e8, pageSize:reference:c0-e9, isPage:reference:c0-e10, tabId:reference:c0-e11, totalRows:reference:c0-e12}\n" +
"batchId=15\n";
// First request: built directly with jsoup against the DWR endpoint, with
// browser-like headers.
// NOTE(review): the result is stored in `execute` but never read afterwards in this method.
Response execute = Jsoup.connect("http://ggzy.ah.gov.cn/dwr/call/plaincall/bulletinInfoDWR.getPackListForDwr1.dwr")
.header("Host","ggzy.ah.gov.cn")
.header("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0")
.header("Accept","*/*")
.header("Accept-Language","zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2")
.header("Accept-Encoding","gzip, deflate")
.header("Connection","keep-alive")
// String.length() counts chars, not encoded bytes; the two only agree while the body is ASCII.
.header("Content-Length",paras.length()+"")
.header("Content-Type","text/plain")
.header("Cookie",cookie)
.header("Upgrade-Insecure-Requests","1")
.header("X-Requested-With","XMLHttpRequest")
.header("Referer","http://ggzy.ah.gov.cn/bulletininfo.do?method=showList&fileType=1&hySort=&bulletinclass=jy&num=1")
.requestBody(paras)
.followRedirects(false)
.ignoreContentType(true)
// 10 second timeout (value is in milliseconds).
.timeout(1000*10)
.method(Method.POST)
.execute();
// Second request: issued through getListParseConnection, which is not shown here.
// NOTE(review): presumably this helper builds the same connection as above from
// tweb/proxy, making the first request redundant - confirm.
Response response = getListParseConnection( tweb, proxy)
.method(Method.POST)
.execute();
String responseStr=response.body();
// The converted string is assigned back to responseStr and not used further in this method.
responseStr=DwrToJsonByJs(responseStr); // calls the step-two method
}
第二步:将得到的 DWR 数据进行转化。
//处理dwr数据
/**
 * Converts the text of a DWR reply into a JSON array string.
 *
 * <p>The input is normalised first: carriage returns, line feeds, tabs, form feeds,
 * double quotes and spaces are removed, then every literal "..." is removed. If the
 * text contains the marker {@code #DWR-REPLY}, only the part following its first
 * occurrence is kept; everything from the first {@code dwr.engine} onwards is dropped.
 * Bracket access {@code s0['key']} is rewritten to {@code s0.key}, and the remaining
 * text is split into statements on {@code ;}.
 *
 * <p>Statements starting with {@code var} declare objects; statements of the form
 * {@code object.property=value} add a property to an already declared object. Values
 * are kept as raw text (a reference such as {@code s1} stays the string "s1").
 *
 * <p>Known limitations that follow from this text-based approach: a {@code ;} inside a
 * value splits the statement, and spaces/double quotes inside values are lost.
 *
 * @param jsStr raw DWR reply text; must not be {@code null}
 * @return the result of {@code JSON.toJSONString} applied to the collection of property
 *         maps, one per declared object; a declared object for which no property was
 *         parsed contributes a {@code null} element. The order of elements follows
 *         {@code HashMap} iteration order, not declaration order.
 * @throws NullPointerException if {@code jsStr} is {@code null}
 */
public static String DwrToJsonByJs(String jsStr){
    // Normalise: drop whitespace characters and double quotes, then literal "..." runs.
    jsStr = jsStr.replaceAll("[\\r\\n\\t\\f\" ]", "").replace("...", "");

    // Keep only what follows the first reply marker, when there is one.
    String[] replyParts = jsStr.split("#DWR-REPLY");
    if (replyParts.length > 1) {
        jsStr = replyParts[1];
    }

    // Cut off the trailing dwr.engine callback. indexOf/substring is used instead of
    // split(...)[0], which throws when the text consists of the separator alone.
    int callbackStart = jsStr.indexOf("dwr.engine");
    if (callbackStart != -1) {
        jsStr = jsStr.substring(0, callbackStart);
    }

    // s0['key'] -> s0.key, so every property access uses dot notation.
    jsStr = jsStr.replace("['", ".").replace("']", "");
    String[] statements = jsStr.split(";");

    Map<String, Map<String, String>> objects = new HashMap<String, Map<String, String>>();

    // Pass 1: collect declared object names. Spaces were removed above, so a
    // declaration looks like "vars0={}". Only a leading "var" counts; a property value
    // that merely contains "var" (e.g. "s0.title=variety") is not a declaration.
    for (String statement : statements) {
        if (statement.startsWith("var")) {
            String name = statement.substring(3)
                    .replace("=", "")
                    .replace("{", "")
                    .replace("}", "");
            objects.put(name, null);
        }
    }

    // Pass 2: attach "object.property=value" assignments to the declared objects.
    for (String statement : statements) {
        if (statement.startsWith("var")) {
            continue;
        }
        int dot = statement.indexOf('.');
        if (dot < 0) {
            continue; // not a property assignment (e.g. "s2[0]=s0" or an empty statement)
        }
        String objKey = statement.substring(0, dot);
        if (!objects.containsKey(objKey)) {
            continue; // assignment to something that was never declared
        }
        // Split on the FIRST '=' only, so values containing '=' (query strings) survive
        // intact and an empty value ("s0.title=") yields "" instead of throwing.
        String assignment = statement.substring(dot + 1);
        int eq = assignment.indexOf('=');
        if (eq < 0) {
            continue; // no value part at all
        }
        String property = assignment.substring(0, eq);
        String value = assignment.substring(eq + 1);

        Map<String, String> properties = objects.get(objKey);
        if (properties == null) {
            properties = new HashMap<String, String>();
            objects.put(objKey, properties);
        }
        properties.put(property, value);
    }

    Collection<Map<String, String>> rows = objects.values();
    return JSON.toJSONString(rows);
}