由于所爬取的网站需要验证码，通过浏览器的开发者工具（F12）以及在线 HTTP POST/GET 接口测试工具（http://coolaf.com/）发现：请求时加上相应的请求头（header）信息，即可跳过验证码校验。
而且该网站只接受 POST 请求，提交的参数也只接受 JSON 格式，否则请求失败。
现将通过 POST 方式提交 JSON 参数的方法记录如下：
import java.io.unsupportedencodingexception;
import java.net.uri;
import java.net.urldecoder;
import java.util.arraylist;
import java.util.list;
import org.apache.http.httpentity;
import org.apache.http.httpresponse;
import org.apache.http.client.httpclient;
import org.apache.http.client.config.requestconfig;
import org.apache.http.client.methods.httppost;
import org.apache.http.client.methods.httprequestbase;
import org.apache.http.client.utils.uribuilder;
import org.apache.http.entity.stringentity;
import org.apache.http.impl.client.closeablehttpclient;
import org.apache.http.impl.client.httpclientbuilder;
import org.apache.http.impl.client.httpclients;
import org.apache.http.util.entityutils;
import com.alibaba.fastjson.jsonarray;
import com.alibaba.fastjson.jsonobject;
/**
*
@postjsonparamstest.java
* @version 1.0
* @author zxk
* @date 2018-3-3
*/
public class postjsonparamstest {
// 超时时间
private static final int run_time =10000;
// 爬取初始页数
private string page;
public static void main(string[] args) throws exception {
postjsonparamstest crawl = new postjsonparamstest();
// 请求的url地址
string url ="http://www.gzcredit.gov.cn/service/creditservice.asmx/searchorgwithpage";
// 设置起始访问页码
crawl.setpage("1");
string isstop = "";
// 设置请求
httprequestbase request = null;
request = new httppost(url);
try {
// 设置config
requestconfig requestconfig = requestconfig.custom()
.setsockettimeout(run_time)
.setconnecttimeout(run_time)
.setconnectionrequesttimeout(run_time)
.build();
request.setconfig(requestconfig);
// json 格式的 post 参数
string postparams ="{\"condition\":{\"qymc\":\"%%%%\",\"cydw\":\"\"},\"pageno\":"+crawl.getpage()+",\"pagesize\":100,count:2709846}";
system.out.println(postparams);
httpentity httpentity = new stringentity(postparams);
((httppost) request).setentity(httpentity);
// 添加请求头,可以绕过验证码
request.addheader("accept","application/json, text/javascript, */*");
request.addheader("accept-encoding","gzip, deflate");
request.addheader("accept-language", "zh-cn,zh;q=0.8");
request.addheader("connection", "keep-alive");
request.addheader("host", "www.gzcredit.gov.cn");
request.addheader("content-type", "application/json; charset=utf-8");
uribuilder builder = new uribuilder(url);
uri uri = builder.build();
uri = new uri(urldecoder.decode(uri.tostring(), "utf-8"));
request.seturi(uri);
while(!isstop.equals("停止")||isstop.equals("重跑")){
isstop = crawl.crawllist(request);
if(isstop.equals("爬取")){
crawl.setpage(string.valueof(integer.parseint(crawl.getpage())+1));
}
// if("2713".equals(crawl.getpage())) break;
if("2".equals(crawl.getpage())){
break;
}
}
} catch (numberformatexception e) {
e.printstacktrace();
throw new numberformatexception("数字格式错误");
} catch (unsupportedencodingexception e) {
e.printstacktrace();
throw new unsupportedencodingexception("不支持的编码集");
}
}
/**
* 爬取搜索列表
* @param page
* @return
*/
private string crawllist(httprequestbase request){
int statuscode = 0;
// 下面两种方式都可以用来创建客户端连接,相当于打开了一个浏览器
closeablehttpclient httpclient = httpclients.createdefault();
// httpclient httpclient = httpclientbuilder.create().build();
httpentity httpentity = null;
httpresponse response = null;
try {
try {
response = httpclient.execute(request);
} catch (exception e){
e.printstacktrace();
entityutils.consumequietly(httpentity);
return "重跑";
}
//打印状态
statuscode =response.getstatusline().getstatuscode();
if(statuscode!=200){
entityutils.consumequietly(httpentity);
return "重跑";
}
//实体
httpentity = response.getentity();
string searchliststr = entityutils.tostring(httpentity,"gbk").replaceall("\\\\米", "米");
string alldata = (string) jsonobject.parseobject(searchliststr).get("d");
// 字符串值中间含双引号的替换处理
string s = alldata.replaceall("\\{\"","{'")
.replaceall("\":\"", "':'")
.replaceall("\",\"", "','")
.replaceall("\":", "':")
.replaceall(",\"", ",'")
.replaceall("\"\\}", "'}")
.replaceall("\"", "")
.replaceall("'", "\"")
.replaceall("
", "")
.replaceall("\t", "")
.replaceall("\\\\", "?");
jsonobject jsondata = jsonobject.parseobject(s);
jsonarray jsoncontent = jsondata.getjsonarray("orglist");
searchliststr = null;
alldata = null;
s = null;
if (jsoncontent==null || jsoncontent.size()<1) {
return "重跑";
}
system.out.println(jsoncontent.tojsonstring());
return "爬取";
} catch (exception e) {
e.printstacktrace();
return "重跑";
} finally{
entityutils.consumequietly(httpentity);
}
}
private string getpage() {
return page;
}
private void setpage(string page) {
this.page = page;
}
}
补充知识：Java 利用 HttpClient 发送 POST 请求，将请求数据放到 body 里
我就废话不多说了,大家还是直接看代码吧~
/**
 * Sends a POST request whose payload is carried in the request body and
 * returns the result produced by getresult for the response entity.
 *
 * @param url      request address
 * @param bodydata payload to put into the request body
 * @return the value getresult yields for the response entity
 * @throws exception if building, executing or reading the request fails,
 *                   or if releasing the response or client fails
 * @author wangyj
 * @date 2019年4月20日
 */
public static string dopostbodydata(string url, string bodydata) throws exception {
    string result = "";
    closeablehttpclient httpclient = null;
    closeablehttpresponse response = null;
    try {
        httppost httppost = gethttppost(url, null); // request address
        httppost.setentity(new stringentity(bodydata, encoding));
        httpclient = gethttpclient();
        // Execute the request and read the response entity.
        response = httpclient.execute(httppost);
        httpentity entity = response.getentity();
        result = getresult(entity, encoding);
    } finally {
        // Release in reverse order of acquisition: response first, client last.
        // The nested finally blocks make sure a failure in one step does not
        // skip the remaining clean-up.
        try {
            if (null != response) {
                try {
                    entityutils.consume(response.getentity());
                } finally {
                    response.close();
                }
            }
        } finally {
            if (null != httpclient) {
                httpclient.close();
            }
        }
    }
    return result;
}
以上这篇java 实现通过 post 方式提交json参数操作就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持萬仟网。
如您对本文有疑问或者有任何想说的,请点击进行留言回复,万千网友为您解惑!