Java爬虫（4）——HttpGet vs HttpPost

最新推荐文章于 2023-07-31 15:43:37 发布

乔的天然呆

最新推荐文章于 2023-07-31 15:43:37 发布

阅读量808

点赞数

分类专栏： java 文章标签： java 爬虫

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/EQ___/article/details/53151103

版权

java 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

这期继续深入讲获取“加载更多”内容。

1. HttpClient用HttpGet和HttpPost这两个类来分别处理get和post请求。

无论是使用HttpGet，还是使用HttpPost，都必须通过如下3步来访问HTTP资源。

1.创建HttpGet或HttpPost对象，将要请求的URL通过构造方法传入HttpGet或HttpPost对象。

2.使用HttpClient的execute方法发送HTTP GET或HTTP POST请求，并返回HttpResponse对象（DefaultHttpClient自HttpClient 4.3起已废弃，下文代码使用HttpClientBuilder.create().build()创建的CloseableHttpClient）。

3.通过HttpResponse接口的getEntity方法返回响应信息，并进行相应的处理。

如果使用HttpPost方法提交HTTP POST请求，则需要使用HttpPost类的setEntity方法设置请求参数。参数先用List<NameValuePair>存储，再封装成UrlEncodedFormEntity传给setEntity。

2.chrome控制台查看请求

以下是两个不同的网站点击“加载更多”按钮出现的请求，分别是get方式和post方式。

（1）

（2）

观察黄线画出的内容，里面包含我们所需要的超链接，因此我们可以从响应数据中获得我们所需的内容。由于其中包含转义字符，用Gson()来处理比较方便，继而再用jsoup解析数据，获得需要的超链接。

代码如下：

package edu.nju.opsource.nhandan;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.google.gson.Gson;

public classCrawler {

public static final int SOCKETTIMEOUT =30000;

public static final int CONNECTTIMEOUT =30000;

privateString encode="utf-8";

privateString url;

privateCloseableHttpClient client = HttpClientBuilder.create().build();

private HttpGet httpGet;

private HttpPost httpPost;

privateString pageContent = null;

privateDocument doc = null;

publicString getPageContent() {

return pageContent;

}

public voidsetPageContent(String pageContent) {

this.pageContent = pageContent;

}

publicDocument getDoc() {

return doc;

}

public voidsetDoc(Document doc) {

this.doc = doc;

}

publicCrawler(String url) {

super();

this.url = url;

}

public HttpGet getNormalHttpGet(){

HttpGet httpGet = newHttpGet(this.url);

httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;");

httpGet.addHeader("Accept-Language", "zh-cn");

httpGet.addHeader("User-Agent", "Mozilla/5.0(Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.3) Gecko/2008092417Firefox/3.0.3");

RequestConfig requestConfig =RequestConfig.custom().setSocketTimeout(SOCKETTIMEOUT).setConnectTimeout(CONNECTTIMEOUT).build();

httpGet.setConfig(requestConfig);

return httpGet;

}

public HttpPost getNormalHttpPost(){

HttpPost httpPost = newHttpPost(this.url);//创建HttpPost对象

List <NameValuePair> params = newArrayList<NameValuePair>();

params.add(newBasicNameValuePair("cate_id", "1003894"));

params.add(newBasicNameValuePair("page", "2"));

try {

httpPost.setEntity(newUrlEncodedFormEntity(params,HTTP.~~UTF_8~~));

} catch(UnsupportedEncodingException e) {

// TODOAuto-generated catch block

e.printStackTrace();

}

return httpPost;

}

privateString convertToThisCharset(String rst){

try{

String tmp = new String( rst.getBytes(this.encode), "utf-8");

return tmp;

}catch (Exception e){

}

return null;

}

publicString runByGet(){

HttpResponse response = null;

httpGet=getNormalHttpGet();

try {

response = client.execute(httpGet);

if (response.getStatusLine().getStatusCode()== HttpStatus.SC_OK) {

HttpEntity entity = response.getEntity();

String respContent =EntityUtils.toString(entity, this.encode).trim();

if( !this.encode.equals("UTF-8") )

respContent =convertToThisCharset( respContent );

return respContent;

}

} catch (ClientProtocolException e) {

e.printStackTrace();

} catch (IOException e){

e.printStackTrace();

} finally{

}

return null;

}

publicString runByPost(){

HttpResponse response = null;

httpPost=getNormalHttpPost();

try {

response = client.execute(httpPost);

if (response.getStatusLine().getStatusCode()== HttpStatus.SC_OK) {

HttpEntity entity = response.getEntity();

String respContent =EntityUtils.toString(entity, this.encode).trim();

if( !this.encode.equals("UTF-8") )

respContent =convertToThisCharset( respContent );

return respContent;

}

} catch (ClientProtocolException e) {

e.printStackTrace();

} catch (IOException e){

e.printStackTrace();

} finally{

}

return null;

}

public voidget(String url)//赋值给this.doc

{

this.pageContent =runByGet();

if (this.pageContent != null){

this.doc =Jsoup.parse(this.pageContent);

}

else{

System.out.println(" ... crawled failed.");

}

}

public static voidtestHttpPost(){

String url="http://e.vnexpress.net/category/loadmorenews";

Crawler c=newCrawler(url);

c.setPageContent(c.runByPost()) ;

String resData=c.getPageContent();

System.out.println(resData);

Gson gson=newGson();

InnerResponseByHttpPost res=gson.fromJson(resData,InnerResponseByHttpPost.class);

System.out.println(res);

Document resDoc=Jsoup.parse(res.getMessage());

System.out.println(resDoc);

Elements hrefs = resDoc.select("h4.title_news_sitea");

for(Elemente : hrefs){

String href=e.attr("abs:href");

System.out.println(href);

}

}

public static voidTestHttpGet(){

String url="http://thestandard.com.ph/api/category/json?page=3&category=1&column=0&totItems=40500&currentItems=16";

Crawler c=newCrawler(url);

c.get(url);

String resData=c.getPageContent();

Gson gson=newGson();

InnerResponseByHttpGet res=gson.fromJson(resData,InnerResponseByHttpGet.class);

System.out.println(res);

Document resDoc=Jsoup.parse(res.getData());

System.out.println(resDoc);

Elements hrefs = resDoc.select("div.img-container-masonrya");

for(Elemente : hrefs){

String href=e.attr("abs:href");

System.out.println(href);

}

}

public static voidmain(String []args){

testHttpPost();

}

//内部类，对应response数据，用于Gson解析。

public classInnerResponseByHttpGet{

privateString type;

private boolean lastpage;

privateString data;

publicString getData() {

return data;

}

public voidsetData(String data) {

this.data = data;

}

@Override

publicString toString() {

return "InnerResponse[type=" + type + ",lastpage=" + lastpage + ",data=" + data + "]";

}

}

public classInnerResponseByHttpPost{

private int error;

privateString message;

private boolean end;

publicString getMessage() {

return message;

}

public voidsetMessage(String message) {

this.message = message;

}

}

}

testHttpPost()运行结果如下：

还是上期使用的那个url,上期的拼接可以获得13个url，这样访问可以获得和点击按钮一样的15个。

testHttpGet()同样正常获取：

乔的天然呆

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。