这期继续深入讲获取“加载更多”内容。
1. HttpClient用HttpGet和HttpPost这两个类来分别处理get和post请求。
无论是使用HttpGet,还是使用HttpPost,都必须通过如下3步来访问HTTP资源。
1.创建HttpGet或HttpPost对象,将要请求的URL通过构造方法传入HttpGet或HttpPost对象。
2.使用HttpClient对象(本文代码中为通过HttpClientBuilder创建的CloseableHttpClient;在4.3之前的旧版本中为DefaultHttpClient)的execute方法发送HTTP GET或HTTP POST请求,并返回HttpResponse对象。
3.通过HttpResponse接口的getEntity方法返回响应信息,并进行相应的处理。
如果使用HttpPost方法提交HTTP POST请求,则需要使用HttpPost类的setEntity方法设置请求参数。参数需要存放在List<NameValuePair>列表中,再包装成UrlEncodedFormEntity后传入setEntity。
2.chrome控制台查看请求
以下是两个不同的网站点击“加载更多”按钮出现的请求,分别是get方式和post方式。
(1)
(2)
观察黄线画出的内容,里面包含我们所需要的超链接,因此我们可以从响应数据中获得我们所需的内容。由于响应是JSON格式且其中包含转义字符,用Gson来处理比较方便,继而再用jsoup解析其中的HTML片段,获得需要的超链接。
代码如下:
package edu.nju.opsource.nhandan;

import java.io.Closeable;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import com.google.gson.Gson;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public classCrawler {
public static final int SOCKETTIMEOUT =30000;
public static final int CONNECTTIMEOUT =30000;
privateString encode="utf-8";
privateString url;
privateCloseableHttpClient client = HttpClientBuilder.create().build();
private HttpGet httpGet;
private HttpPost httpPost;
privateString pageContent = null;
privateDocument doc = null;
publicString getPageContent() {
return pageContent;
}
public voidsetPageContent(String pageContent) {
this.pageContent = pageContent;
}
publicDocument getDoc() {
return doc;
}
public voidsetDoc(Document doc) {
this.doc = doc;
}
publicCrawler(String url) {
super();
this.url = url;
}
public HttpGet getNormalHttpGet(){
HttpGet httpGet = newHttpGet(this.url);
httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;");
httpGet.addHeader("Accept-Language", "zh-cn");
httpGet.addHeader("User-Agent", "Mozilla/5.0(Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.3) Gecko/2008092417Firefox/3.0.3");
RequestConfig requestConfig =RequestConfig.custom().setSocketTimeout(SOCKETTIMEOUT).setConnectTimeout(CONNECTTIMEOUT).build();
httpGet.setConfig(requestConfig);
return httpGet;
}
public HttpPost getNormalHttpPost(){
HttpPost httpPost = newHttpPost(this.url);//创建HttpPost对象
List <NameValuePair> params = newArrayList<NameValuePair>();
params.add(newBasicNameValuePair("cate_id", "1003894"));
params.add(newBasicNameValuePair("page", "2"));
try {
httpPost.setEntity(newUrlEncodedFormEntity(params,HTTP.UTF_8));
} catch(UnsupportedEncodingException e) {
// TODOAuto-generated catch block
e.printStackTrace();
}
return httpPost;
}
privateString convertToThisCharset(String rst){
try{
String tmp = new String( rst.getBytes(this.encode), "utf-8");
return tmp;
}catch (Exception e){
}
return null;
}
publicString runByGet(){
HttpResponse response = null;
httpGet=getNormalHttpGet();
try {
response = client.execute(httpGet);
if (response.getStatusLine().getStatusCode()== HttpStatus.SC_OK) {
HttpEntity entity = response.getEntity();
String respContent =EntityUtils.toString(entity, this.encode).trim();
if( !this.encode.equals("UTF-8") )
respContent =convertToThisCharset( respContent );
return respContent;
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IOException e){
e.printStackTrace();
} finally{
}
return null;
}
publicString runByPost(){
HttpResponse response = null;
httpPost=getNormalHttpPost();
try {
response = client.execute(httpPost);
if (response.getStatusLine().getStatusCode()== HttpStatus.SC_OK) {
HttpEntity entity = response.getEntity();
String respContent =EntityUtils.toString(entity, this.encode).trim();
if( !this.encode.equals("UTF-8") )
respContent =convertToThisCharset( respContent );
return respContent;
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IOException e){
e.printStackTrace();
} finally{
}
return null;
}
public voidget(String url)//赋值给this.doc
{
this.pageContent =runByGet();
if (this.pageContent != null){
this.doc =Jsoup.parse(this.pageContent);
}
else{
System.out.println(" ... crawled failed.");
}
}
public static voidtestHttpPost(){
String url="http://e.vnexpress.net/category/loadmorenews";
Crawler c=newCrawler(url);
c.setPageContent(c.runByPost()) ;
String resData=c.getPageContent();
System.out.println(resData);
Gson gson=newGson();
InnerResponseByHttpPost res=gson.fromJson(resData,InnerResponseByHttpPost.class);
System.out.println(res);
Document resDoc=Jsoup.parse(res.getMessage());
System.out.println(resDoc);
Elements hrefs = resDoc.select("h4.title_news_sitea");
for(Elemente : hrefs){
String href=e.attr("abs:href");
System.out.println(href);
}
}
public static voidTestHttpGet(){
String url="http://thestandard.com.ph/api/category/json?page=3&category=1&column=0&totItems=40500¤tItems=16";
Crawler c=newCrawler(url);
c.get(url);
String resData=c.getPageContent();
Gson gson=newGson();
InnerResponseByHttpGet res=gson.fromJson(resData,InnerResponseByHttpGet.class);
System.out.println(res);
Document resDoc=Jsoup.parse(res.getData());
System.out.println(resDoc);
Elements hrefs = resDoc.select("div.img-container-masonrya");
for(Elemente : hrefs){
String href=e.attr("abs:href");
System.out.println(href);
}
}
public static voidmain(String []args){
testHttpPost();
}
//内部类,对应response数据,用于Gson解析。
public classInnerResponseByHttpGet{
privateString type;
private boolean lastpage;
privateString data;
publicString getData() {
return data;
}
public voidsetData(String data) {
this.data = data;
}
@Override
publicString toString() {
return "InnerResponse[type=" + type + ",lastpage=" + lastpage + ",data=" + data + "]";
}
}
public classInnerResponseByHttpPost{
private int error;
privateString message;
private boolean end;
publicString getMessage() {
return message;
}
public voidsetMessage(String message) {
this.message = message;
}
}
}
testHttpPost()运行结果如下:
还是上期使用的那个url,上期的拼接可以获得13个url,这样访问可以获得和点击按钮一样的15个。
TestHttpGet()同样能正常获取: