Java Crawler (4): HttpGet vs HttpPost


This installment continues the deep dive into fetching "load more" content.

1. HttpClient handles GET and POST requests with two classes: HttpGet and HttpPost.

Whichever one you use, accessing an HTTP resource takes the same three steps:

          1. Create an HttpGet or HttpPost object, passing the target URL into its constructor.

          2. Send the request with the execute method of an HttpClient instance (the code below builds a CloseableHttpClient via HttpClientBuilder; the older DefaultHttpClient works the same way). execute returns an HttpResponse object.

          3. Read the response body through HttpResponse's getEntity method and process it as needed.

          When submitting an HTTP POST request with HttpPost, you also need to set the request parameters via HttpPost's setEntity method; the parameters are stored as NameValuePair objects (a List<NameValuePair> in the code below). A minimal sketch of these steps follows.
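Here is a minimal, standalone sketch of the three steps (this is not the crawler below; the URL and form field names are placeholders):

import java.util.Arrays;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class ThreeStepDemo {
    public static void main(String[] args) throws Exception {
        CloseableHttpClient client = HttpClientBuilder.create().build();

        // Step 1: create the request object with the target URL
        HttpGet get = new HttpGet("http://example.com/list?page=2");

        // Step 2: execute the request and obtain the response
        CloseableHttpResponse response = client.execute(get);

        // Step 3: read the response body through getEntity
        System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));

        // For POST, the form parameters are set on the entity before executing
        HttpPost post = new HttpPost("http://example.com/loadmore");
        post.setEntity(new UrlEncodedFormEntity(
                Arrays.asList(new BasicNameValuePair("page", "2")), "utf-8"));
        response = client.execute(post);
        System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));

        client.close();
    }
}

Note that GET and POST differ only in step 1 plus the extra setEntity call, which is exactly how the Crawler class below is organized.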

 

2. Inspecting the requests in the Chrome console

 

Below are the requests that appear when you click the "load more" button on two different sites; one is sent with GET and the other with POST.

 

(1) The GET request, as captured in the Chrome Network panel (screenshot).

 


(2) The POST request, as captured in the Chrome Network panel (screenshot).
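In short, the two requests used in the code further down are:

GET   http://thestandard.com.ph/api/category/json?page=3&category=1&column=0&totItems=40500&currentItems=16

POST  http://e.vnexpress.net/category/loadmorenews
      Form Data: cate_id=1003894, page=2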

 

 


Look at the parts underlined in yellow in the screenshots: they contain the hyperlinks we are after, so we can extract what we need directly from the response data. Because the response contains escape characters, it is convenient to handle it with Gson first, and then parse the resulting HTML with jsoup to pull out the links.
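To illustrate that Gson-then-jsoup flow in isolation, here is a minimal sketch. The JSON string and the example.com link are made up for the demo; the field names (error, message, end) and the selector mirror the POST case handled in the full code below.

import com.google.gson.Gson;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class GsonJsoupDemo {
    // Mirrors the InnerResponseByHttpPost class defined in the crawler below.
    static class LoadMoreResponse {
        int error;
        String message;   // escaped HTML fragment
        boolean end;
    }

    public static void main(String[] args) {
        // Hypothetical response body; the real one contains many more items.
        String json = "{\"error\":0,\"end\":false,"
                + "\"message\":\"<h4 class=\\\"title_news_site\\\">"
                + "<a href=\\\"http://example.com/news/1.html\\\">A headline</a></h4>\"}";

        // Gson undoes the JSON escaping and hands us plain HTML in `message`.
        LoadMoreResponse res = new Gson().fromJson(json, LoadMoreResponse.class);

        // jsoup then parses that HTML fragment so we can select the links.
        Document doc = Jsoup.parse(res.message);
        for (Element a : doc.select("h4.title_news_site a")) {
            System.out.println(a.attr("href"));
        }
    }
}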

 

The full code is as follows:

 

package edu.nju.opsource.nhandan;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.google.gson.Gson;

 

 

 

public class Crawler {

    public static final int SOCKETTIMEOUT = 30000;
    public static final int CONNECTTIMEOUT = 30000;

    private String encode = "utf-8";    // expected charset of the response
    private String url;
    private CloseableHttpClient client = HttpClientBuilder.create().build();
    private HttpGet httpGet;
    private HttpPost httpPost;
    private String pageContent = null;  // raw response body
    private Document doc = null;        // DOM parsed from pageContent

    public String getPageContent() {
        return pageContent;
    }

    public void setPageContent(String pageContent) {
        this.pageContent = pageContent;
    }

    public Document getDoc() {
        return doc;
    }

    public void setDoc(Document doc) {
        this.doc = doc;
    }

    public Crawler(String url) {
        super();
        this.url = url;
    }

    // Build a GET request for this.url with common headers and timeouts.
    public HttpGet getNormalHttpGet() {
        HttpGet httpGet = new HttpGet(this.url);
        httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;");
        httpGet.addHeader("Accept-Language", "zh-cn");
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(SOCKETTIMEOUT)
                .setConnectTimeout(CONNECTTIMEOUT)
                .build();
        httpGet.setConfig(requestConfig);
        return httpGet;
    }

    // Build a POST request for this.url; the form parameters go in via setEntity.
    public HttpPost getNormalHttpPost() {
        HttpPost httpPost = new HttpPost(this.url);   // create the HttpPost object

        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("cate_id", "1003894"));
        params.add(new BasicNameValuePair("page", "2"));

        try {
            httpPost.setEntity(new UrlEncodedFormEntity(params, HTTP.UTF_8));
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return httpPost;
    }

    // Re-encode the response text when the page charset is not UTF-8.
    private String convertToThisCharset(String rst) {
        try {
            return new String(rst.getBytes(this.encode), "utf-8");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // Send the GET request and return the response body, or null on failure.
    public String runByGet() {
        HttpResponse response = null;
        httpGet = getNormalHttpGet();
        try {
            response = client.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                HttpEntity entity = response.getEntity();
                String respContent = EntityUtils.toString(entity, this.encode).trim();
                if (!this.encode.equalsIgnoreCase("UTF-8"))
                    respContent = convertToThisCharset(respContent);
                return respContent;
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    // Send the POST request and return the response body, or null on failure.
    public String runByPost() {
        HttpResponse response = null;
        httpPost = getNormalHttpPost();
        try {
            response = client.execute(httpPost);
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                HttpEntity entity = response.getEntity();
                String respContent = EntityUtils.toString(entity, this.encode).trim();
                if (!this.encode.equalsIgnoreCase("UTF-8"))
                    respContent = convertToThisCharset(respContent);
                return respContent;
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    // Fetch this.url via GET and parse the result into this.doc.
    public void get(String url) {
        this.pageContent = runByGet();
        if (this.pageContent != null) {
            this.doc = Jsoup.parse(this.pageContent);
        } else {
            System.out.println(" ... crawled failed.");
        }
    }

     

    public static void testHttpPost() {
        String url = "http://e.vnexpress.net/category/loadmorenews";

        Crawler c = new Crawler(url);
        c.setPageContent(c.runByPost());
        String resData = c.getPageContent();
        System.out.println(resData);

        // The response is JSON with escaped HTML inside; Gson unwraps it.
        Gson gson = new Gson();
        InnerResponseByHttpPost res = gson.fromJson(resData, InnerResponseByHttpPost.class);
        System.out.println(res);

        // The "message" field holds the HTML fragment; give jsoup a base URI
        // so that abs:href can resolve relative links.
        Document resDoc = Jsoup.parse(res.getMessage(), "http://e.vnexpress.net/");
        System.out.println(resDoc);

        Elements hrefs = resDoc.select("h4.title_news_site a");
        for (Element e : hrefs) {
            String href = e.attr("abs:href");
            System.out.println(href);
        }
    }

    public static void testHttpGet() {
        String url = "http://thestandard.com.ph/api/category/json?page=3&category=1&column=0&totItems=40500&currentItems=16";

        Crawler c = new Crawler(url);
        c.get(url);
        String resData = c.getPageContent();

        Gson gson = new Gson();
        InnerResponseByHttpGet res = gson.fromJson(resData, InnerResponseByHttpGet.class);
        System.out.println(res);

        // Here the HTML fragment is in the "data" field.
        Document resDoc = Jsoup.parse(res.getData(), "http://thestandard.com.ph/");
        System.out.println(resDoc);

        Elements hrefs = resDoc.select("div.img-container-masonry a");
        for (Element e : hrefs) {
            String href = e.attr("abs:href");
            System.out.println(href);
        }
    }

    public static void main(String[] args) {
        testHttpPost();
    }

    // Inner classes mirroring the two JSON responses, for Gson to deserialize into.
    // Declared static so Gson can instantiate them without an enclosing Crawler.
    public static class InnerResponseByHttpGet {
        private String type;
        private boolean lastpage;
        private String data;       // escaped HTML fragment with the new items

        public String getData() {
            return data;
        }

        public void setData(String data) {
            this.data = data;
        }

        @Override
        public String toString() {
            return "InnerResponse[type=" + type + ",lastpage=" + lastpage + ",data=" + data + "]";
        }
    }

    public static class InnerResponseByHttpPost {
        private int error;
        private String message;    // escaped HTML fragment with the new items
        private boolean end;

        public String getMessage() {
            return message;
        }

        public void setMessage(String message) {
            this.message = message;
        }
    }
}

The output of testHttpPost() looks like this:

 

This is the same URL we used in the previous installment. Splicing the URL there only recovered 13 links, while sending the request this way returns the same 15 links you get by clicking the button.

 

testHttpGet() fetches its data just as well:

 

 

                                

 
