HttpClient抓取网页内容简单介绍

最新推荐文章于 2018-08-12 21:47:21 发布

IT独白者

最新推荐文章于 2018-08-12 21:47:21 发布

阅读量662

点赞数

分类专栏： java网络爬虫

java网络爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

下面先介绍 HttpClient 3.1 版本的用法，然后再介绍 HttpClient 4 版本。

1、GET方式

第一步、创建一个客户端，类似于你用浏览器打开一个网页

HttpClient httpClient = new HttpClient();

第二步、创建一个GET方法，并传入你需要抓取的网页URL

GetMethod getMethod = new GetMethod("http://www.baidu.com");

第三步、执行请求并获得响应状态码，200表示请求成功

int statusCode = httpClient.executeMethod(getMethod);

第四步、获取网页的源码

byte[] responseBody = getMethod.getResponseBody();

主要就这四步，当然还有其他很多东西，比如网页编码的问题

[java]view plaincopy 
   
print?
 HttpClient httpClient = new HttpClient();  
        GetMethod getMethod = new GetMethod("http://www.baidu.com/");  
        try {  
            int statusCode = httpClient.executeMethod(getMethod);  
            if (statusCode != HttpStatus.SC_OK) {  
                System.err.println("Method failed: "  
                        + getMethod.getStatusLine());  
            }  
            // 读取内容  
            byte[] responseBody = getMethod.getResponseBody();  
            // 处理内容  
 　　　　　String html = new String(responseBody);  
 　　　　　System.out.println(html);　  
        } catch (Exception e) {  
            System.err.println("页面无法访问");  
        }finally{  
         getMethod.releaseConnection();  
     }  

2、Post方式

[java]view plaincopy 
   
print?
 HttpClient httpClient = new HttpClient();  
        PostMethod postMethod = new PostMethod(UrlPath);  
        postMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,new DefaultHttpMethodRetryHandler());  
        NameValuePair[] postData = new NameValuePair[2];  
        postData[0] = new NameValuePair("username", "xkey");  
        postData[1] = new NameValuePair("userpass", "********");  
        postMethod.setRequestBody(postData);  
        try {  
            int statusCode = httpClient.executeMethod(postMethod);  
            if (statusCode == HttpStatus.SC_OK) {  
                byte[] responseBody = postMethod.getResponseBody();  
                String html = new String(responseBody);  
                System.out.println(html);  
            }  
        } catch (Exception e) {  
            System.err.println("页面无法访问");  
        }finally{  
         postMethod.releaseConnection();  
     }  

这个例子传递了两个Post参数：username为xkey，userpass为********，传递给网址UrlPath

如果需要了解获取gzip网页的信息可以参考http://www.cnblogs.com/modou/articles/1325569.html

另外，如果需要获取非字符（二进制）数据，可以使用下面的方法

[java]view plaincopy 
   
print?
 HttpClient httpClient = new HttpClient();  
        GetMethod getMethod = new GetMethod("http://www.baidu.com");  
        try {  
            InputStream inputStream = getMethod.getResponseBodyAsStream();  
            // 这里处理 inputStream  
        } catch (Exception e) {  
            System.err.println("页面无法访问");  
        }finally{  
         getMethod.releaseConnection();  
     }  

HttpClient4.0（摘录）

[java]view plaincopy 
   
print?
 class HttpClientTest {  
   
 public final static void main(String[] args) throws Exception {  
   
        // 初始化，此处构造函数就与3.1中不同  
        HttpClient httpclient = new DefaultHttpClient();  
   
        HttpHost targetHost = new HttpHost("www.google.cn");  
        HttpGet httpget = new HttpGet("/");  
   
        // 查看默认request头部信息  
        System.out.println("Accept-Charset:" + httpget.getFirstHeader("Accept-Charset"));  
        // 以下这条如果不加会发现无论你设置Accept-Charset为gbk还是utf-8，他都会默认返回gb2312（本例针对google.cn来说）  
        httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.2)");  
        // 用逗号分隔显示可以同时接受多种编码  
        httpget.setHeader("Accept-Language", "zh-cn,zh;q=0.5");  
        httpget.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");  
        // 验证头部信息设置生效  
        System.out.println("Accept-Charset:" + httpget.getFirstHeader("Accept-Charset").getValue());  
   
        // Execute HTTP request  
        System.out.println("executing request " + httpget.getURI());  
         
       HttpResponse response = httpclient.execute(targetHost, httpget);  
        //HttpResponse response = httpclient.execute(httpget);  
   
        System.out.println("----------------------------------------");  
        System.out.println("Location: " + response.getLastHeader("Location"));  
        System.out.println(response.getStatusLine().getStatusCode());  
        System.out.println(response.getLastHeader("Content-Type"));  
        System.out.println(response.getLastHeader("Content-Length"));  
         
        System.out.println("----------------------------------------");  
   
        // 判断页面返回状态判断是否进行转向抓取新链接  
        int statusCode = response.getStatusLine().getStatusCode();  
        if ((statusCode == HttpStatus.SC_MOVED_PERMANENTLY) ||  
             (statusCode == HttpStatus.SC_MOVED_TEMPORARILY) ||  
             (statusCode == HttpStatus.SC_SEE_OTHER) ||  
             (statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)) {  
          // 此处重定向处理   此处还未验证  
          String newUri = response.getLastHeader("Location").getValue();  
          httpclient = new DefaultHttpClient();  
          httpget = new HttpGet(newUri);  
          response = httpclient.execute(httpget);  
        }  
   
        // Get hold of the response entity  
        HttpEntity entity = response.getEntity();  
         
        // 查看所有返回头部信息  
        Header headers[] = response.getAllHeaders();  
        int ii = 0;  
        while (ii < headers.length) {  
          System.out.println(headers[ii].getName() + ": " + headers[ii].getValue());  
          ++ii;  
        }  
         
        // If the response does not enclose an entity, there is no need  
        // to bother about connection release  
        if (entity != null) {  
          // 将源码流保存在一个byte数组当中，因为可能需要两次用到该流，  
           byte[] bytes = EntityUtils.toByteArray(entity);  
          String charSet = "";  
            
          // 如果头部Content-Type中包含了编码信息，那么我们可以直接在此处获取  
           charSet = EntityUtils.getContentCharSet(entity);  
   
          System.out.println("In header: " + charSet);  
          // 如果头部中没有，那么我们需要 查看页面源码，这个方法虽然不能说完全正确，因为有些粗糙的网页编码者没有在页面中写头部编码信息  
          if (charSet == "") {  
             regEx="(?=<meta).*?(?<=charset=[\\'|\\\"]?)([[a-z]|[A-Z]|[0-9]|-]*)";  
             p=Pattern.compile(regEx, Pattern.CASE_INSENSITIVE);  
             m=p.matcher(new String(bytes));   // 默认编码转成字符串，因为我们的匹配中无中文，所以串中可能的乱码对我们没有影响  
             result=m.find();  
             if (m.groupCount() == 1) {  
                    charSet = m.group(1);  
             } else {  
                    charSet = "";  
             }  
          }  
          System.out.println("Last get: " + charSet);  
          // 至此，我们可以将原byte数组按照正常编码专成字符串输出（如果找到了编码的话）  
          System.out.println("Encoding string is: " + new String(bytes, charSet));  
        }  
   
        httpclient.getConnectionManager().shutdown();         
 }  
   
 }