爬虫简单示例，用httpClient4.2.1实现（转载）

最新推荐文章于 2021-10-26 11:28:30 发布

whxhz80

最新推荐文章于 2021-10-26 11:28:30 发布

阅读量115

点赞数

文章标签：爬虫 java xhtml

本文链接：https://blog.csdn.net/whxhz/article/details/84795644

版权

HttpConnectionManager.java

 
   package spider; 
   
   import java.io.BufferedReader; 
   
   import java.io.ByteArrayOutputStream; 
   
   import java.io.IOException; 
   
   import java.io.InputStream; 
   
   import java.io.InputStreamReader; 
   
   import java.util.ArrayList; 
   
   import java.util.Collection; 
   
   import java.util.Date; 
   
   import java.util.zip.GZIPInputStream; 
   
   import javax.net.ssl.SSLHandshakeException; 
   
   import org.apache.commons.lang.StringUtils; 
   
   import org.apache.http.Header; 
   
   import org.apache.http.HttpEntity; 
   
   import org.apache.http.HttpEntityEnclosingRequest; 
   
   import org.apache.http.HttpHost; 
   
   import org.apache.http.HttpRequest; 
   
   import org.apache.http.HttpResponse; 
   
   import org.apache.http.HttpVersion; 
   
   import org.apache.http.NoHttpResponseException; 
   
   import org.apache.http.ParseException; 
   
   import org.apache.http.StatusLine; 
   
   import org.apache.http.client.ClientProtocolException; 
   
   import org.apache.http.client.HttpRequestRetryHandler; 
   
   import org.apache.http.client.entity.GzipDecompressingEntity; 
   
   import org.apache.http.client.methods.HttpGet; 
   
   import org.apache.http.client.params.ClientPNames; 
   
   import org.apache.http.client.params.CookiePolicy; 
   
   import org.apache.http.conn.params.ConnManagerParams; 
   
   import org.apache.http.conn.params.ConnRoutePNames; 
   
   import org.apache.http.conn.routing.HttpRoute; 
   
   import org.apache.http.conn.scheme.PlainSocketFactory; 
   
   import org.apache.http.conn.scheme.Scheme; 
   
   import org.apache.http.conn.scheme.SchemeRegistry; 
   
   import org.apache.http.conn.ssl.SSLSocketFactory; 
   
   import org.apache.http.entity.ContentType; 
   
   import org.apache.http.impl.client.DefaultHttpClient; 
   
   import org.apache.http.impl.conn.PoolingClientConnectionManager; 
   
   import org.apache.http.message.BasicHeader; 
   
   import org.apache.http.params.BasicHttpParams; 
   
   import org.apache.http.params.CoreConnectionPNames; 
   
   import org.apache.http.params.CoreProtocolPNames; 
   
   import org.apache.http.params.HttpParams; 
   
   import org.apache.http.protocol.ExecutionContext; 
   
   import org.apache.http.protocol.HttpContext; 
   
   import org.apache.http.util.EntityUtils; 
   
   /** 
   
 * http连接、抓取管理类
 *  
   @author 
    lidongyang
 * @createtime Oct 18, 2012 1:55:18 PM
 * 
 * @note 基本测试版
  
   */ 
   
   public  
   class HttpConnectionManager { 
   
   /** 
    
     * 连接池里的最大连接数
      
   */   
   
   public  
   static  
   final  
   int MAX_TOTAL_CONNECTIONS = 100; 
   
   /** 
    
     * 每个路由的默认最大连接数
      
   */   
   
   public  
   static  
   final  
   int MAX_ROUTE_CONNECTIONS = 50; 
   
   /** 
    
     * 连接超时时间
      
   */   
   
   public  
   static  
   final  
   int CONNECT_TIMEOUT = 50000; 
   
   /** 
   
     * 套接字超时时间
      
   */ 
   
   public  
   static  
   final  
   int SOCKET_TIMEOUT = 50000; 
   
   /** 
   
     * 连接池中 连接请求执行被阻塞的超时时间
      
   */ 
   
   public  
   static  
   final  
   long CONN_MANAGER_TIMEOUT = 60000; 
   
   /** 
   
     * http连接相关参数
      
   */ 
   
   private  
   static HttpParams parentParams; 
   
   /** 
   
     * http线程池管理器
      
   */ 
   
   private  
   static PoolingClientConnectionManager cm; 
   
   /** 
   
     * http客户端
      
   */ 
   
   private  
   static DefaultHttpClient httpClient; 
   
   /** 
   
     * 默认目标主机
      
   */ 
   
   private  
   static  
   final HttpHost DEFAULT_TARGETHOST =  
   new HttpHost("http://www.qq.com", 80); 
   
   /** 
   
     * 初始化http连接池，设置参数、http头等等信息
      
   */ 
   
   static { 
   
        SchemeRegistry schemeRegistry =  
   new SchemeRegistry(); 
   
        schemeRegistry.register( 
   
   new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); 
   
        schemeRegistry.register( 
   
   new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); 
   
        cm =  
   new PoolingClientConnectionManager(schemeRegistry); 
   
        cm.setMaxTotal(MAX_TOTAL_CONNECTIONS); 
   
        cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS); 
   
        cm.setMaxPerRoute( 
   new HttpRoute(DEFAULT_TARGETHOST), 20);         
   // 
   设置对目标主机的最大连接数 
   
        parentParams =  
   new BasicHttpParams();  
   
        parentParams.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1); 
   
        parentParams.setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST);     
   // 
   设置默认targetHost 
   
        parentParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); 
   
        parentParams.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT, CONN_MANAGER_TIMEOUT); 
   
        parentParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT); 
   
        parentParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT); 
   
        parentParams.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS,  
   true); 
   
        parentParams.setParameter(ClientPNames.HANDLE_REDIRECTS,  
   true); 
   
   // 
   设置头信息,模拟浏览器 
   
        Collection 
   
 collection =  
   new ArrayList 
   
(); 
   
        collection.add( 
   new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)")); 
   
        collection.add( 
   new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")); 
   
        collection.add( 
   new BasicHeader("Accept-Language", "zh-cn,zh,en-US,en;q=0.5")); 
   
        collection.add( 
   new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7")); 
   
        collection.add( 
   new BasicHeader("Accept-Encoding", "gzip, deflate")); 
   
        parentParams.setParameter(ClientPNames.DEFAULT_HEADERS, collection); 
   
   // 
   请求重试处理 
   
        HttpRequestRetryHandler httpRequestRetryHandler =  
   new HttpRequestRetryHandler() { 
   
   public  
   boolean retryRequest(IOException exception,  
   int executionCount, HttpContext context) { 
   
   if (executionCount >= 5) { 
   
   // 
    如果超过最大重试次数，那么就不要继续了 
   
   return  
   false; 
   
                } 
   
   if (exception  
   instanceof NoHttpResponseException) { 
   
   // 
    如果服务器丢掉了连接，那么就重试 
   
   return  
   true; 
   
                } 
   
   if (exception  
   instanceof SSLHandshakeException) { 
   
   // 
    不要重试SSL握手异常 
   
   return  
   false; 
   
                } 
   
                HttpRequest request = (HttpRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST); 
   
   boolean idempotent = !(request  
   instanceof HttpEntityEnclosingRequest); 
   
   if (idempotent) { 
   
   // 
    如果请求被认为是幂等的，那么就重试 
   
   return  
   true; 
   
                } 
   
   return  
   false; 
   
            } 
   
        }; 
   
        httpClient =  
   new DefaultHttpClient(cm, parentParams); 
   
        httpClient.setHttpRequestRetryHandler(httpRequestRetryHandler); 
   
    } 
   
   /** 
   
     * 抓取页面代码
     *  
   @param 
    url 目标页面的url
     *  
   @return 
    页面代码
      
   */ 
   
   public String getHtml(String url) { 
   
        HttpHost proxyHost =  
   new HttpHost("211.142.236.137", 8080); 
   // 
   代理 
   
        String html = getHtml(url, proxyHost); 
   
   int count = 0; 
   
   while(StringUtils.isEmpty(html)){ 
   
            proxyHost =  
   new HttpHost("211.142.236.137", 80); 
   // 
   更换代理 
   
            html = getHtml(url, proxyHost); 
   
            count++; 
   
   if(count > 3){ 
   
                System.out.println("抓取失败"); 
   
   break; 
   
            } 
   
        } 
   
System.out.println(html.length()); 
   
   return html; 
   
    } 
   
   /** 
   
     * 抓取url所指的页面代码
     *  
   @param 
    url 目标页面的url
     *  
   @return 
    页面代码
      
   */ 
   
   public String getHtml(String url, HttpHost proxyHost) { 
   
        String html = ""; 
   
        HttpGet httpGet =  
   new HttpGet(url); 
   
        httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost); 
   // 
   设置代理 
   
        HttpResponse httpResponse; 
   
        HttpEntity httpEntity; 
   
   try { 
   
            httpResponse = httpClient.execute(httpGet); 
   
            StatusLine statusLine = httpResponse.getStatusLine(); 
   
   int statusCode = statusLine.getStatusCode(); 
   
System.out.println(statusCode); 
   
   if(200 != statusCode) { 
   
   return html; 
   
            } 
   
            httpEntity = httpResponse.getEntity(); 
   
   if(httpEntity !=  
   null){ 
   
                html = readHtmlContentFromEntity(httpEntity); 
   
            } 
   
        }  
   catch (ClientProtocolException e) { 
   
   // 
    TODO Auto-generated catch block 
   
            e.printStackTrace(); 
   
        }  
   catch (IOException e) { 
   
   // 
    TODO Auto-generated catch block 
   
            e.printStackTrace(); 
   
        }  
   finally { 
   
   if(httpGet !=  
   null){ 
   
                httpGet.releaseConnection(); 
   
            } 
   
        } 
   
   return html; 
   
    } 
   
   /** 
   
     * 从response返回的实体中读取页面代码
     *  
   @param 
    httpEntity Http实体
     *  
   @return 
    页面代码
     *  
   @throws 
    ParseException
     *  
   @throws 
    IOException
      
   */ 
   
   private String readHtmlContentFromEntity(HttpEntity httpEntity)  
   throws ParseException, IOException { 
   
        String html = ""; 
   
        Header header = httpEntity.getContentEncoding(); 
   
   if(httpEntity.getContentLength() < 2147483647L){             
   // 
   EntityUtils无法处理ContentLength超过2147483647L的Entity 
   
   if(header !=  
   null && "gzip".equals(header.getValue())){ 
   
                html = EntityUtils.toString( 
   new GzipDecompressingEntity(httpEntity)); 
   
            }  
   else { 
   
                html = EntityUtils.toString(httpEntity); 
   
            } 
   
        }  
   else { 
   
            InputStream in = httpEntity.getContent(); 
   
   if(header !=  
   null && "gzip".equals(header.getValue())){ 
   
                html = unZip(in, ContentType.getOrDefault(httpEntity).getCharset().toString()); 
   
            }  
   else { 
   
                html = readInStreamToString(in, ContentType.getOrDefault(httpEntity).getCharset().toString()); 
   
            } 
   
   if(in !=  
   null){ 
   
                in.close(); 
   
            } 
   
        } 
   
   return html; 
   
    } 
   
   /** 
   
     * 测试代理是否可用（其实和getHtml(String url, HttpHost proxyHost)的代码差不多，为了从功能上区别，暂时这样）
     *  
   @param 
    httpHost 封装了代理的ip地址和端口
     *  
   @param 
    url 用来测试的页面
     *  
   @return 
    true 可用 false 不可用
      
   */ 
   
   public  
   boolean isProxyUsable(HttpHost proxyHost, String url) { 
   
        HttpGet httpGet =  
   new HttpGet(url); 
   
        httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost); 
   
   try { 
   
            HttpResponse httpResponse = httpClient.execute(httpGet); 
   
            StatusLine statusLine = httpResponse.getStatusLine(); 
   
   int statusCode = statusLine.getStatusCode(); 
   
System.out.println(statusCode); 
   
   if(200 != statusCode) { 
   
   return  
   false; 
   
            } 
   
            HttpEntity httpEntity = httpResponse.getEntity(); 
   
   if(httpEntity !=  
   null) { 
   
                String html = readHtmlContentFromEntity(httpEntity); 
   
System.out.println(html.length()); 
   
   if(StringUtils.isEmpty(html)){ 
   
   return  
   false; 
   
                } 
   
            }  
   else { 
   
   return  
   false; 
   
            } 
   
        }  
   catch (ClientProtocolException e) { 
   
   // 
    TODO Auto-generated catch block 
   
            e.printStackTrace(); 
   
   return  
   false; 
   
        }  
   catch (IOException e) { 
   
   // 
    TODO Auto-generated catch block 
   
            e.printStackTrace(); 
   
   return  
   false; 
   
        } 
   
   return  
   true; 
   
    } 
   
   /** 
   
     * 解压服务器返回的gzip流
     *  
   @param 
    in 抓取返回的InputStream流
     *  
   @param 
    charSet 页面内容编码
     *  
   @return 
    页面内容的String格式
     *  
   @throws 
    IOException
      
   */ 
   
   private String unZip(InputStream in, String charSet)  
   throws IOException { 
   
        ByteArrayOutputStream baos =  
   new ByteArrayOutputStream(); 
   
        GZIPInputStream gis =  
   null; 
   
   try { 
   
            gis =  
   new GZIPInputStream(in); 
   
   byte[] _byte =  
   new  
   byte[1024]; 
   
   int len = 0; 
   
   while ((len = gis.read(_byte)) != -1) { 
   
                baos.write(_byte, 0, len); 
   
            } 
   
            String unzipString =  
   new String(baos.toByteArray(), charSet); 
   
   return unzipString; 
   
        }  
   finally { 
   
   if (gis !=  
   null) { 
   
                gis.close(); 
   
            } 
   
   if(baos !=  
   null){ 
   
                baos.close(); 
   
            } 
   
        } 
   
    } 
   
   /** 
   
     * 读取InputStream流
     *  
   @param 
    in InputStream流
     *  
   @return 
    从流中读取的String
     *  
   @throws 
    IOException
      
   */ 
   
   private String readInStreamToString(InputStream in, String charSet)  
   throws IOException { 
   
        StringBuilder str =  
   new StringBuilder(); 
   
        String line; 
   
        BufferedReader bufferedReader =  
   new BufferedReader( 
   new InputStreamReader(in, charSet)); 
   
   while((line = bufferedReader.readLine()) !=  
   null){ 
   
            str.append(line); 
   
            str.append("\n"); 
   
        } 
   
   if(bufferedReader !=  
   null) { 
   
            bufferedReader.close(); 
   
        } 
   
   return str.toString(); 
   
    } 
   
   /** 
   
     * for test
     *  
   @author 
    lidongyang
     * @createtime Oct 18, 2012 2:35:09 PM
      
   */ 
   
   public  
   class Test  
   implements Runnable { 
   
        String url; 
   
   int threadNum; 
   
   public Test() { 
   
        } 
   
   public Test(String url,  
   int threadNum) { 
   
   this.url = url; 
   
   this.threadNum = threadNum; 
   
        } 
   
        @Override 
   
   public  
   void run() { 
   
            getHtml(url); 
   
        } 
   
    } 
   
   /** 
   
     * for test
     *  
   @param 
    args
     *  
   @throws 
    InterruptedException 
      
   */ 
   
   public  
   static  
   void main(String[] args)  
   throws InterruptedException{ 
   
        HttpConnectionManager httpConnectionManager =  
   new HttpConnectionManager(); 
   
        Date start =  
   new Date(); 
   
        httpConnectionManager.getHtml("http://www.qq.com"); 
   
        Date end =  
   new Date(); 
   
        System.out.println((end.getTime() - start.getTime())/1000.0 + " 秒"); 
   
    } 
   
}

GetQqNews.java

 
 package parser;
 
   import org.jsoup.Jsoup; 
   
   import org.jsoup.nodes.Document; 
   
   import org.jsoup.nodes.Element; 
   
   import org.jsoup.select.Elements; 
   
   import spider.HttpConnectionManager; 
   
   /** 
    test
 *  
   @author 
    lidongyang
 * @createtime Oct 23, 2012 11:05:33 AM
  
   */ 
   
   public  
   class GetQqNews { 
   
   public  
   static  
   void main(String[] args){ 
   
        HttpConnectionManager httpConnectionManager =  
   new HttpConnectionManager(); 
   
        String html = httpConnectionManager.getHtml("http://www.qq.com"); 
   
        Document doc = Jsoup.parse(html); 
   
        Elements newsList = doc.select("[class=ft fl]").select("ul").select("li").select("a"); 
   
   for (Element element : newsList) { 
   
            System.out.println(element.attr("href") + "----" + element.text()); 
   
        } 
   
    } 
   
}

whxhz80

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬虫简单示例，用httpClient4.2.1实现（转载）

HttpConnectionManager.java package spider; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamRead...
复制链接

扫一扫