JAVA使用HttpClient实现爬虫技术

1. pom文件中加入httpClient依赖包

<!-- Maven dependency on Apache HttpComponents HttpClient; the HttpClientUtil class in this article is built on it. -->
<dependency>
   <groupId>org.apache.httpcomponents</groupId>
   <artifactId>httpclient</artifactId>
   <version>4.3.1</version>
</dependency>

2. 创建一个调用httpClient的工具类

public class HttpClientUtil {


    private CloseableHttpClient closeableHttpClient;

    private RequestConfig requestConfig;

    // 最大的连接数
    private int maxTotal = 10;

    // 最大的并发数
    private int defaultMaxPerRoute = 5;

    // 连接超时数
    private int connectTimeOut = 2000;

    // 数据传输的最长时间
    private int socketTimeout = 10000;

    // 在连接之前测试连接可不可用
    private boolean staleConnectionCheckEnabled = true;

    // 从数据池中获取连接的最长时间
    private int connectionRequestTimeOut = 500;



    public HttpClientUtil() {

        createCloseableHttpClient();
        createRequestConfig();
    }

    /**
     *  创建CloseableHttpClient
     */
    private void createCloseableHttpClient() {

        PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
        connectionManager.setMaxTotal(maxTotal);
        connectionManager.setDefaultMaxPerRoute(defaultMaxPerRoute);

        HttpClientBuilder httpClientBuilder = HttpClientBuilder.create();
        httpClientBuilder.setConnectionManager(connectionManager);
        this.closeableHttpClient = httpClientBuilder.build();
    }

    /**
     *  创建requestConfig
     */
    private void createRequestConfig() {

        RequestConfig.Builder custom = RequestConfig.custom();
        custom.setConnectTimeout(connectTimeOut)
                .setSocketTimeout(socketTimeout)
                .setStaleConnectionCheckEnabled(staleConnectionCheckEnabled)
                .setConnectionRequestTimeout(connectionRequestTimeOut);

        this.requestConfig = custom.build();

    }

    /**
     *  get请求不带参数
     * @param url
     * @return
     * @throws Exception
     */
    public String doGet(String url) throws Exception {

        // 先获取地址的请求对象
        HttpGet httpGet = new HttpGet(url);
        // 配置参数
        httpGet.setConfig(requestConfig);
        // 执行请求
        CloseableHttpResponse response = closeableHttpClient.execute(httpGet);

        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(),"UTF-8");
        }
        return null;
    }

    /**
     *  get请求带参数
     * @param url
     * @param map
     * @return
     * @throws Exception
     */
    public String doGet(String url, Map<String, Object> map) throws Exception {

        URIBuilder uriBuilder = new URIBuilder(url);
        if (map != null) {
            Set<Map.Entry<String, Object>> entrySet = map.entrySet();
            for (Map.Entry<String, Object> entry : entrySet) {
                uriBuilder.addParameter(entry.getKey(),entry.getValue().toString());
            }
        }
        return this.doGet(uriBuilder.build().toString());
    }


    /**
     *  带参数的post请求
     * @param url
     * @param map
     * @return
     * @throws Exception
     */
    public String doPost(String url, Map<String, Object> map) throws Exception {

        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(requestConfig);
        if (map != null) {
            List<NameValuePair> pairList = new ArrayList<>();
            Set<Map.Entry<String, Object>> entrySet = map.entrySet();
            for (Map.Entry<String,Object> entry : entrySet) {
                pairList.add(new BasicNameValuePair(entry.getKey(),entry.getValue().toString()));
            }
            UrlEncodedFormEntity entity = new UrlEncodedFormEntity(pairList);
            httpPost.setEntity(entity);
        }

        CloseableHttpResponse response = closeableHttpClient.execute(httpPost);
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(),"UTF-8");
        }

        return null;
    }

    /**
     *  post请求不带参数
     * @param url
     * @return
     * @throws Exception
     */
    public String doPost(String url) throws Exception {

        return this.doPost(url,null);
    }

3. 利用JUnit进行单元测试

@Test
public void test1() {

    String url = "https://blog.csdn.net/javalixy/article/details/76284524";
    HttpClientUtil clientUtil = new HttpClientUtil();
    try {
        String result = clientUtil.doGet(url);
        parseHtml(result);
    } catch (Exception e) {
        e.printStackTrace();
    }

运行该测试后,可以得到返回的HTML页面

4. 使用开源框架Jsoup进行HTML页面的解析

4.1  加入jsoup依赖

<!-- Maven dependency on jsoup, the HTML parser used by the parseHtml example in this article. -->
<dependency>
   <groupId>org.jsoup</groupId>
   <artifactId>jsoup</artifactId>
   <version>1.7.3</version>
</dependency>

4.2 jsoup解析页面

/**
 * Parses an HTML page and prints the number of {@code link[href]},
 * {@code span} and {@code img} elements, followed by one line per element.
 *
 * <p>The page is parsed without a base URI, so the {@code abs:} lookups below
 * presumably only yield a value for URLs that are already absolute in the
 * markup (NOTE(review): confirm against the jsoup version in use).
 *
 * @param result the HTML text to parse; when {@code null} a notice is printed
 *               and nothing is parsed
 */
private void parseHtml(String result) {

    if (result == null) {
        System.out.println("No HTML to parse");
        return;
    }

    Document document = Jsoup.parse(result);
    Elements linkElements = document.select("link[href]");
    Elements textElements = document.select("span");
    Elements imgElements = document.select("img");

    System.out.println(String.format("LinkElements: (%d)", linkElements.size()));
    System.out.println(String.format("TextElements: (%d)", textElements.size()));
    System.out.println(String.format("ImgElements: (%d)", imgElements.size()));

    for (Element link : linkElements) {
        // Label matches the selected element type (<link>, not <a>).
        print(" * link: <%s>  (%s)", link.attr("abs:href"), trim(link.text(), 35));
    }
    for (Element text : textElements) {
        // "class" is not a URL attribute, so it is read without the abs: prefix.
        print("* text: <%s> (%s)", text.attr("class"), trim(text.text(), 35));
    }
    for (Element img : imgElements) {
        // <img> has no text content; its alt attribute is the descriptive text.
        print("* img: <%s> (%s)", img.attr("abs:src"), trim(img.attr("alt"), 35));
    }
}

/**
 * Formats a message and writes it to standard output as a single line.
 *
 * @param str a {@link String#format} pattern
 * @param msg the arguments substituted into the pattern
 */
private void print(String str , Object...msg) {
    final String line = String.format(str, msg);
    System.out.println(line);
}

/**
 * Shortens a string so that it is at most {@code width} characters long.
 *
 * <p>A string that already fits is returned unchanged. A longer string is cut
 * to {@code width - 1} characters and a trailing {@code "."} is appended to
 * mark the truncation, so the result is exactly {@code width} characters.
 *
 * @param str   the text to shorten; {@code null} is treated as empty
 * @param width the maximum length of the result
 * @return the possibly shortened text; an empty string when {@code str} is
 *         {@code null}, or when it is too long and {@code width} is not positive
 */
private static String trim(String str, int width){
    if (str == null) {
        return "";
    }
    if (str.length() <= width) {
        return str;
    }
    if (width <= 0) {
        // No room for even the truncation marker.
        return "";
    }
    // Keep width - 1 characters so the marker fits inside the limit.
    return str.substring(0, width - 1) + ".";
}

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值