HttpClient4+Jsoup实现Java爬虫

最近看了看java爬虫相关资料,于是想自己做一个Demo试试,测试爬取http://www.youdaili.net/Daili/QQ/36817.html该页面的全部数据,包括分页数据。多的不说,直接上代码。

工具类中使用到代理IP,获取地址http://www.xicidaili.com/ 有的ip可能不能用,多试几个。

1、HttpClient4 工具类

package com.dairuijie.Util;
import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.StandardHttpRequestRetryHandler;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * 

* <p>Title: HttpClient4Utils</p>  

* <p>Description: HttpClient工具类</p>  

* @author dairuijie  

* @date 2018年4月1日
 */
/**
 * HttpClient utility: builds a shared, pooled, proxy-routed {@link HttpClient}
 * and exposes simple request helpers for the crawler demo.
 *
 * <p>Title: HttpClient4Utils</p>
 * <p>Description: HttpClient utility class</p>
 *
 * <p>NOTE(review): all traffic is routed through a hard-coded proxy host
 * (see {@code createHttpClient}); replace it with a live proxy before use.</p>
 *
 * @author dairuijie
 * @date 2018-04-01
 */
public class HttpClient4Utils {
    /** Shared client: 20 total connections, 20 per route, 5s socket/connect timeout, 3s pool wait. */
    public static HttpClient defaultClient = createHttpClient(20, 20, 5000, 5000, 3000);

    /**
     * Instantiates a pooled HttpClient routed through a proxy server.
     *
     * @param maxTotal                 maximum connections in the pool
     * @param maxPerRoute              maximum connections per route
     * @param socketTimeout            socket read timeout in milliseconds
     * @param connectTimeout           TCP connect timeout in milliseconds
     * @param connectionRequestTimeout timeout waiting for a pooled connection, in milliseconds
     * @return a configured, pooled client (also starts a daemon cleanup thread)
     */
    public static HttpClient createHttpClient(int maxTotal, int maxPerRoute, int socketTimeout, int connectTimeout,
                                              int connectionRequestTimeout) {
        RequestConfig defaultRequestConfig = RequestConfig.custom()
                .setSocketTimeout(socketTimeout)
                .setConnectTimeout(connectTimeout)
                .setProxy(new HttpHost("183.159.90.81", 18118)) // proxy server IP; may be dead, swap in a live one
                .setConnectionRequestTimeout(connectionRequestTimeout).build();

        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(maxTotal);
        cm.setDefaultMaxPerRoute(maxPerRoute);
        // A connection idle for more than 200ms is validated before being reused.
        cm.setValidateAfterInactivity(200);
        CloseableHttpClient httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .setConnectionTimeToLive(30, TimeUnit.SECONDS)
                .setRetryHandler(new StandardHttpRequestRetryHandler(3, true)) // retry failed requests up to 3 times
                .setDefaultRequestConfig(defaultRequestConfig).build();

        startMonitorThread(cm);

        return httpClient;
    }

    /**
     * Starts a daemon thread that periodically evicts expired and idle
     * (&gt; 30s) connections from the pool, once every 10 seconds.
     *
     * @param cm the connection manager to clean up
     */
    private static void startMonitorThread(final PoolingHttpClientConnectionManager cm) {
        Thread t = new Thread(new Runnable() {
            @Override
            public void run() {
                while (!Thread.currentThread().isInterrupted()) {
                    try {
                        cm.closeExpiredConnections();
                        cm.closeIdleConnections(30, TimeUnit.SECONDS);
                        TimeUnit.SECONDS.sleep(10);
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag and let the loop condition stop us
                        // (previously the interrupt was silently swallowed).
                        Thread.currentThread().interrupt();
                    } catch (Exception ignored) {
                        // Best-effort cleanup: keep the monitor alive on transient errors.
                    }
                }
            }
        });
        t.setDaemon(true);
        t.start();
    }

    /**
     * Sends a request to {@code url} and returns the response body.
     *
     * <p>NOTE(review): despite its name, this method issues an HTTP GET — the
     * original POST code was deliberately commented out, and the in-file
     * caller depends on GET semantics, so the name is kept for compatibility.
     * Previously {@code params} were built into a form entity and then
     * silently discarded; they are now URL-encoded onto the query string.</p>
     *
     * @param httpClient client used to execute the request
     * @param url        request URL
     * @param params     optional request parameters (may be {@code null} or empty)
     * @param encoding   charset for parameter encoding and response decoding
     * @return the response body, or the empty string on failure
     */
    public static String sendPost(HttpClient httpClient, String url, Map<String, String> params, Charset encoding) {
        String resp = "";
        String requestUrl = url;
        if (params != null && !params.isEmpty()) {
            List<NameValuePair> queryParams = new ArrayList<NameValuePair>();
            for (Map.Entry<String, String> entry : params.entrySet()) {
                queryParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
            // BUG FIX: actually transmit the parameters instead of dropping them.
            requestUrl = url + (url.contains("?") ? "&" : "?") + URLEncodedUtils.format(queryParams, encoding);
        }
        HttpGet request = new HttpGet(requestUrl);
        CloseableHttpResponse response = null;
        try {
            response = (CloseableHttpResponse) httpClient.execute(request);
            resp = EntityUtils.toString(response.getEntity(), encoding);
        } catch (Exception e) {
            e.printStackTrace(); // TODO: replace with proper logging
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace(); // TODO: replace with proper logging
                }
            }
        }
        return resp;
    }

    /**
     * Convenience overload: uses the shared {@link #defaultClient} and UTF-8.
     *
     * @param url    request URL
     * @param params optional request parameters (may be {@code null})
     * @return the response body, or the empty string on failure
     */
    public static String sendPost(String url, Map<String, String> params) {
        return sendPost(defaultClient, url, params, StandardCharsets.UTF_8);
    }
}

2、通过Http请求获取响应的数据,Jsoup解析数据。

package com.dairuijie.reptile;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.netease.is.nc.sdk.utils.HttpClient4Utils;

/**
 * 爬虫测试
 * 
 * <p>
 * Title: ReptileDemo
 * </p>
 * 
 * <p>
 * Description:
 * </p>
 * 
 * @author dairuijie
 * 
 * @date 2018年5月7日
 */
/**
 * Crawler demo: fetches http://www.youdaili.net/Daili/QQ/36817.html and its
 * paginated siblings (36817_2.html, 36817_3.html, ...), printing every
 * paragraph whose text contains '@', until a page responds "404 Not Found".
 */
public class ReptileDemo {
	// Mutable: page number is rewritten in place for each pagination step.
	public static StringBuilder URL = new StringBuilder(
			"http://www.youdaili.net/Daili/QQ/36817.html");
	// Index of the '.' in the ".html" suffix; insertion point for "_<page>".
	public static Integer len = URL.length() - 5;
	public static Integer count = 1;
	public static String reps = null;

	/**
	 * Crawls page 1, then rewrites the URL to "_2", "_3", ... and crawls
	 * each page until the response contains "404 Not Found".
	 */
	public static void main(String[] args) {
		selectInfo();
		while (reps != null && !reps.contains("404 Not Found")) {
			count++;
			if (count > 2) {
				// BUG FIX: replace the entire old page number (up to ".html"),
				// not a single character — the original one-char replace built
				// corrupt URLs once the page number reached two digits (>= 10).
				URL.replace(len + 1, URL.indexOf(".html"), count.toString());
			} else {
				// First pagination step: turn "36817.html" into "36817_2.html".
				URL.insert(len, "_" + count);
			}
			System.err.println(URL);
			selectInfo();
		}
	}

	/**
	 * Fetches the current URL and prints the text of every &lt;p&gt; element
	 * containing '@' (the proxy-list lines on that site). Stores the raw
	 * response body in {@link #reps} for the pagination loop.
	 */
	public static void selectInfo() {
		reps = HttpClient4Utils.sendPost(URL.toString(), null);
		Document doc = Jsoup.parse(reps);
		Elements paragraphs = doc.select("p");
		for (Element ele : paragraphs) {
			String title = ele.text();
			if (title.contains("@")) {
				System.out.println(title);
			}
		}
	}
}

3、结果截图

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

蜗牛乌龟一起走

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值