最近看了看 Java 爬虫相关资料,于是想自己做一个 Demo 试试,测试爬取 http://www.youdaili.net/Daili/QQ/36817.html 该页面的全部数据,包括分页数据。多的不说,直接上代码。
工具类中使用到代理 IP,获取地址 http://www.xicidaili.com/ 。有的 IP 可能已失效,不能用时多换几个试试。
1、HttpClient4 工具类
package com.dairuijie.Util;
import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.StandardHttpRequestRetryHandler;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
*
* <p>Title: HttpClient4Utils</p>
* <p>Description: HttpClient工具类</p>
* @author dairuijie
* @date 2018年4月1日
*/
/**
 * <p>Title: HttpClient4Utils</p>
 * <p>Description: HttpClient utility class — a pooled client routed through an HTTP proxy,
 * with a background sweeper that evicts expired/idle connections.</p>
 *
 * @author dairuijie
 * @date 2018-04-01
 */
public class HttpClient4Utils {

    /** Shared client: 20 total connections, 20 per route, 5s socket/connect timeout, 3s pool wait. */
    public static HttpClient defaultClient = createHttpClient(20, 20, 5000, 5000, 3000);

    /**
     * Creates a pooled {@link HttpClient}.
     *
     * @param maxTotal                 maximum connections in the pool
     * @param maxPerRoute              maximum connections per route
     * @param socketTimeout            socket read timeout, in milliseconds
     * @param connectTimeout           TCP connect timeout, in milliseconds
     * @param connectionRequestTimeout maximum wait for a connection from the pool, in milliseconds
     * @return a configured client (all requests go through the proxy below)
     */
    public static HttpClient createHttpClient(int maxTotal, int maxPerRoute, int socketTimeout, int connectTimeout,
            int connectionRequestTimeout) {
        RequestConfig defaultRequestConfig = RequestConfig.custom()
                .setSocketTimeout(socketTimeout)
                .setConnectTimeout(connectTimeout)
                // NOTE(review): proxy address is hard-coded and comes from a free proxy list;
                // it may be dead at any time — consider making it a parameter or config value.
                .setProxy(new HttpHost("183.159.90.81", 18118))
                .setConnectionRequestTimeout(connectionRequestTimeout).build();
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(maxTotal);
        cm.setDefaultMaxPerRoute(maxPerRoute);
        // A connection idle for more than 200 ms is re-validated before being reused.
        cm.setValidateAfterInactivity(200);
        CloseableHttpClient httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .setConnectionTimeToLive(30, TimeUnit.SECONDS)
                .setRetryHandler(new StandardHttpRequestRetryHandler(3, true)) // retry up to 3 times on I/O errors
                .setDefaultRequestConfig(defaultRequestConfig).build();
        startMonitorThread(cm);
        return httpClient;
    }

    /**
     * Starts a daemon thread that periodically closes expired connections and
     * connections that have been idle for over 30 seconds.
     *
     * @param cm the connection manager to sweep
     */
    private static void startMonitorThread(final PoolingHttpClientConnectionManager cm) {
        Thread t = new Thread(new Runnable() {
            @Override
            public void run() {
                while (true) {
                    try {
                        cm.closeExpiredConnections();
                        cm.closeIdleConnections(30, TimeUnit.SECONDS);
                        TimeUnit.SECONDS.sleep(10);
                    } catch (InterruptedException e) {
                        // Fix: restore the interrupt status and stop the sweeper instead of
                        // silently swallowing the interrupt and looping forever.
                        Thread.currentThread().interrupt();
                        return;
                    } catch (Exception ignored) {
                        // best-effort cleanup: keep sweeping even if one pass fails
                    }
                }
            }
        });
        t.setDaemon(true);
        t.start();
    }

    /**
     * Sends an HTTP request and returns the response body as a string.
     *
     * <p>Fix: the original built a {@code UrlEncodedFormEntity} but never attached it, and always
     * issued a GET regardless of {@code params}. Now, when {@code params} are supplied they are
     * sent as a form-encoded POST body; with no params a plain GET is issued, preserving the
     * existing crawler caller's behavior ({@code sendPost(url, null)}).</p>
     *
     * @param httpClient the client to execute with
     * @param url        request URL
     * @param params     form parameters; null or empty means a plain GET
     * @param encoding   charset used for the form body and for decoding the response
     * @return the response body, or {@code ""} on failure
     */
    public static String sendPost(HttpClient httpClient, String url, Map<String, String> params, Charset encoding) {
        String resp = "";
        HttpUriRequest request;
        if (params != null && !params.isEmpty()) {
            List<NameValuePair> formParams = new ArrayList<NameValuePair>();
            for (Map.Entry<String, String> entry : params.entrySet()) {
                formParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
            HttpPost post = new HttpPost(url);
            post.setEntity(new UrlEncodedFormEntity(formParams, encoding)); // was built but never attached
            request = post;
        } else {
            request = new HttpGet(url);
        }
        CloseableHttpResponse response = null;
        try {
            response = (CloseableHttpResponse) httpClient.execute(request);
            resp = EntityUtils.toString(response.getEntity(), encoding);
        } catch (Exception e) {
            // TODO: replace with a real logger
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return resp;
    }

    /**
     * Convenience overload: UTF-8 encoding with the shared {@link #defaultClient}.
     *
     * @param url    request URL
     * @param params form parameters; may be null
     * @return the response body, or {@code ""} on failure
     */
    public static String sendPost(String url, Map<String, String> params) {
        Charset encoding = Charset.forName("utf8");
        return sendPost(defaultClient, url, params, encoding);
    }
}
2、通过Http请求获取响应的数据,Jsoup解析数据。
package com.dairuijie.reptile;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.netease.is.nc.sdk.utils.HttpClient4Utils;
/**
* 爬虫测试
*
* <p>
* Title: ReptileDemo
* </p>
*
* <p>
* Description:
* </p>
*
* @author dairuijie
*
* @date 2018年5月7日
*/
/**
 * Crawler demo: fetches a proxy-list page plus all of its "_N" paginated siblings,
 * printing every &lt;p&gt; element whose text contains '@' (the "ip:port@protocol"
 * lines on that site).
 *
 * <p>Title: ReptileDemo</p>
 *
 * @author dairuijie
 * @date 2018-05-07
 */
public class ReptileDemo {

    /** URL without the trailing ".html"; page N (N >= 2) is BASE + "_N.html". */
    private static final String BASE = "http://www.youdaili.net/Daili/QQ/36817";

    public static StringBuilder URL = new StringBuilder(BASE + ".html");
    public static Integer len = URL.length() - 5; // index just before ".html" (kept for compatibility)
    public static Integer count = 1;              // current page number
    public static String reps = null;             // last response body

    public static void main(String[] args) {
        selectInfo();
        // Stop when the site serves its 404 page, or when the request failed
        // (sendPost returns "" on failure — the original spun forever on that).
        while (reps != null && !reps.isEmpty() && !reps.contains("404 Not Found")) {
            count++;
            // Fix: rebuild the URL from scratch every iteration. The original patched a
            // single character in place via replace(len+1, len+2, count.toString()),
            // which corrupts the URL once the page number reaches two digits
            // ("_10.html" became "_110.html" on the next pass).
            URL.setLength(0);
            URL.append(BASE).append('_').append(count).append(".html");
            System.err.println(URL);
            selectInfo();
        }
    }

    /**
     * Fetches the current URL, parses the HTML with Jsoup, and prints every
     * paragraph whose text contains '@'.
     */
    public static void selectInfo() {
        reps = HttpClient4Utils.sendPost(URL.toString(), null);
        Document doc = Jsoup.parse(reps);
        Elements links = doc.select("p");
        for (Element ele : links) {
            String title = ele.text();
            if (title.contains("@")) {
                System.out.println(title);
            }
        }
    }
}
3、结果截图