HTTP协议是无连接的,但每次请求本质上仍需要先和服务器建立连接。对于需要从同一个站点抓取大量网页的程序,应该使用连接池;否则每次抓取都要和Web站点建立连接、发送请求、获得响应、释放连接,一方面效率不高,另一方面稍不小心就会遗漏某些资源的释放,导致站点拒绝连接(很多站点会拒绝来自同一个IP的大量连接,以防止DoS攻击)。
package test.ffm83.commons.httpClient;

import java.io.IOException;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.StopWatch;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
/**
*httpClient 的连接池处理。过期连接的是否和空闲的连接池的利用 基于4.x版本
*
* @author范芳铭
*/
public class EasyExecutePool {
public static void main(String[] args) throws Exception {
StopWatchsw = newStopWatch();
sw.start();
runProxyPools(true); //使用代理,不用代理设置为false即可
sw.stop();
System.out.println("runProxyPools used " + sw.getTime()+ " ms.");
}
public static void runProxyPools(boolean isProxy) throws Exception {
CredentialsProvidercredsProvider = newBasicCredentialsProvider();
credsProvider.setCredentials(new AuthScope("192.168.19.9", 80), // 代理服务器信息
new UsernamePasswordCredentials("fanfangming", "abcd_123")); // 代理服务器
PoolingHttpClientConnectionManagercm = newPoolingHttpClientConnectionManager();
cm.setMaxTotal(100);
CloseableHttpClienthttpclient = null;
// 构建一个使用代理访问的线程池
if (isProxy) {
httpclient= HttpClients.custom()
.setDefaultCredentialsProvider(credsProvider)
.setConnectionManager(cm).build();
}else {
httpclient= HttpClients.custom().setConnectionManager(cm).build();
}
try {
// 这一段是代理使用
HttpHosttarget = newHttpHost("www.ctip.com",80, "http");
HttpHostproxy = newHttpHost("192.168.19.9",80);
RequestConfigconfig = RequestConfig.custom().setProxy(proxy)
.build();
HttpGethttpProxyget = newHttpGet("/");
httpProxyget.setConfig(config);
HttpGetrequestNotProxy = newHttpGet("/");
// create an array of URIs to perform GETs on
String[]urisToGet = {
"http://www.ctrip.com/",
"http://hotels.ctrip.com/",
"http://bus.ctrip.com/", };
IdleConnectionEvictorconnEvictor = newIdleConnectionEvictor(cm);
connEvictor.start();
for (int i = 0; i < urisToGet.length; i++) {
StringrequestURI = urisToGet[i];
System.out.println(StringUtils.center(requestURI,50, '-'));
CloseableHttpResponseresponse = null;
if (isProxy) {
httpProxyget= newHttpGet(requestURI);
httpProxyget.setConfig(config);
response= httpclient.execute(target, httpProxyget);
} else {
requestNotProxy= newHttpGet(requestURI);
response= httpclient.execute(requestNotProxy);
}
try {
System.out.println(response.getStatusLine());
EntityUtils.consume(response.getEntity());
}finally {
response.close();
}
}
// 停20秒,等线程完成工作
Thread.sleep(20000);
// 关闭已经执行完毕的线程
connEvictor.shutdown();
connEvictor.join();//Waits for this thread to die
}finally {
httpclient.close();
}
}
public static class IdleConnectionEvictor extends Thread {
private finalHttpClientConnectionManager connMgr;
private volatile boolean shutdown;// 多线程共享状态
publicIdleConnectionEvictor(HttpClientConnectionManager connMgr) {
super();
this.connMgr= connMgr;
}
@Override
public voidrun() {
try {
while (!shutdown){
synchronized (this) {
wait(5000);
// 关闭过期的连接
connMgr.closeExpiredConnections();
// 五秒之后,关闭空闲连接
connMgr.closeIdleConnections(5, TimeUnit.SECONDS);
}
}
}catch(InterruptedException ex) {
ex.printStackTrace();
}
}
public voidshutdown() {
shutdown = true;
synchronized (this) {
notifyAll();
}
}
}
}
运行如下:
--------------http://www.ctrip.com/---------------
HTTP/1.1 200 OK
-------------http://hotels.ctrip.com/-------------
HTTP/1.1 200 OK
--------------http://bus.ctrip.com/---------------
HTTP/1.1 200 OK
runProxyPools used 21117 ms.