现象
这两天在用Java写一个爬虫,需要使用代理IP访问目标网页。可是HttpClient在执行execute()方法时,有时会一直阻塞,设置的超时参数不起作用,程序就一直停在execute()方法上无法继续运行。可以确定的是,所有的超时参数都已经设置。
部分代码
解决方案
查了半天也不确定是我的写法有问题,还是HttpClient 4.5本身有问题,所以找了一个使用定时器的办法:每次调用execute()之前启动一个定时器,超时时间设置为SocketTimeout+ConnectTimeout。
public class IPHttp {
private static ReleaseIdleConnTask timeTask = null;
private static CloseableHttpClient client = null;
private static int Timeout = 5000;
private static HttpRequestRetryHandler retryHandler = new HttpRequestRetryHandler() {
@Override
public boolean retryRequest(IOException exception, int executionCount, HttpContext context){
return false;
}
};
public static String getHtml(String url, String ip, String port) {
String entity = null;
if (StringUtil.isNull(url) || url.length() < 5) {
return entity;
}
if (isHttps(url)) {
client = createHTTPSClinet();
} else {
client = HttpClients.custom().setRetryHandler(retryHandler).build();
}
if (client == null) {
return entity;
}
RequestConfig config = RequestConfig.custom().setConnectTimeout(Timeout).setSocketTimeout(Timeout).setConnectionRequestTimeout(Timeout).build();
// 设置代理访问和超时处理
if (!StringUtil.isNull(ip) && !StringUtil.isNull(port)) {
HttpHost proxy = new HttpHost(ip, Integer.parseInt(port));
config = RequestConfig.custom().setConnectTimeout(Timeout).setSocketTimeout(Timeout).setConnectionRequestTimeout(Timeout).setProxy(proxy).build();
}
HttpGet httpGet = new HttpGet(url);
httpGet.setConfig(config);
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;" +
"q=0.9,image/webp,*/*;q=0.8");
httpGet.setHeader("Accept-Encoding", "gzip, deflate, sdch");
httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
httpGet.setHeader("Cache-Control", "no-cache");
httpGet.setHeader("Connection", "close");
httpGet.setHeader("Pragma", "no-cache");
httpGet.setHeader("Upgrade-Insecure-Requests", "1");
httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " +
"(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
// 计时器
Timer timer = new Timer();
timeTask = new ReleaseIdleConnTask(client);
timer.schedule(timeTask, Timeout * 2);
try {
//客户端执行httpGet方法,返回响应
CloseableHttpResponse httpResponse = client.execute(httpGet);
//得到服务响应状态码
if (httpResponse.getStatusLine().getStatusCode() == 200) {
entity = EntityUtils.toString(httpResponse.getEntity(), "utf-8");
}
} catch (ClientProtocolException e) {
System.out.println(e.getMessage());
entity = null;
} catch (IOException e) {
System.out.println(e.getMessage());
entity = null;
} finally {
// 清除定时器
timer.cancel();
try {
client.close();
} catch (IOException e) {
System.out.println(e.getMessage());
}
}
return entity;
}
private static boolean isHttps(String url) {
return url.toUpperCase().startsWith("HTTPS");
}
/**
* 创建HTTPS类型的HttpClinet
* @return
*/
private static CloseableHttpClient createHTTPSClinet() {
SSLConnectionSocketFactory sslsf = null;
try {
SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial( null, new TrustStrategy() {
// 信任所有
public boolean isTrusted(X509Certificate[] chain,
String authType) throws CertificateException {
return true;
}
}).build();
sslsf = new SSLConnectionSocketFactory(sslContext);
} catch (GeneralSecurityException e) {
e.printStackTrace();
}
return HttpClients.custom().setSSLSocketFactory(sslsf).setRetryHandler(retryHandler).build();
}
}
/**
 * Timer task that closes the HTTP client it was given when the timer fires.
 *
 * <p>Intended as a watchdog: schedule it before a request and cancel the
 * timer once the request returns.
 */
class ReleaseIdleConnTask extends TimerTask {
    /** Client that is closed when {@link #run()} executes. */
    private final CloseableHttpClient target;

    /**
     * @param client the client to close when the task runs
     */
    public ReleaseIdleConnTask(CloseableHttpClient client) {
        this.target = client;
    }

    /** Logs that the request took too long, then closes the client. */
    @Override
    public void run() {
        System.out.println("HttpClinet Connect Cost Too Much Time.Killing It!");
        try {
            target.close();
        } catch (IOException closeFailure) {
            System.out.println("Kill Failed!" + closeFailure.getMessage());
        }
    }
}
通过增加定时器,如果execute()长时间不返回,定时任务就会关闭HttpClient,从而中断被阻塞的请求。这样就可以保证程序正常运行。