做了几个网页的数据抓取,一开始都是很零散地抓取,感觉重复的地方很多,就简单地封装了一下,包括http和https的解析,参照了一些网上的例子。为方便参考,把所需导入的类也贴出来了。如有不足的地方,望指正,谢谢。
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.security.KeyManagementException;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.SSLContext;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Consts;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.ParseException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContexts;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small HTTP/HTTPS fetching helper built on Apache HttpClient.
 *
 * <p>Two eagerly created singletons are exposed: one backed by a default
 * pooled client ({@link #getHttpClientInstance()}) and one backed by a client
 * whose TLS layer accepts any certificate and any host name
 * ({@link #getHttpsClientInstance()}). Each singleton owns its own client.
 *
 * <p>Automatic redirect handling is disabled on both clients; a 302 response
 * is reported to the caller as the value of its {@code Location} header.
 */
public class HttpUtil {
    /** Socket (read) timeout in milliseconds. */
    private static final int SOCKET_TIMEOUT = 100000;
    /** Connect timeout in milliseconds. */
    private static final int CONNECT_TIMEOUT = 100000;
    /** When true, {@link #REQUEST_CONFIG} is attached to every request. */
    private static final boolean SET_TIMEOUT = true;
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36";
    private static final Log logger = LogFactory.getLog(HttpUtil.class);
    /** Extracts the charset name from a Content-Type style value, e.g. {@code text/html; charset=gbk}. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^;\"'\\s]+)", Pattern.CASE_INSENSITIVE);
    /** Per-request timeouts; a request-level config replaces the client's default config. */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setSocketTimeout(SOCKET_TIMEOUT)
            .setConnectTimeout(CONNECT_TIMEOUT).build();

    // Eager singletons. They are declared after the constants above so that
    // everything the factory methods read is already initialised.
    private static final HttpUtil HTTP_INSTANCE = new HttpUtil(createHttpClient());
    private static final HttpUtil HTTPS_INSTANCE = new HttpUtil(createHttpsClient());

    /** The client used by this instance; never shared between the two singletons. */
    private final CloseableHttpClient httpClient;

    /**
     * Returns the singleton backed by the default pooled client.
     *
     * @return the shared instance
     */
    public static HttpUtil getHttpClientInstance() {
        return HTTP_INSTANCE;
    }

    /**
     * Returns the singleton backed by the client that trusts every TLS certificate.
     *
     * @return the shared instance
     */
    public static HttpUtil getHttpsClientInstance() {
        return HTTPS_INSTANCE;
    }

    private HttpUtil(CloseableHttpClient client) {
        this.httpClient = client;
    }

    /**
     * Builds a pooled client that registers plain HTTP plus an HTTPS socket
     * factory which accepts any certificate chain and any host name.
     *
     * <p>NOTE(review): certificate and host-name verification are disabled on
     * purpose here, which exposes connections to man-in-the-middle attacks.
     * Use this client only for scraping hosts whose certificates cannot be
     * validated, never for sensitive traffic.
     *
     * @return the configured client
     * @throws RuntimeException wrapping any failure to set up the SSL context
     */
    private static CloseableHttpClient createHttpsClient() {
        RegistryBuilder<ConnectionSocketFactory> registryBuilder = RegistryBuilder.<ConnectionSocketFactory>create();
        registryBuilder.register("http", new PlainConnectionSocketFactory());
        try {
            KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
            // Trust every certificate chain presented by the server.
            TrustStrategy anyTrustStrategy = new TrustStrategy() {
                @Override
                public boolean isTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
                    return true;
                }
            };
            SSLContext sslContext = SSLContexts.custom().useTLS().loadTrustMaterial(trustStore, anyTrustStrategy).build();
            LayeredConnectionSocketFactory sslSF = new SSLConnectionSocketFactory(sslContext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            registryBuilder.register("https", sslSF);
        } catch (KeyStoreException e) {
            throw new RuntimeException(e);
        } catch (KeyManagementException e) {
            throw new RuntimeException(e);
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
        Registry<ConnectionSocketFactory> registry = registryBuilder.build();
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager(registry);
        return HttpClientBuilder.create().disableRedirectHandling().setConnectionManager(connManager)
                .setUserAgent(USER_AGENT).build();
    }

    /**
     * Builds a pooled client with the library's default socket factories.
     *
     * @return the configured client
     */
    private static CloseableHttpClient createHttpClient() {
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager();
        // Client-level defaults; they only apply to requests that carry no config of their own.
        RequestConfig requestConfig = RequestConfig.custom().setConnectionRequestTimeout(CONNECT_TIMEOUT)
                .setSocketTimeout(SOCKET_TIMEOUT).build();
        return HttpClientBuilder.create().disableRedirectHandling().setConnectionManager(connManager)
                .setDefaultRequestConfig(requestConfig).setUserAgent(USER_AGENT).build();
    }

    /**
     * Performs a GET request.
     *
     * <p>Failures while executing the request or reading the response are
     * logged and reported as an empty string rather than thrown.
     *
     * @param url     the request URL
     * @param queries query-string parameters appended to {@code url} verbatim
     *                (callers must URL-encode them); may be {@code null}
     * @return the decoded body for a 200 response, the {@code Location} header
     *         value for a 302 response, otherwise an empty string
     * @throws IOException declared for interface compatibility
     */
    public String get(String url, Map<String, String> queries) throws IOException {
        String responseBody = "";
        HttpGet httpGet = new HttpGet(buildUrl(url, queries));
        if (SET_TIMEOUT) {
            httpGet.setConfig(REQUEST_CONFIG);
        }
        CloseableHttpResponse response = null;
        try {
            logger.info("Executing request " + httpGet.getRequestLine());
            response = httpClient.execute(httpGet);
            responseBody = readResponse(response);
        } catch (Exception ex) {
            // Best effort: the caller gets "" and the cause goes to the log.
            logger.error("GET request failed: " + httpGet.getRequestLine(), ex);
        } finally {
            closeQuietly(response);
            httpGet.releaseConnection();
        }
        return responseBody;
    }

    /**
     * Performs a form POST (UTF-8, {@code application/x-www-form-urlencoded}).
     *
     * <p>A failure to execute the request is thrown; a failure while reading
     * the response is logged and reported as an empty string.
     *
     * @param url     the request URL
     * @param queries query-string parameters appended to {@code url} verbatim
     *                (callers must URL-encode them); may be {@code null}
     * @param params  form fields sent in the request body; may be {@code null}
     * @return the decoded body for a 200 response, the {@code Location} header
     *         value for a 302 response, otherwise an empty string
     * @throws IOException if the request cannot be executed
     */
    public String post(String url, Map<String, String> queries, Map<String, String> params) throws IOException {
        String responseBody = "";
        HttpPost httpPost = new HttpPost(buildUrl(url, queries));
        if (SET_TIMEOUT) {
            httpPost.setConfig(REQUEST_CONFIG);
        }
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        if (params != null) {
            for (Map.Entry<String, String> entry : params.entrySet()) {
                nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        httpPost.setEntity(new UrlEncodedFormEntity(nvps, Consts.UTF_8));
        try {
            logger.info("Executing request " + httpPost.getRequestLine());
            CloseableHttpResponse response = httpClient.execute(httpPost);
            try {
                responseBody = readResponse(response);
            } catch (Exception e) {
                // Best effort: the caller gets "" and the cause goes to the log.
                logger.error("POST response handling failed: " + httpPost.getRequestLine(), e);
            } finally {
                closeQuietly(response);
            }
        } finally {
            httpPost.releaseConnection();
        }
        return responseBody;
    }

    /**
     * Appends {@code queries} to {@code url} as a query string. Keys and values
     * are appended as given, without URL-encoding.
     */
    private static String buildUrl(String url, Map<String, String> queries) {
        if (queries == null || queries.isEmpty()) {
            return url;
        }
        StringBuilder sb = new StringBuilder(url);
        // Continue an existing query string instead of starting a second one.
        char separator = url.indexOf('?') < 0 ? '?' : '&';
        for (Map.Entry<String, String> entry : queries.entrySet()) {
            sb.append(separator).append(entry.getKey()).append('=').append(entry.getValue());
            separator = '&';
        }
        return sb.toString();
    }

    /**
     * Converts a response into the string handed back to callers: the decoded
     * body for 200, the {@code Location} header for 302, otherwise "".
     */
    private String readResponse(CloseableHttpResponse response) throws IOException {
        int statusCode = response.getStatusLine().getStatusCode();
        logger.info("status code =" + statusCode);
        HttpEntity entity = response.getEntity();
        // The body is read for every status so the stream is fully consumed.
        byte[] contentBytes = new byte[0];
        if (entity != null) {
            InputStream inputStream = entity.getContent();
            if (inputStream != null) {
                try {
                    contentBytes = IOUtils.toByteArray(inputStream);
                } finally {
                    inputStream.close();
                }
            }
        }
        switch (statusCode) {
            case HttpStatus.SC_OK:
                return entity == null ? "" : getResult(contentBytes, entity);
            case HttpStatus.SC_MOVED_TEMPORARILY:
                Header header = response.getFirstHeader("Location");
                return header == null ? "" : header.getValue();
            default:
                return "";
        }
    }

    /** Closes the response, logging instead of propagating a failure. */
    private static void closeQuietly(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            logger.warn("Failed to close HTTP response", e);
        }
    }

    /**
     * Decodes the raw body bytes into text.
     *
     * <p>The charset is taken from the entity's content type; when that gives
     * none, the body is decoded as UTF-8 and its {@code <meta>} tags are
     * searched for a declaration. If no usable charset is found, UTF-8 is used.
     *
     * @param src    the raw response body
     * @param entity the response entity, used for its content type
     * @return the decoded body
     */
    private String getResult(byte[] src, HttpEntity entity) {
        String responseBody = new String(src, Consts.UTF_8);
        String charsetName = null;
        try {
            Charset declared = ContentType.getOrDefault(entity).getCharset();
            if (declared != null) {
                charsetName = declared.name();
            }
        } catch (ParseException e) {
            logger.warn("Unparseable Content-Type header", e);
        } catch (IllegalArgumentException e) {
            // Illegal or unsupported charset name in the header.
            logger.warn("Unusable charset in Content-Type header", e);
        }
        if (charsetName == null || charsetName.length() == 0) {
            charsetName = detectMetaCharset(responseBody);
        }
        Charset charset = resolveCharset(charsetName);
        if (!Consts.UTF_8.equals(charset)) {
            responseBody = new String(src, charset);
        }
        return responseBody;
    }

    /**
     * Looks for a charset declared in the page's {@code <meta>} tags, either as
     * {@code <meta http-equiv="Content-Type" content="...; charset=X">} or as
     * {@code <meta charset="X">}.
     *
     * @return the declared charset name, or {@code null} when none is found
     */
    private String detectMetaCharset(String html) {
        Document document = Jsoup.parse(html);
        Elements metaElements = document.select("meta");
        for (Element metaElement : metaElements) {
            String name = null;
            if ("content-type".equalsIgnoreCase(metaElement.attr("http-equiv"))) {
                name = getCharSet(metaElement.attr("content"));
            } else if (metaElement.hasAttr("charset")) {
                name = metaElement.attr("charset").trim();
            }
            if (name != null && name.length() > 0) {
                return name;
            }
        }
        return null;
    }

    /** Maps a charset name to a {@link Charset}, falling back to UTF-8 when it is missing or unusable. */
    private static Charset resolveCharset(String name) {
        if (name == null || name.trim().length() == 0) {
            return Consts.UTF_8;
        }
        try {
            return Charset.forName(name.trim());
        } catch (IllegalArgumentException e) {
            // Illegal or unsupported charset name.
            logger.warn("Unusable charset '" + name + "', falling back to UTF-8", e);
            return Consts.UTF_8;
        }
    }

    /**
     * Extracts the charset name from a Content-Type style value.
     *
     * @param content e.g. {@code text/html; charset=gbk}
     * @return the charset name, or {@code null} when {@code content} has none
     */
    private String getCharSet(String content) {
        if (content == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(content);
        return matcher.find() ? matcher.group(1) : null;
    }
}