利用cpdetector获取文件编码格式,同时得到网页内容。增加http/https通用方式

获取网页编码格式,同时得到网页内容。

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Utility for downloading a web page as text, using the cpdetector library
 * to guess the page's character encoding before decoding it.
 */
public class HtmlContentUtil {
	/** Shared cpdetector proxy; the detectors it delegates to are registered in the static initializer. */
	private static final CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();

	static {
		// Register the concrete detectors the proxy delegates to.
		detector.add(JChardetFacade.getInstance()); // needs antlr.jar and chardet.jar on the classpath
		detector.add(new ParsingDetector(false));
		detector.add(ASCIIDetector.getInstance());
		detector.add(UnicodeDetector.getInstance());
	}

	/**
	 * Downloads the page at {@code url} and returns its text.
	 *
	 * <p>If {@code url} does not start with {@code http://} or {@code https://}
	 * (compared case-insensitively), {@code http://} is prepended. The encoding is
	 * detected with {@link #getFileEncode(URL)}; when detection yields nothing,
	 * {@code utf-8} is used.
	 *
	 * <p>The page is read line by line and the lines are concatenated without
	 * separators, so line terminators are not present in the result.
	 *
	 * @param url page address, with or without a scheme
	 * @return the decoded page text with line terminators removed
	 * @throws Exception if the URL is malformed, the connection fails, or the
	 *                   detected encoding is not supported by the JVM
	 */
	public static String getContent(String url) throws Exception {
		URL indexUrl = new URL(withDefaultScheme(url));
		String fileEncode = getFileEncode(indexUrl);
		if (fileEncode == null) {
			fileEncode = "utf-8";
		}

		HttpURLConnection httpConn = (HttpURLConnection) indexUrl.openConnection();
		try {
			BufferedReader bufReader = new BufferedReader(
					new InputStreamReader(httpConn.getInputStream(), fileEncode));
			try {
				StringBuilder contentBuf = new StringBuilder();
				String line;
				while ((line = bufReader.readLine()) != null) {
					contentBuf.append(line);
				}
				return contentBuf.toString();
			} finally {
				// Closing the reader also closes the connection's input stream.
				bufReader.close();
			}
		} finally {
			// Also covers the case where the reader could not be created at all.
			httpConn.disconnect();
		}
	}

	/**
	 * Prepends {@code http://} unless {@code url} already begins with an http or https scheme.
	 * The check is anchored at the start of the string so that hosts which merely contain
	 * the letters "http" (for example {@code www.httpbin.org}) still receive a scheme.
	 */
	private static String withDefaultScheme(String url) {
		boolean hasScheme = url.regionMatches(true, 0, "http://", 0, 7)
				|| url.regionMatches(true, 0, "https://", 0, 8);
		return hasScheme ? url : "http://" + url;
	}

	/**
	 * Detects the character encoding of the resource at {@code indexUrl} using cpdetector.
	 *
	 * @param indexUrl resource to inspect
	 * @return the detected charset name; {@code "GBK"} when the detector reports the
	 *         charset named {@code "void"} (treated here as "unknown"); or {@code null}
	 *         when detection failed or produced no charset
	 */
	public static String getFileEncode(URL indexUrl) {
		java.nio.charset.Charset charset = null;
		try {
			charset = detector.detectCodepage(indexUrl);
		} catch (Exception ignored) {
			// Detection is best-effort: on any failure fall through and return null
			// so the caller can apply its own default encoding.
		}
		if (charset == null) {
			return null;
		}
		// An unknown encoding defaults to GBK.
		return "void".equals(charset.name()) ? "GBK" : charset.name();
	}

	/** Manual smoke test: prints the content of a sample site, or the stack trace on failure. */
	public static void main(String[] args) {
		try {
			System.out.println(getContent("www.xjjz.gov.cn"));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

以上只能获取http协议网站内容,增加一个http、https都能获取的方式

注册http客户端

 

/**
	 * Creates an HttpClient able to reach both http and https endpoints.
	 *
	 * <p>NOTE: the https socket factory is built from a trust strategy whose
	 * {@code isTrusted} always returns {@code true}, combined with
	 * {@code ALLOW_ALL_HOSTNAME_VERIFIER}; certificates and host names are
	 * therefore accepted without validation.
	 *
	 * <p>The pooled connection manager is configured with the charset named by
	 * {@code defaultEncoding} (declared outside this method) and a socket
	 * timeout value of 100000.
	 *
	 * @return the configured client, or {@code null} if the SSL context could not
	 *         be created (the exception's stack trace is printed in that case)
	 */
	public CloseableHttpClient buildHttpClient() {
		// Step 1: build the accept-everything SSL context; bail out with null on failure.
		final SSLContext sslContext;
		try {
			KeyStore store = KeyStore.getInstance(KeyStore.getDefaultType());
			TrustStrategy acceptAll = new TrustStrategy() {
				@Override
				public boolean isTrusted(X509Certificate[] chain, String authType)
						throws CertificateException {
					return true;
				}
			};
			sslContext = SSLContexts.custom().useTLS().loadTrustMaterial(store, acceptAll).build();
		} catch (KeyStoreException e) {
			e.printStackTrace();
			return null;
		} catch (KeyManagementException e) {
			e.printStackTrace();
			return null;
		} catch (NoSuchAlgorithmException e) {
			e.printStackTrace();
			return null;
		}

		// Step 2: map each scheme to its socket factory.
		ConnectionSocketFactory plainFactory = new PlainConnectionSocketFactory();
		LayeredConnectionSocketFactory tlsFactory =
				new SSLConnectionSocketFactory(sslContext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
		RegistryBuilder<ConnectionSocketFactory> schemes = RegistryBuilder.create();
		schemes.register("http", plainFactory);
		schemes.register("https", tlsFactory);
		Registry<ConnectionSocketFactory> schemeRegistry = schemes.build();

		// Step 3: pooled connection manager with default charset and socket timeout.
		PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager(schemeRegistry);
		pool.setDefaultConnectionConfig(
				ConnectionConfig.custom().setCharset(Charset.forName(defaultEncoding)).build());
		pool.setDefaultSocketConfig(SocketConfig.custom().setSoTimeout(100000).build());

		return HttpClientBuilder.create().setConnectionManager(pool).build();
	}

 

/**
	 * Downloads the page at {@code url} over http or https and returns its text.
	 *
	 * <p>If {@code url} does not start with {@code http://} or {@code https://}
	 * (compared case-insensitively), {@code http://} is prepended. The encoding is
	 * obtained from {@code getFileEncode}; when that returns {@code null},
	 * {@code utf-8} is used. The HttpClient created for the request and the
	 * response are both closed before this method returns.
	 *
	 * @param url page address, with or without a scheme
	 * @return the response body decoded with the detected encoding, or an empty
	 *         string when the response carries no entity
	 * @throws ClientProtocolException on an HTTP protocol error
	 * @throws IOException if the URL is malformed, the HTTP client could not be
	 *                     created, or the transfer fails
	 */
	public static String getAllContent(String url) throws ClientProtocolException, IOException {
		// Anchor the scheme check at the start: a host that merely contains "http"
		// (e.g. www.httpbin.org) must still receive a scheme.
		boolean hasScheme = url.regionMatches(true, 0, "http://", 0, 7)
				|| url.regionMatches(true, 0, "https://", 0, 8);
		if (!hasScheme) {
			url = "http://" + url;
		}
		String fileEncode = getFileEncode(new URL(url));
		if (fileEncode == null) {
			fileEncode = "utf-8";
		}

		CloseableHttpClient httpClient = new HttpUtils().buildHttpClient();
		if (httpClient == null) {
			// buildHttpClient signals failure with null; report it instead of hitting an NPE below.
			throw new IOException("Unable to create HTTP client for " + url);
		}
		try {
			CloseableHttpResponse response = httpClient.execute(new HttpGet(url));
			try {
				HttpEntity entity = response.getEntity();
				if (entity == null) {
					return "";
				}
				return EntityUtils.toString(entity, Charset.forName(fileEncode));
			} finally {
				// Releases the connection held by this response.
				response.close();
			}
		} finally {
			// The client is created per call, so it must be closed per call.
			httpClient.close();
		}
	}

 

 

 

 

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值