使用 Java 得到网页编码格式

最新推荐文章于 2021-06-07 11:22:46 发布

dongle2001

最新推荐文章于 2021-06-07 11:22:46 发布

阅读量2.7k

点赞数

分类专栏： java 文章标签： java string null exception header url

本文链接：https://blog.csdn.net/dongle2001/article/details/2557469

版权

java 专栏收录该内容

30 篇文章 0 订阅

订阅专栏

package com.tag;



import java.net.MalformedURLException;

import java.net.URL;

import org.apache.commons.httpclient.Header;

import org.apache.commons.httpclient.HeaderElement;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.NameValuePair;

import org.apache.commons.httpclient.methods.GetMethod;

import toptrack.tools.JQueryBase;



/**

 * 得到网页编码格式

 * @author dl

 */

public class JHtmlUpdateCheck {

	/**文本内容编码识别类*/

	private static cpdetector.io.CodepageDetectorProxy detector = cpdetector.io.CodepageDetectorProxy.getInstance();  

	static {

		detector.add(new cpdetector.io.HTMLCodepageDetector(false)); 

		detector.add(cpdetector.io.JChardetFacade.getInstance());

	}



	/**

     *<br>方法说明：得到网页编码格式

     *<br>输入参数：strUrl 网页链接; timeout 超时设置

     *<br>返回类型：网页编码

     */

	public static String getEncoding(String strUrl, int timeout) {

		String strEncoding = null;

		HttpClient client = new HttpClient();

		client.getHttpConnectionManager().getParams().setConnectionTimeout(timeout); 

	    GetMethod method = new GetMethod(strUrl);

	    method.setFollowRedirects( true );

	    int statusCode;

	    try {

			statusCode = client.executeMethod(method);

			if( statusCode != -1) {

				//从http头得到网页编码

	    		strEncoding = getContentCharSet(method.getResponseHeader("Content-Type"));

	    		if (strEncoding != null) {

	    			method.releaseConnection();

	    			return strEncoding;

	    		}

				//通过解析meta得到网页编码

		    	String strHtml = method.getResponseBodyAsString().toLowerCase();

		    	StringBuffer strBuffer = new StringBuffer();

		    	int pos = JQueryBase.getTagText(strHtml, "<meta", ">", strBuffer, false, 0);

		    	while (strBuffer.length() > 0) {

		    		StringBuffer strEncodingBuffer = new StringBuffer();

		    		JQueryBase.getTagText(strBuffer.toString(), "charset=", "/"", strEncodingBuffer, 0);

		    		if (strEncodingBuffer.length() > 0) {

		    			strEncoding = strEncodingBuffer.toString();

		    			method.releaseConnection();

		    			return strEncoding;

		    		}

		    		strBuffer = new StringBuffer();

		    		pos = JQueryBase.getTagText(strHtml, "<meta", ">", strBuffer, false, pos);

		    	}

		    	//分析字节得到网页编码

		    	strEncoding = getFileEncoding(strUrl, timeout);

		    	//设置默认网页字符编码

		    	if (strEncoding == null)

		    		strEncoding = "GBK"; 

		    }

			method.releaseConnection();

		} catch (Exception e) {

			// TODO Auto-generated catch block

			System.out.println(e.getClass() + "对" + strUrl + "提取网页编码信息出错");

			return null;

		}

		

		return strEncoding;

	}



	/**

     *<br>方法说明：通过http头得到网页编码信息

     *<br>输入参数：contentheade rhttp头

     *<br>返回类型：网页编码

     */

	protected static String getContentCharSet(Header contentheader) {   

		String charset = null;   

		if (contentheader != null) {   

			HeaderElement values[] = contentheader.getElements();   

			if (values.length == 1) {   

				NameValuePair param = values[0].getParameterByName("charset");   

				if (param != null) {     

					charset = param.getValue();   

				}   

			}   

		}   

		return charset;   

	}   

	

	/**

     *<br>方法说明：通过网页内容识别网页编码

     *<br>输入参数：strUrl 网页链接; timeout 超时设置

     *<br>返回类型：网页编码

     */

	public static String getFileEncoding(String strUrl, int timeout) {

		java.nio.charset.Charset charset = null;  

		URL f;

		try {

			f = new URL(strUrl);

		} catch (MalformedURLException e) {

			// TODO Auto-generated catch block

			System.out.println(e.getClass() + strUrl + "无效");

			return null;

		}

		try {  

		      charset = detector.detectCodepage(f);  

		} catch (Exception e) {

			System.out.println(e.getClass() + "分析" + strUrl + "的编码失败");

		}  

		if (charset != null)

			return charset.name();

		return null;

	}

}

dongle2001

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
5
评论
使用 Java 得到网页编码格式

package com.tag;import java.net.MalformedURLException;import java.net.URL;import org.apache.commons.httpclient.Header;import org.apache.commons.httpclient.HeaderElement;import
复制链接

扫一扫