使用 Java 得到网页编码格式

package com.tag;



import java.net.MalformedURLException;

import java.net.URL;

import org.apache.commons.httpclient.Header;

import org.apache.commons.httpclient.HeaderElement;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.NameValuePair;

import org.apache.commons.httpclient.methods.GetMethod;

import toptrack.tools.JQueryBase;



/**

 * 得到网页编码格式

 * @author dl

 */

public class JHtmlUpdateCheck {

	/**文本内容编码识别类*/

	private static cpdetector.io.CodepageDetectorProxy detector = cpdetector.io.CodepageDetectorProxy.getInstance();  

	static {

		detector.add(new cpdetector.io.HTMLCodepageDetector(false)); 

		detector.add(cpdetector.io.JChardetFacade.getInstance());

	}



	/**

     *<br>方法说明:得到网页编码格式

     *<br>输入参数:strUrl 网页链接; timeout 超时设置

     *<br>返回类型:网页编码

     */

	public static String getEncoding(String strUrl, int timeout) {

		String strEncoding = null;

		HttpClient client = new HttpClient();

		client.getHttpConnectionManager().getParams().setConnectionTimeout(timeout); 

	    GetMethod method = new GetMethod(strUrl);

	    method.setFollowRedirects( true );

	    int statusCode;

	    try {

			statusCode = client.executeMethod(method);

			if( statusCode != -1) {

				//从http头得到网页编码

	    		strEncoding = getContentCharSet(method.getResponseHeader("Content-Type"));

	    		if (strEncoding != null) {

	    			method.releaseConnection();

	    			return strEncoding;

	    		}

				//通过解析meta得到网页编码

		    	String strHtml = method.getResponseBodyAsString().toLowerCase();

		    	StringBuffer strBuffer = new StringBuffer();

		    	int pos = JQueryBase.getTagText(strHtml, "<meta", ">", strBuffer, false, 0);

		    	while (strBuffer.length() > 0) {

		    		StringBuffer strEncodingBuffer = new StringBuffer();

		    		JQueryBase.getTagText(strBuffer.toString(), "charset=", "/"", strEncodingBuffer, 0);

		    		if (strEncodingBuffer.length() > 0) {

		    			strEncoding = strEncodingBuffer.toString();

		    			method.releaseConnection();

		    			return strEncoding;

		    		}

		    		strBuffer = new StringBuffer();

		    		pos = JQueryBase.getTagText(strHtml, "<meta", ">", strBuffer, false, pos);

		    	}

		    	//分析字节得到网页编码

		    	strEncoding = getFileEncoding(strUrl, timeout);

		    	//设置默认网页字符编码

		    	if (strEncoding == null)

		    		strEncoding = "GBK"; 

		    }

			method.releaseConnection();

		} catch (Exception e) {

			// TODO Auto-generated catch block

			System.out.println(e.getClass() + "对" + strUrl + "提取网页编码信息出错");

			return null;

		}

		

		return strEncoding;

	}



	/**

     *<br>方法说明:通过http头得到网页编码信息

     *<br>输入参数:contentheade rhttp头

     *<br>返回类型:网页编码

     */

	protected static String getContentCharSet(Header contentheader) {   

		String charset = null;   

		if (contentheader != null) {   

			HeaderElement values[] = contentheader.getElements();   

			if (values.length == 1) {   

				NameValuePair param = values[0].getParameterByName("charset");   

				if (param != null) {     

					charset = param.getValue();   

				}   

			}   

		}   

		return charset;   

	}   

	

	/**

     *<br>方法说明:通过网页内容识别网页编码

     *<br>输入参数:strUrl 网页链接; timeout 超时设置

     *<br>返回类型:网页编码

     */

	public static String getFileEncoding(String strUrl, int timeout) {

		java.nio.charset.Charset charset = null;  

		URL f;

		try {

			f = new URL(strUrl);

		} catch (MalformedURLException e) {

			// TODO Auto-generated catch block

			System.out.println(e.getClass() + strUrl + "无效");

			return null;

		}

		try {  

		      charset = detector.detectCodepage(f);  

		} catch (Exception e) {

			System.out.println(e.getClass() + "分析" + strUrl + "的编码失败");

		}  

		if (charset != null)

			return charset.name();

		return null;

	}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 5
    评论
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值