importjava.io.BufferedReader;importjava.io.IOException;importjava.io.InputStreamReader;importjava.net.HttpURLConnection;importjava.net.URL;importjava.util.Iterator;importjava.util.List;importjava.util.Map;importjava.util.Set;importcpdetector.io.CodepageDetectorProxy;importcpdetector.io.HTMLCodepageDetector;importcpdetector.io.JChardetFacade;public classPageEncodeDetector {private static CodepageDetectorProxy detector =CodepageDetectorProxy
.getInstance();static{
detector.add(new HTMLCodepageDetector(false));
detector.add(JChardetFacade.getInstance());
}/*** 测试用例
*
*@paramargs*/
public static voidmain(String[] args) {
PageEncodeDetector web= newPageEncodeDetector();try{
System.out.println(web.getCharset("http://www.baidu.com/"));
}catch(IOException e) {//TODO Auto-generated catch block
e.printStackTrace();
}
}/***@paramstrurl
* 页面url地址,需要以 http://开始,例:http://www.pujia.com*@return*@throwsIOException*/
public String getCharset(String strurl) throwsIOException {//定义URL对象
URL url = newURL(strurl);//获取http连接对象
HttpURLConnection urlConnection =(HttpURLConnection) url
.openConnection();
;
urlConnection.connect();//网页编码
String strencoding = null;/*** 首先根据header信息,判断页面编码*/
//map存放的是header信息(url页面的头信息)
Map> map =urlConnection.getHeaderFields();
Set keys =map.keySet();
Iterator iterator =keys.iterator();//遍历,查找字符编码
String key = null;
String tmp= null;while(iterator.hasNext()) {
key=iterator.next();
tmp=map.get(key).toString().toLowerCase();//获取content-type charset
if (key != null && key.equals("Content-Type")) {int m = tmp.indexOf("charset=");if (m != -1) {
strencoding= tmp.substring(m + 8).replace("]", "");returnstrencoding;
}
}
}/*** 通过解析meta得到网页编码*/
//获取网页源码(英文字符和数字不会乱码,所以可以得到正确区域)
StringBuffer sb = newStringBuffer();
String line;try{
BufferedReader in= new BufferedReader(newInputStreamReader(
url.openStream()));while ((line = in.readLine()) != null) {
sb.append(line);
}
in.close();
}catch (Exception e) { //Report any errors that arise
System.err.println(e);
System.err
.println("Usage: java HttpClient []");
}
String htmlcode=sb.toString();//解析html源码,取出区域,并取出charset
String strbegin = "
String strend= ">";
String strtmp;int begin =htmlcode.indexOf(strbegin);int end = -1;intinttmp;while (begin > -1) {
end=htmlcode.substring(begin).indexOf(strend);if (begin > -1 && end > -1) {
strtmp= htmlcode.substring(begin, begin +end).toLowerCase();
inttmp= strtmp.indexOf("charset");if (inttmp > -1) {
strencoding= strtmp.substring(inttmp + 7, end)
.replace("=", "").replace("/", "")
.replace("\"", "").replace("\'", "")
.replace(" ", "");returnstrencoding;
}
}
htmlcode=htmlcode.substring(begin);
begin=htmlcode.indexOf(strbegin);
}/*** 分析字节得到网页编码*/strencoding=getFileEncoding(url);//设置默认网页字符编码
if (strencoding == null) {
strencoding= "GBK";
}returnstrencoding;
}/***
*
* 方法说明:通过网页内容识别网页编码
*
*
* 输入参数:strUrl 网页链接; timeout 超时设置
*
*
* 返回类型:网页编码*/
public staticString getFileEncoding(URL url) {
java.nio.charset.Charset charset= null;try{
charset=detector.detectCodepage(url);
}catch(Exception e) {
System.out.println(e.getClass()+ "分析" + "编码失败");
}if (charset != null)returncharset.name();return null;
}
}