/*
* EncodeGoter.java
*
* Created on 2007年9月30日, 下午4:49
*
* To change this template, choose Tools | Template Manager
* and open the template in the editor.
*/
package com.ckcs.url;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Detects the character encoding declared by a web page.
 *
 * <p>The {@code charset} parameter of the HTTP {@code Content-Type} response
 * header is consulted first. When the header carries no charset, the beginning
 * of the response body is scanned for a {@code charset=...} declaration, such
 * as the one found in an HTML {@code meta} tag.
 *
 * @author admin
 */
public class EncodeGoter {

    /**
     * Matches {@code charset=NAME}, allowing optional whitespace around the
     * equals sign and an optional opening quote. Group 1 captures only NAME,
     * so trailing quotes, slashes and tag delimiters never end up in the result.
     * Compiled once because the expression never changes.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9][A-Za-z0-9._:+-]*)",
            Pattern.CASE_INSENSITIVE);

    /** Upper bound on the number of body bytes scanned for a declaration. */
    private static final int MAX_SCAN_BYTES = 64 * 1024;

    /**
     * Creates a new instance of EncodeGoter.
     */
    public EncodeGoter() {
    }

    /**
     * Determines the character encoding of the page at the given address.
     *
     * @param size the address of the page, in a form accepted by {@link URL#URL(String)}
     * @return the declared charset name, or {@code null} when neither the
     *         response header nor the scanned part of the body declares one
     * @throws Exception if the address is malformed or the page cannot be read
     */
    private String getEncode(String size) throws Exception {
        URL url = new URL(size);
        URLConnection con = url.openConnection();

        // First try the HTTP response header; getContentType() may return null.
        String charset = doGetEncode(CHARSET_PATTERN, con.getContentType());
        if (charset != null) {
            return charset;
        }

        // Otherwise fall back to the page's own declaration. The body is read
        // from the connection that is already open rather than issuing a
        // second request, and the stream is always closed.
        InputStream in = new BufferedInputStream(con.getInputStream());
        try {
            return doGetEncode(CHARSET_PATTERN, readPrefix(in, MAX_SCAN_BYTES));
        } finally {
            in.close();
        }
    }

    /**
     * Reads at most {@code maxBytes} bytes from the stream and returns them as text.
     *
     * <p>The bytes are accumulated before any matching takes place, so a
     * declaration that straddles a read boundary is still seen whole. They are
     * decoded as ISO-8859-1, which maps every byte to exactly one character;
     * the ASCII text being searched for is therefore preserved whatever the
     * page's real encoding is, and the result does not depend on the platform
     * default charset.
     *
     * @param in       the stream to read from; it is not closed by this method
     * @param maxBytes the maximum number of bytes to consume
     * @return the bytes read, decoded as ISO-8859-1
     * @throws java.io.IOException if reading from the stream fails
     */
    private static String readPrefix(InputStream in, int maxBytes) throws java.io.IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        int remaining = maxBytes;
        while (remaining > 0) {
            int count = in.read(chunk, 0, Math.min(chunk.length, remaining));
            if (count == -1) {
                break; // end of stream
            }
            buffer.write(chunk, 0, count);
            remaining -= count;
        }
        return buffer.toString("ISO-8859-1");
    }

    /**
     * Extracts the charset name from the first match of the pattern in the text.
     *
     * @param pattern the pattern to search with; when it defines a capturing
     *                group, group 1 is taken as the charset name, otherwise
     *                the whole match is returned
     * @param str     the text to search; may be {@code null}
     * @return the charset name, or {@code null} when {@code str} is
     *         {@code null} or contains no match
     * @throws Exception never thrown by this implementation; retained so the
     *                   signature stays unchanged
     */
    private String doGetEncode(Pattern pattern, String str) throws Exception {
        if (str == null) {
            return null;
        }
        Matcher matcher = pattern.matcher(str);
        if (!matcher.find()) {
            return null;
        }
        return matcher.groupCount() >= 1 ? matcher.group(1) : matcher.group();
    }

    /**
     * Prints the detected character encoding of a page, or the platform
     * default charset when none can be found.
     *
     * @param args optional; the first element is the page address to inspect,
     *             defaulting to {@code http://java.sun.com}
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        String pageUrl = (args != null && args.length > 0) ? args[0] : "http://java.sun.com";
        EncodeGoter eg = new EncodeGoter();
        String charset = eg.getEncode(pageUrl);
        if (charset != null) {
            System.out.println("页面的字符编码应该为:" + charset);
        } else {
            charset = Charset.defaultCharset().toString(); // fall back to the platform default
            System.out.println("找不到页面字符编码,平台默认编码为:" + charset);
        }
    }
}
/*
* EncodeGoter.java
*
* Created on 2007年9月30日, 下午4:49
*
* To change this template, choose Tools | Template Manager
* and open the template in the editor.
*/
package com.ckcs.url;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Detects the character encoding declared by a web page.
 *
 * <p>The {@code charset} parameter of the HTTP {@code Content-Type} response
 * header is consulted first. When the header carries no charset, the beginning
 * of the response body is scanned for a {@code charset=...} declaration, such
 * as the one found in an HTML {@code meta} tag.
 *
 * @author admin
 */
public class EncodeGoter {

    /**
     * Matches {@code charset=NAME}, allowing optional whitespace around the
     * equals sign and an optional opening quote. Group 1 captures only NAME,
     * so trailing quotes, slashes and tag delimiters never end up in the result.
     * Compiled once because the expression never changes.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9][A-Za-z0-9._:+-]*)",
            Pattern.CASE_INSENSITIVE);

    /** Upper bound on the number of body bytes scanned for a declaration. */
    private static final int MAX_SCAN_BYTES = 64 * 1024;

    /**
     * Creates a new instance of EncodeGoter.
     */
    public EncodeGoter() {
    }

    /**
     * Determines the character encoding of the page at the given address.
     *
     * @param size the address of the page, in a form accepted by {@link URL#URL(String)}
     * @return the declared charset name, or {@code null} when neither the
     *         response header nor the scanned part of the body declares one
     * @throws Exception if the address is malformed or the page cannot be read
     */
    private String getEncode(String size) throws Exception {
        URL url = new URL(size);
        URLConnection con = url.openConnection();

        // First try the HTTP response header; getContentType() may return null.
        String charset = doGetEncode(CHARSET_PATTERN, con.getContentType());
        if (charset != null) {
            return charset;
        }

        // Otherwise fall back to the page's own declaration. The body is read
        // from the connection that is already open rather than issuing a
        // second request, and the stream is always closed.
        InputStream in = new BufferedInputStream(con.getInputStream());
        try {
            return doGetEncode(CHARSET_PATTERN, readPrefix(in, MAX_SCAN_BYTES));
        } finally {
            in.close();
        }
    }

    /**
     * Reads at most {@code maxBytes} bytes from the stream and returns them as text.
     *
     * <p>The bytes are accumulated before any matching takes place, so a
     * declaration that straddles a read boundary is still seen whole. They are
     * decoded as ISO-8859-1, which maps every byte to exactly one character;
     * the ASCII text being searched for is therefore preserved whatever the
     * page's real encoding is, and the result does not depend on the platform
     * default charset.
     *
     * @param in       the stream to read from; it is not closed by this method
     * @param maxBytes the maximum number of bytes to consume
     * @return the bytes read, decoded as ISO-8859-1
     * @throws java.io.IOException if reading from the stream fails
     */
    private static String readPrefix(InputStream in, int maxBytes) throws java.io.IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        int remaining = maxBytes;
        while (remaining > 0) {
            int count = in.read(chunk, 0, Math.min(chunk.length, remaining));
            if (count == -1) {
                break; // end of stream
            }
            buffer.write(chunk, 0, count);
            remaining -= count;
        }
        return buffer.toString("ISO-8859-1");
    }

    /**
     * Extracts the charset name from the first match of the pattern in the text.
     *
     * @param pattern the pattern to search with; when it defines a capturing
     *                group, group 1 is taken as the charset name, otherwise
     *                the whole match is returned
     * @param str     the text to search; may be {@code null}
     * @return the charset name, or {@code null} when {@code str} is
     *         {@code null} or contains no match
     * @throws Exception never thrown by this implementation; retained so the
     *                   signature stays unchanged
     */
    private String doGetEncode(Pattern pattern, String str) throws Exception {
        if (str == null) {
            return null;
        }
        Matcher matcher = pattern.matcher(str);
        if (!matcher.find()) {
            return null;
        }
        return matcher.groupCount() >= 1 ? matcher.group(1) : matcher.group();
    }

    /**
     * Prints the detected character encoding of a page, or the platform
     * default charset when none can be found.
     *
     * @param args optional; the first element is the page address to inspect,
     *             defaulting to {@code http://java.sun.com}
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        String pageUrl = (args != null && args.length > 0) ? args[0] : "http://java.sun.com";
        EncodeGoter eg = new EncodeGoter();
        String charset = eg.getEncode(pageUrl);
        if (charset != null) {
            System.out.println("页面的字符编码应该为:" + charset);
        } else {
            charset = Charset.defaultCharset().toString(); // fall back to the platform default
            System.out.println("找不到页面字符编码,平台默认编码为:" + charset);
        }
    }
}