Java 页面编码：Java 获取页面编码

小软观察

于 2021-02-12 19:35:07 发布

阅读量114

点赞数

文章标签： Java 字符编码 HTTP连接 HTML解析编码检测

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/weixin_42537831/article/details/114070585

版权

importjava.io.BufferedReader;importjava.io.IOException;importjava.io.InputStreamReader;importjava.net.HttpURLConnection;importjava.net.URL;importjava.util.Iterator;importjava.util.List;importjava.util.Map;importjava.util.Set;importcpdetector.io.CodepageDetectorProxy;importcpdetector.io.HTMLCodepageDetector;importcpdetector.io.JChardetFacade;public classPageEncodeDetector {private static CodepageDetectorProxy detector =CodepageDetectorProxy

.getInstance();static{

detector.add(new HTMLCodepageDetector(false));

detector.add(JChardetFacade.getInstance());

}/*** 测试用例

*

*@paramargs*/

public static voidmain(String[] args) {

PageEncodeDetector web= newPageEncodeDetector();try{

System.out.println(web.getCharset("http://www.baidu.com/"));

}catch(IOException e) {//TODO Auto-generated catch block

e.printStackTrace();

}

}/***@paramstrurl

* 页面url地址,需要以 http://开始，例：http://www.pujia.com*@return*@throwsIOException*/

public String getCharset(String strurl) throwsIOException {//定义URL对象

URL url = newURL(strurl);//获取http连接对象

HttpURLConnection urlConnection =(HttpURLConnection) url

.openConnection();

;

urlConnection.connect();//网页编码

String strencoding = null;/*** 首先根据header信息，判断页面编码*/

//map存放的是header信息(url页面的头信息)

Map> map =urlConnection.getHeaderFields();

Set keys =map.keySet();

Iterator iterator =keys.iterator();//遍历,查找字符编码

String key = null;

String tmp= null;while(iterator.hasNext()) {

key=iterator.next();

tmp=map.get(key).toString().toLowerCase();//获取content-type charset

if (key != null && key.equals("Content-Type")) {int m = tmp.indexOf("charset=");if (m != -1) {

strencoding= tmp.substring(m + 8).replace("]", "");returnstrencoding;

}

}

}/*** 通过解析meta得到网页编码*/

//获取网页源码(英文字符和数字不会乱码，所以可以得到正确区域)

StringBuffer sb = newStringBuffer();

String line;try{

BufferedReader in= new BufferedReader(newInputStreamReader(

url.openStream()));while ((line = in.readLine()) != null) {

sb.append(line);

}

in.close();

}catch (Exception e) { //Report any errors that arise

System.err.println(e);

System.err

.println("Usage: java HttpClient []");

}

String htmlcode=sb.toString();//解析html源码，取出区域，并取出charset

String strbegin = "

String strend= ">";

String strtmp;int begin =htmlcode.indexOf(strbegin);int end = -1;intinttmp;while (begin > -1) {

end=htmlcode.substring(begin).indexOf(strend);if (begin > -1 && end > -1) {

strtmp= htmlcode.substring(begin, begin +end).toLowerCase();

inttmp= strtmp.indexOf("charset");if (inttmp > -1) {

strencoding= strtmp.substring(inttmp + 7, end)

.replace("=", "").replace("/", "")

.replace("\"", "").replace("\'", "")

.replace(" ", "");returnstrencoding;

}

}

htmlcode=htmlcode.substring(begin);

begin=htmlcode.indexOf(strbegin);

}/*** 分析字节得到网页编码*/strencoding=getFileEncoding(url);//设置默认网页字符编码

if (strencoding == null) {

strencoding= "GBK";

}returnstrencoding;

}/***

*

* 方法说明：通过网页内容识别网页编码

*

*

* 输入参数：strUrl 网页链接; timeout 超时设置

*

*

* 返回类型：网页编码*/

public staticString getFileEncoding(URL url) {

java.nio.charset.Charset charset= null;try{

charset=detector.detectCodepage(url);

}catch(Exception e) {

System.out.println(e.getClass()+ "分析" + "编码失败");

}if (charset != null)returncharset.name();return null;

}

}

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。