java 页面编码_java获取页面编码

最新推荐文章于 2021-02-21 08:49:47 发布

SMS Parry

最新推荐文章于 2021-02-21 08:49:47 发布

阅读量242

点赞数

文章标签： java 页面编码

本文链接：https://blog.csdn.net/weixin_30067913/article/details/114070588

版权

import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.HTMLCodepageDetector;
import info.monitorenter.cpdetector.io.JChardetFacade;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class WebEncoding {

private static CodepageDetectorProxy detector = CodepageDetectorProxy

.getInstance();

static {

detector.add(new HTMLCodepageDetector(false));

detector.add(JChardetFacade.getInstance());

}

/**测试用例

* @param args

public static void main(String[] args) {

WebEncoding web=new WebEncoding();

try {

System.out.println(web.getCharset("http://www.pujia.com/"));

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

/**

* @param strurl

* 页面url地址,需要以 http://开始，例：http://www.pujia.com

* @return

* @throws IOException

public String getCharset(String strurl) throws IOException {

// 定义URL对象

URL url = new URL(strurl);

// 获取http连接对象

HttpURLConnection urlConnection = (HttpURLConnection) url

.openConnection();

;

urlConnection.connect();

// 网页编码

String strencoding = null;

/**

* 首先根据header信息，判断页面编码

// map存放的是header信息(url页面的头信息)

Map> map = urlConnection.getHeaderFields();

Set keys = map.keySet();

Iterator iterator = keys.iterator();

// 遍历,查找字符编码

String key = null;

String tmp = null;

while (iterator.hasNext()) {

key = iterator.next();

tmp = map.get(key).toString().toLowerCase();

// 获取content-type charset

if (key != null && key.equals("Content-Type")) {

int m = tmp.indexOf("charset=");

if (m != -1) {

strencoding = tmp.substring(m + 8).replace("]", "");

return strencoding;

}

/**

* 通过解析meta得到网页编码

// 获取网页源码(英文字符和数字不会乱码，所以可以得到正确区域)

StringBuffer sb = new StringBuffer();

String line;

try {

BufferedReader in = new BufferedReader(new InputStreamReader(url

.openStream()));

while ((line = in.readLine()) != null) {

sb.append(line);

}

in.close();

} catch (Exception e) { // Report any errors that arise

System.err.println(e);

System.err

.println("Usage: java HttpClient []");

}

String htmlcode = sb.toString();

// 解析html源码，取出区域，并取出charset

String strbegin = "

String strend = ">";

String strtmp;

int begin = htmlcode.indexOf(strbegin);

int end = -1;

int inttmp;

while (begin > -1) {

end = htmlcode.substring(begin).indexOf(strend);

if (begin > -1 && end > -1) {

strtmp = htmlcode.substring(begin, begin + end).toLowerCase();

inttmp = strtmp.indexOf("charset");

if (inttmp > -1) {

strencoding = strtmp.substring(inttmp + 7, end).replace(

"=", "").replace("/", "").replace("\"", "")

.replace("\'", "").replace(" ", "");

return strencoding;

}

htmlcode = htmlcode.substring(begin);

begin = htmlcode.indexOf(strbegin);

}

/**

* 分析字节得到网页编码

strencoding = getFileEncoding(url);

// 设置默认网页字符编码

if (strencoding == null) {

strencoding = "GBK";

}

return strencoding;

}

/**

* 方法说明：通过网页内容识别网页编码

* 输入参数：strUrl 网页链接; timeout 超时设置

* 返回类型：网页编码

public static String getFileEncoding(URL url) {

java.nio.charset.Charset charset = null;

try {

charset = detector.detectCodepage(url);

} catch (Exception e) {

System.out.println(e.getClass() + "分析" + "编码失败");

}

if (charset != null)

return charset.name();

return null;

}

文章中用到的lib包，在附件中有可以下载。写出来，希望大家都happy，不用再为这个烦恼。

下载次数: 610

顶

踩

分享到：

2010-02-04 14:58

9 楼

1070482318

2015-12-09

测试链接：

http://www.jiuye.org/new/career/info/otherRec.html

http://yunnan.bidchance.com/

时无用，求楼主们看看

8 楼

虚客_zZ

2012-05-14

各位刚发错了， 7楼代码有问题。

改用现在这个吧。

strencoding = strtmp.substring(inttmp + 7, end).replace(

"=", "").replace("/", "").replace("\"", "")

.replace("\'", "").replace(" ", "");

替换为

String strChasetStart = "=";

String strChasetEnd = " ";

String strCharset = strtmp.substring(inttmp);

int charsetStart = strCharset.indexOf(strChasetStart);

int charsetEnd = strCharset.indexOf(strChasetEnd);

strencoding = strCharset.substring(charsetStart, charsetEnd).replace(

"=", "").replace("/", "").replace("\"", "").replace("\'", "").replace(" ", "");

7 楼

虚客_zZ

2012-05-14

strencoding = strtmp.substring(inttmp + 7, end).replace(

"=", "").replace("/", "").replace("\"", "")

.replace("\'", "").replace(" ", "");

替换为

String strChasetEnd = "\"";

String strCharset = strtmp.substring(inttmp);

int charsetEnd = strCharset.indexOf(strChasetEnd);

strencoding = strCharset.substring(7, charsetEnd).replace(

"=", "").replace("/", "").replace("\"", "").replace("\'", "").replace(" ", "");;

6 楼

虚客_zZ

2012-05-14

加入 meta 为

<meta content="text/html; charset=utf-8" http-equiv="Content-Type" />

时是有问题的，编码抽取为 utf-8http-equivcontent-type

原因楼主应该懂的

5 楼

philiphewxu

2012-03-25

再次调试发现是我自己程序其他地方的问题和库没有关系再次谢谢作者！

4 楼

philiphewxu

2012-03-25

朋友，用了你的代码，有些问题想请教一下：有时读取编码的时间没法控制，会卡在那里非常久；用了 detector.wait(10000) 函数之后，又不停地出现“分析编码出错”。请问应该如何进行超时设置？

3 楼

vanxining

2011-07-19

不错~

2 楼

xiaoyangok

2011-03-14

1 楼

xiaoqing20

2010-07-29

大哥你是好人

SMS Parry

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java 页面编码_java获取页面编码

import info.monitorenter.cpdetector.io.CodepageDetectorProxy;import info.monitorenter.cpdetector.io.HTMLCodepageDetector;import info.monitorenter.cpdetector.io.JChardetFacade;import java.io.BufferedRe...
复制链接

扫一扫