package com.poi.examples;
/**
 * Title: LoonFramework
 * <p>
 * Description: the set of basic character-encoding identifiers together with
 * three name tables (Java name, readable name, HTML charset name) indexed by
 * those identifiers.
 * </p>
 * <p>
 * Copyright: Copyright (c) 2008
 * </p>
 * <p>
 * Company: LoonFramework
 * </p>
 * <p>
 * License: http://www.apache.org/licenses/LICENSE-2.0
 * </p>
 * <p>
 * The name tables are filled once when the class is loaded, so they can be
 * read without creating an instance. The arrays themselves are public and
 * therefore mutable; callers should treat them as read-only.
 * </p>
 *
 * @author chenpeng (ceponline@yahoo.com.cn)
 * @version 0.1
 */
public class Encoding {
    // Supported encodings. Each value is an index into the name tables below.
    public static final int GB2312 = 0;
    public static final int GBK = 1;
    public static final int BIG5 = 2;
    public static final int UTF8 = 3;
    public static final int UNICODE = 4;
    public static final int EUC_KR = 5;
    public static final int SJIS = 6;
    public static final int EUC_JP = 7;
    public static final int ASCII = 8;
    public static final int UNKNOWN = 9;
    // Number of encoding identifiers, i.e. the length of every name table.
    public static final int TOTALT = 10;

    public final static int SIMP = 0;
    public final static int TRAD = 1;

    // Names used when resolving an encoding by name.
    public static final String[] javaname = new String[TOTALT];
    // Readable names.
    public static final String[] nicename = new String[TOTALT];
    // Charset names as used in HTML.
    public static final String[] htmlname = new String[TOTALT];

    static {
        javaname[GB2312] = "GB2312";
        javaname[GBK] = "GBK";
        javaname[BIG5] = "BIG5";
        javaname[UTF8] = "UTF8";
        javaname[UNICODE] = "Unicode";
        javaname[EUC_KR] = "EUC_KR";
        javaname[SJIS] = "SJIS";
        javaname[EUC_JP] = "EUC_JP";
        javaname[ASCII] = "ASCII";
        javaname[UNKNOWN] = "ISO8859_1";

        // HTML charset names
        htmlname[GB2312] = "GB2312";
        htmlname[GBK] = "GBK";
        htmlname[BIG5] = "BIG5";
        htmlname[UTF8] = "UTF-8";
        htmlname[UNICODE] = "UTF-16";
        htmlname[EUC_KR] = "EUC-KR";
        htmlname[SJIS] = "Shift_JIS";
        htmlname[EUC_JP] = "EUC-JP";
        htmlname[ASCII] = "ASCII";
        htmlname[UNKNOWN] = "ISO8859-1";

        // Readable names
        nicename[GB2312] = "GB-2312";
        nicename[GBK] = "GBK";
        nicename[BIG5] = "Big5";
        nicename[UTF8] = "UTF-8";
        nicename[UNICODE] = "Unicode";
        nicename[EUC_KR] = "EUC-KR";
        nicename[SJIS] = "Shift-JIS";
        nicename[EUC_JP] = "EUC-JP";
        nicename[ASCII] = "ASCII";
        nicename[UNKNOWN] = "UNKNOWN";
    }

    /**
     * Creates an instance. The name tables are already populated by the
     * static initializer, so construction no longer touches shared state;
     * the constructor is kept so existing callers and subclasses still work.
     */
    public Encoding() {
    }

    /**
     * Returns the three names of an encoding joined by commas, in the order
     * Java name, readable name, HTML name. The result is interned.
     *
     * @param type one of the encoding identifiers defined in this class
     * @return the comma-separated names for {@code type}
     * @throws ArrayIndexOutOfBoundsException if {@code type} is not a valid
     *             index into the name tables
     */
    public String toEncoding(final int type) {
        return (javaname[type] + "," + nicename[type] + "," + htmlname[type])
                .intern();
    }
}
package com.poi.examples;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Guesses the character encoding of a byte sequence, file, stream or URL by
 * scoring the bytes against each supported encoding and picking the highest
 * score. Results are reported either as an encoding identifier from
 * {@link Encoding} ({@code getEncodeValue}) or as its readable name
 * ({@code getEncoding}).
 */
public class ParseEncoding extends Encoding {
    // Per-encoding lookup tables indexed by [row][column] of a double-byte
    // character. Nothing in this class writes to them (the init() call in
    // the constructor is disabled), so they stay zero-filled unless they are
    // populated from outside.
    int[][] GB2312format = new int[94][94];
    int[][] GBKformat = new int[126][191];
    int[][] Big5format = new int[94][158];
    int[][] EUC_KRformat = new int[94][94];
    int[][] JPformat = new int[94][94];

    public ParseEncoding() {
        super();
        // Table initialisation is disabled in this version:
        // init();
    }

    /**
     * Returns the readable name of the encoding detected for a path. A path
     * starting with {@code http://} or {@code https://} is fetched as a URL;
     * anything else is read as a local file.
     *
     * @param path URL or file path
     * @return the readable encoding name, or the UNKNOWN name on failure
     */
    public String getEncoding(final String path) {
        return check(getEncodeValue(path));
    }

    /**
     * Returns the readable name of the encoding detected for a stream.
     * The stream is closed by this call.
     *
     * @param in stream to sample
     * @return the readable encoding name, or the UNKNOWN name on failure
     */
    public String getEncoding(final InputStream in) {
        return check(getEncodeValue(in));
    }

    /**
     * Returns the readable name of the encoding detected for a byte array.
     *
     * @param buffer bytes to analyse
     * @return the readable encoding name, or the UNKNOWN name on failure
     */
    public String getEncoding(final byte[] buffer) {
        return check(getEncodeValue(buffer));
    }

    /**
     * Returns the readable name of the encoding detected for the content
     * behind a URL.
     *
     * @param url location to fetch
     * @return the readable encoding name, or the UNKNOWN name on failure
     */
    public String getEncoding(final URL url) {
        return check(getEncodeValue(url));
    }

    /**
     * Maps a detection result to its readable name. Any value that is not a
     * valid table index (including the -1 failure marker) maps to UNKNOWN.
     */
    private String check(final int result) {
        if (result < 0 || result >= TOTALT) {
            return nicename[UNKNOWN];
        }
        return nicename[result];
    }

    /**
     * Detects the encoding of the resource named by a path string.
     *
     * @param path URL ({@code http://} or {@code https://}) or file path
     * @return the encoding identifier, or -1 if the path is null, the URL is
     *         malformed or the resource cannot be read
     */
    private int getEncodeValue(String path) {
        if (path == null) {
            return -1;
        }
        if (path.startsWith("http://") || path.startsWith("https://")) {
            try {
                return getEncodeValue(new URL(path));
            } catch (MalformedURLException e) {
                return -1;
            }
        }
        return getEncodeValue(new File(path));
    }

    /**
     * Detects the encoding of a stream from at most its first 8192 bytes and
     * returns the identifier with the highest score. The stream is always
     * closed, whether or not reading succeeds.
     *
     * @param in stream to sample; may be null
     * @return the encoding identifier, or -1 if {@code in} is null or
     *         reading fails
     */
    public int getEncodeValue(InputStream in) {
        if (in == null) {
            return -1;
        }
        byte[] sample = new byte[8192];
        int filled = 0;
        try {
            int count;
            while (filled < sample.length
                    && (count = in.read(sample, filled, sample.length
                            - filled)) > 0) {
                filled += count;
            }
        } catch (IOException e) {
            return -1;
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails; the sample
                // that was read is still valid.
            }
        }
        // Score only the bytes actually read, not the zero padding behind
        // them, so a stream gives the same answer as the equivalent array.
        byte[] content = new byte[filled];
        System.arraycopy(sample, 0, content, 0, filled);
        return getEncodeValue(content);
    }

    /**
     * Detects the encoding of the content behind a URL and returns the
     * identifier with the highest score.
     *
     * @param url location to fetch; may be null
     * @return the encoding identifier, or -1 if {@code url} is null or the
     *         connection cannot be opened or read
     */
    public int getEncodeValue(URL url) {
        if (url == null) {
            return -1;
        }
        InputStream stream;
        try {
            stream = url.openStream();
        } catch (IOException e) {
            return -1;
        }
        return getEncodeValue(stream);
    }

    /**
     * Detects the encoding of a file's full content and returns the
     * identifier with the highest score.
     *
     * @param file file to read; may be null
     * @return the encoding identifier, or -1 if {@code file} is null, cannot
     *         be opened or cannot be read
     */
    public int getEncodeValue(File file) {
        if (file == null) {
            return -1;
        }
        InputStream in;
        try {
            in = new FileInputStream(file);
        } catch (FileNotFoundException e) {
            return -1;
        }
        try {
            return getEncodeValue(read(in));
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // The content has already been read (or reading has already
                // failed); a close failure does not change the result.
            }
        }
    }

    /**
     * Reads a stream to its end into a byte array. The stream is not closed
     * here; the caller owns it.
     *
     * @param inputStream stream to drain
     * @return all bytes of the stream, or null if reading fails
     */
    private final byte[] read(final InputStream inputStream) {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        // A fixed, non-empty buffer: read(byte[]) returns 0 for a zero-length
        // buffer, which would never reach the -1 end-of-stream marker.
        byte[] chunk = new byte[8192];
        try {
            int read;
            while ((read = inputStream.read(chunk)) >= 0) {
                collected.write(chunk, 0, read);
            }
        } catch (IOException e) {
            return null;
        }
        return collected.toByteArray();
    }

    /**
     * Scores a byte array against every supported encoding and returns the
     * identifier with the highest score. On equal scores the lower
     * identifier wins. A best score of 50 or less yields UNKNOWN.
     *
     * @param content bytes to analyse; may be null
     * @return the encoding identifier, or -1 if {@code content} is null
     */
    public int getEncodeValue(byte[] content) {
        if (content == null) {
            return -1;
        }
        int[] scores = new int[TOTALT];
        scores[GB2312] = gb2312probability(content);
        scores[GBK] = gbkprobability(content);
        scores[BIG5] = big5probability(content);
        scores[UTF8] = utf8probability(content);
        scores[UNICODE] = utf16probability(content);
        scores[EUC_KR] = euc_krprobability(content);
        scores[ASCII] = asciiprobability(content);
        scores[SJIS] = sjisprobability(content);
        scores[EUC_JP] = euc_jpprobability(content);
        scores[UNKNOWN] = 0;

        int encoding = UNKNOWN;
        int maxscore = 0;
        for (int index = 0; index < TOTALT; index++) {
            if (scores[index] > maxscore) {
                encoding = index;
                maxscore = scores[index];
            }
        }
        // Only accept a result whose score is above 50.
        if (maxscore <= 50) {
            encoding = UNKNOWN;
        }
        return encoding;
    }

    /**
     * Scores the content as GB2312.
     *
     * @param content bytes to analyse
     * @return the score: up to 50 for the share of high-bit byte pairs that
     *         fall in the GB2312 range plus up to 50 from the table/row bonus
     */
    private int gb2312probability(byte[] content) {
        int dbchars = 1, gbchars = 1;
        long gbformat = 0, totalformat = 1;
        int rawtextlen = content.length;
        for (int i = 0; i < rawtextlen - 1; i++) {
            if (content[i] < 0) {
                dbchars++;
                // Lead byte 0xA1-0xF7, trail byte 0xA1-0xFE.
                if ((byte) 0xA1 <= content[i] && content[i] <= (byte) 0xF7
                        && (byte) 0xA1 <= content[i + 1]
                        && content[i + 1] <= (byte) 0xFE) {
                    gbchars++;
                    totalformat += 500;
                    // Both bytes are negative here, so +256 gives the
                    // unsigned value.
                    int row = content[i] + 256 - 0xA1;
                    int column = content[i + 1] + 256 - 0xA1;
                    if (GB2312format[row][column] != 0) {
                        gbformat += GB2312format[row][column];
                    } else if (15 <= row && row < 55) {
                        gbformat += 200;
                    }
                }
                // Skip the second byte of the pair.
                i++;
            }
        }
        float rangeval = 50 * ((float) gbchars / (float) dbchars);
        float formatval = 50 * ((float) gbformat / (float) totalformat);
        return (int) (rangeval + formatval);
    }

    /**
     * Scores the content as GBK. Pairs in the GB2312 range are scored as in
     * {@code gb2312probability}; pairs in the GBK extension range are
     * counted and looked up in the GBK table. The final score is reduced by
     * one.
     *
     * @param content bytes to analyse
     * @return the score
     */
    private int gbkprobability(byte[] content) {
        int dbchars = 1, gbchars = 1;
        long gbformat = 0, totalformat = 1;
        int rawtextlen = content.length;
        for (int i = 0; i < rawtextlen - 1; i++) {
            if (content[i] < 0) {
                dbchars++;
                if ((byte) 0xA1 <= content[i] && content[i] <= (byte) 0xF7
                        && (byte) 0xA1 <= content[i + 1]
                        && content[i + 1] <= (byte) 0xFE) {
                    // GB2312 range
                    gbchars++;
                    totalformat += 500;
                    int row = content[i] + 256 - 0xA1;
                    int column = content[i + 1] + 256 - 0xA1;
                    if (GB2312format[row][column] != 0) {
                        gbformat += GB2312format[row][column];
                    } else if (15 <= row && row < 55) {
                        gbformat += 200;
                    }
                } else if ((byte) 0x81 <= content[i]
                        && content[i] <= (byte) 0xFE
                        && (((byte) 0x80 <= content[i + 1] && content[i + 1] <= (byte) 0xFE)
                                || ((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7E))) {
                    // GBK extension range
                    gbchars++;
                    totalformat += 500;
                    int row = content[i] + 256 - 0x81;
                    int column;
                    if (0x40 <= content[i + 1] && content[i + 1] <= 0x7E) {
                        column = content[i + 1] - 0x40;
                    } else {
                        column = content[i + 1] + 256 - 0x40;
                    }
                    if (GBKformat[row][column] != 0) {
                        gbformat += GBKformat[row][column];
                    }
                }
                i++;
            }
        }
        float rangeval = 50 * ((float) gbchars / (float) dbchars);
        float formatval = 50 * ((float) gbformat / (float) totalformat);
        return (int) (rangeval + formatval) - 1;
    }

    /**
     * Scores the content as Big5.
     *
     * @param content bytes to analyse
     * @return the score
     */
    private int big5probability(byte[] content) {
        int dbchars = 1, bfchars = 1;
        long bfformat = 0, totalformat = 1;
        int rawtextlen = content.length;
        for (int i = 0; i < rawtextlen - 1; i++) {
            if (content[i] < 0) {
                dbchars++;
                // Lead byte 0xA1-0xF9, trail byte 0x40-0x7E or 0xA1-0xFE.
                if ((byte) 0xA1 <= content[i]
                        && content[i] <= (byte) 0xF9
                        && (((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7E)
                                || ((byte) 0xA1 <= content[i + 1] && content[i + 1] <= (byte) 0xFE))) {
                    bfchars++;
                    totalformat += 500;
                    int row = content[i] + 256 - 0xA1;
                    int column;
                    if (0x40 <= content[i + 1] && content[i + 1] <= 0x7E) {
                        column = content[i + 1] - 0x40;
                    } else {
                        column = content[i + 1] + 256 - 0x61;
                    }
                    if (Big5format[row][column] != 0) {
                        bfformat += Big5format[row][column];
                    } else if (3 <= row && row <= 37) {
                        bfformat += 200;
                    }
                }
                i++;
            }
        }
        float rangeval = 50 * ((float) bfchars / (float) dbchars);
        float formatval = 50 * ((float) bfformat / (float) totalformat);
        return (int) (rangeval + formatval);
    }

    /**
     * Scores the content as UTF-8 by counting bytes that belong to
     * well-formed two- and three-byte sequences.
     *
     * @param content bytes to analyse
     * @return 0 for pure 7-bit input; otherwise the percentage of non-ASCII
     *         bytes that are part of a well-formed sequence, but only if it
     *         is above 98, or above 95 with more than 30 such bytes;
     *         otherwise 0
     */
    private int utf8probability(byte[] content) {
        int goodbytes = 0, asciibytes = 0;
        int rawtextlen = content.length;
        for (int i = 0; i < rawtextlen; i++) {
            if (content[i] >= 0) {
                asciibytes++;
            } else if (-64 <= content[i] && content[i] <= -33
                    && i + 1 < rawtextlen && -128 <= content[i + 1]
                    && content[i + 1] <= -65) {
                // Two-byte sequence: 0xC0-0xDF followed by 0x80-0xBF.
                goodbytes += 2;
                i++;
            } else if (-32 <= content[i] && content[i] <= -17
                    && i + 2 < rawtextlen && -128 <= content[i + 1]
                    && content[i + 1] <= -65 && -128 <= content[i + 2]
                    && content[i + 2] <= -65) {
                // Three-byte sequence: 0xE0-0xEF followed by two 0x80-0xBF.
                goodbytes += 3;
                i += 2;
            }
        }
        if (asciibytes == rawtextlen) {
            return 0;
        }
        int score = (int) (100 * ((float) goodbytes / (float) (rawtextlen - asciibytes)));
        if (score > 98) {
            return score;
        } else if (score > 95 && goodbytes > 30) {
            return score;
        } else {
            return 0;
        }
    }

    /**
     * Scores the content as UTF-16 by looking for a byte-order mark.
     *
     * @param content bytes to analyse
     * @return 100 if the content starts with FE FF or FF FE, otherwise 0
     */
    private int utf16probability(byte[] content) {
        // The length guard must cover both BOM tests; without it a short
        // array is indexed out of bounds.
        if (content.length < 2) {
            return 0;
        }
        boolean bigEndianBom = (byte) 0xFE == content[0]
                && (byte) 0xFF == content[1];
        boolean littleEndianBom = (byte) 0xFF == content[0]
                && (byte) 0xFE == content[1];
        return (bigEndianBom || littleEndianBom) ? 100 : 0;
    }

    /**
     * Scores the content as ASCII. Starts at 75 and loses 5 for every
     * high-bit byte and every ESC byte (ESC is used by ISO 2022).
     *
     * @param content bytes to analyse
     * @return the remaining score, never below 0
     */
    private int asciiprobability(byte[] content) {
        int score = 75;
        int rawtextlen = content.length;
        for (int i = 0; i < rawtextlen; i++) {
            if (content[i] < 0) {
                score = score - 5;
            } else if (content[i] == (byte) 0x1B) {
                score = score - 5;
            }
            if (score <= 0) {
                return 0;
            }
        }
        return score;
    }

    /**
     * Scores the content as EUC-KR.
     *
     * @param content bytes to analyse
     * @return the score
     */
    private int euc_krprobability(byte[] content) {
        int dbchars = 1, krchars = 1;
        long krformat = 0, totalformat = 1;
        int rawtextlen = content.length;
        for (int i = 0; i < rawtextlen - 1; i++) {
            if (content[i] < 0) {
                dbchars++;
                // Lead and trail byte both 0xA1-0xFE.
                if ((byte) 0xA1 <= content[i] && content[i] <= (byte) 0xFE
                        && (byte) 0xA1 <= content[i + 1]
                        && content[i + 1] <= (byte) 0xFE) {
                    krchars++;
                    totalformat += 500;
                    int row = content[i] + 256 - 0xA1;
                    int column = content[i + 1] + 256 - 0xA1;
                    // Unlike the GB2312 scorer there is no row-range bonus
                    // here (the previous version added 0 in that case).
                    if (EUC_KRformat[row][column] != 0) {
                        krformat += EUC_KRformat[row][column];
                    }
                }
                i++;
            }
        }
        float rangeval = 50 * ((float) krchars / (float) dbchars);
        float formatval = 50 * ((float) krformat / (float) totalformat);
        return (int) (rangeval + formatval);
    }

    /**
     * Scores the content as EUC-JP.
     *
     * @param content bytes to analyse
     * @return the score
     */
    private int euc_jpprobability(byte[] content) {
        int dbchars = 1, jpchars = 1;
        long jpformat = 0, totalformat = 1;
        int rawtextlen = content.length;
        for (int i = 0; i < rawtextlen - 1; i++) {
            if (content[i] < 0) {
                dbchars++;
                // Lead and trail byte both 0xA1-0xFE.
                if ((byte) 0xA1 <= content[i] && content[i] <= (byte) 0xFE
                        && (byte) 0xA1 <= content[i + 1]
                        && content[i + 1] <= (byte) 0xFE) {
                    jpchars++;
                    totalformat += 500;
                    int row = content[i] + 256 - 0xA1;
                    int column = content[i + 1] + 256 - 0xA1;
                    // No row-range bonus (the previous version added 0).
                    if (JPformat[row][column] != 0) {
                        jpformat += JPformat[row][column];
                    }
                }
                i++;
            }
        }
        float rangeval = 50 * ((float) jpchars / (float) dbchars);
        float formatval = 50 * ((float) jpformat / (float) totalformat);
        return (int) (rangeval + formatval);
    }

    /**
     * Scores the content as Shift-JIS. Unlike the other double-byte scorers,
     * the second byte is skipped only when the pair matched. The final score
     * is reduced by one.
     *
     * @param content bytes to analyse
     * @return the score
     */
    private int sjisprobability(byte[] content) {
        int dbchars = 1, jpchars = 1;
        long jpformat = 0, totalformat = 1;
        int rawtextlen = content.length;
        for (int i = 0; i < rawtextlen - 1; i++) {
            if (content[i] < 0) {
                dbchars++;
                // Lead byte 0x81-0x9F or 0xE0-0xEF,
                // trail byte 0x40-0x7E or 0x80-0xFC.
                if ((((byte) 0x81 <= content[i] && content[i] <= (byte) 0x9F)
                        || ((byte) 0xE0 <= content[i] && content[i] <= (byte) 0xEF))
                        && (((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7E)
                                || ((byte) 0x80 <= content[i + 1] && content[i + 1] <= (byte) 0xFC))) {
                    jpchars++;
                    totalformat += 500;
                    int row = content[i] + 256;
                    // NOTE(review): for a trail byte in 0x40-0x7E this is
                    // byte + 256 (0x140-0x17E), not the unsigned byte value,
                    // so the "< 0x9f" branch below is taken only for trail
                    // bytes 0x80-0x9E. Left unchanged; confirm the intent.
                    int column = content[i + 1] + 256;
                    int adjust;
                    if (column < 0x9f) {
                        adjust = 1;
                        if (column > 0x7f) {
                            column -= 0x20;
                        } else {
                            column -= 0x19;
                        }
                    } else {
                        adjust = 0;
                        column -= 0x7e;
                    }
                    if (row < 0xa0) {
                        row = ((row - 0x70) << 1) - adjust;
                    } else {
                        row = ((row - 0xb0) << 1) - adjust;
                    }
                    row -= 0x20;
                    // NOTE(review): the column computed above is discarded
                    // and replaced by the constant 0x20. Possibly meant to
                    // be "column -= 0x20"; left unchanged, confirm.
                    column = 0x20;
                    if (row < JPformat.length && column < JPformat[row].length
                            && JPformat[row][column] != 0) {
                        jpformat += JPformat[row][column];
                    }
                    i++;
                }
                // A lone byte in 0xA1-0xDF is not counted as a match.
            }
        }
        float rangeval = 50 * ((float) jpchars / (float) dbchars);
        float formatval = 50 * ((float) jpformat / (float) totalformat);
        return (int) (rangeval + formatval) - 1;
    }
}
package com.poi.examples;
/**
 * Small command-line demo: runs the detector over a few sample strings and
 * web sites and prints the detected encoding name for each. The sample
 * strings are converted with the platform default charset, and the site
 * checks need network access.
 */
public class EncodingTest {

    /** Prints a label immediately followed by the detected encoding name. */
    private static void show(String label, String detected) {
        System.out.println(label + detected);
    }

    public static void main(String[] args) {
        final ParseEncoding detector = new ParseEncoding();

        // Mainland China samples
        System.out.println("中国大陆:");
        show("测试字符串,编码格式=", detector.getEncoding("百度".getBytes()));
        show("测试站点,编码格式=", detector.getEncoding("http://www.baidu.com"));
        System.out.println();

        // Taiwan samples
        System.out.println("中国台湾:");
        show("测试字符串,编码格式=", detector.getEncoding("い地チ瓣".getBytes()));
        show("测试站点,编码格式=", detector.getEncoding("http://tw.yahoo.com/"));
        show("测试站点(繁体字,UTF编码),编码格式=",
                detector.getEncoding("http://www.javaworld.com.tw/jute"));
        System.out.println();

        // Japan samples
        System.out.println("日本:");
        show("测试字符串,编码格式=", detector.getEncoding("その機能".getBytes()));
        show("测试站点,编码格式=", detector.getEncoding("http://www.4gamer.net"));
        System.out.println();

        // Korean site sample
        System.out.println("自称蚩尤后代那群……:");
        show("测试站点,编码格式=", detector.getEncoding("http://www.easyjava.co.kr/"));
    }
}
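// Article title: "java字符串编码类型获取" (Getting the encoding type of a Java string)
// Blog footer: latest recommended article published on 2023-02-14 16:44:17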