java字符串编码类型获取

package com.poi.examples;

/** *//**
 * <p>
 * Title: LoonFramework
 * </p>
 * <p>
 * Description:编码基本类型集合
 * </p>
 * <p>
 * Copyright: Copyright (c) 2008
 * </p>
 * <p>
 * Company: LoonFramework
 * </p>
 * <p>
 * License: http://www.apache.org/licenses/LICENSE-2.0
 * </p>
 * 
 * @author chenpeng
 * @email:ceponline@yahoo.com.cn
 * @version 0.1
 */
public class Encoding {

    // 支持的字符格式
    public static int GB2312 = 0;

    public static int GBK = 1;
    
    public static int BIG5 = 2;

    public static int UTF8 = 3;

    public static int UNICODE = 4;

    public static int EUC_KR = 5;

    public static int SJIS = 6;

    public static int EUC_JP = 7;

    public static int ASCII = 8;

    public static int UNKNOWN = 9;

    public static int TOTALT = 10;

    public final static int SIMP = 0;

    public final static int TRAD = 1;

    // 解析名称用
    public static String[] javaname;

    // 编码用
    public static String[] nicename;

    // 应用于html中的字符集
    public static String[] htmlname;

    public Encoding() {
        javaname = new String[TOTALT];
        nicename = new String[TOTALT];
        htmlname = new String[TOTALT];
        javaname[GB2312] = "GB2312";
        javaname[GBK] = "GBK";
        javaname[BIG5] = "BIG5";
        javaname[UTF8] = "UTF8";
        javaname[UNICODE] = "Unicode";
        javaname[EUC_KR] = "EUC_KR";
        javaname[SJIS] = "SJIS";
        javaname[EUC_JP] = "EUC_JP";
        javaname[ASCII] = "ASCII";
        javaname[UNKNOWN] = "ISO8859_1";

        // 分配编码名称
        htmlname[GB2312] = "GB2312";
        htmlname[GBK] = "GBK";
        htmlname[BIG5] = "BIG5";
        htmlname[UTF8] = "UTF-8";
        htmlname[UNICODE] = "UTF-16";
        htmlname[EUC_KR] = "EUC-KR";
        htmlname[SJIS] = "Shift_JIS";
        htmlname[EUC_JP] = "EUC-JP";
        htmlname[ASCII] = "ASCII";
        htmlname[UNKNOWN] = "ISO8859-1";

        // 分配可读名称
        nicename[GB2312] = "GB-2312";
        nicename[GBK] = "GBK";
        nicename[BIG5] = "Big5";
        nicename[UTF8] = "UTF-8";
        nicename[UNICODE] = "Unicode";
        nicename[EUC_KR] = "EUC-KR";
        nicename[SJIS] = "Shift-JIS";
        nicename[EUC_JP] = "EUC-JP";
        nicename[ASCII] = "ASCII";
        nicename[UNKNOWN] = "UNKNOWN";

    }

    public String toEncoding(final int type) {
        return (javaname[type] + "," + nicename[type] + "," + htmlname[type])
                .intern();
    }


}

package com.poi.examples;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

public class ParseEncoding extends Encoding {

	int[][] GB2312format = new int[94][94];
	int[][] GBKformat = new int[126][191];
	int[][] Big5format = new int[94][158];
	int[][] EUC_KRformat = new int[94][94];
	int[][] JPformat = new int[94][94];

	public ParseEncoding() {
		super();
		// 初始化编码格式
		// init();
	}

	public String getEncoding(final String path) {
		return check(getEncodeValue(path));
	}

	public String getEncoding(final InputStream in) {
		return check(getEncodeValue(in));
	}

	public String getEncoding(final byte[] buffer) {
		return check(getEncodeValue(buffer));
	}

	public String getEncoding(final URL url) {
		return check(getEncodeValue(url));
	}

	private String check(final int result) {
		if (result == -1) {
			return nicename[UNKNOWN];
		}
		return nicename[result];
	}

	/** */
	/**
	 * 解析指定字符串路径编码所用格式
	 * 
	 * @param path
	 * @return
	 */
	private int getEncodeValue(String path) {
		int express = UNKNOWN;
		if (path.startsWith("http://")) {
			try {
				express = getEncodeValue(new URL(path));
			} catch (MalformedURLException e) {
				express = -1;
			}
		} else {
			express = getEncodeValue(new File(path));
		}
		return express;
	}

	/** */
	/**
	 * 
	 * 解析指定InputStream所用编码,返回或然率最高的编码类型数值
	 * 
	 * @param in
	 * @return
	 */
	public int getEncodeValue(InputStream in) {
		byte[] rawtext = new byte[8192];
		int bytesread = 0, byteoffset = 0;
		int express = UNKNOWN;
		InputStream stream = in;
		try {
			while ((bytesread = stream.read(rawtext, byteoffset, rawtext.length
					- byteoffset)) > 0) {
				byteoffset += bytesread;
			}
			;
			stream.close();
			express = getEncodeValue(rawtext);
		} catch (Exception e) {
			express = -1;
		}
		return express;
	}

	/** */
	/**
	 * 解析指定url下数据所用编码,返回或然率最高的编码类型数值
	 * 
	 * @param url
	 * @return
	 */
	public int getEncodeValue(URL url) {

		InputStream stream;
		try {
			stream = url.openStream();
		} catch (IOException e) {
			stream = null;
		}

		return getEncodeValue(stream);
	}

	/** */
	/**
	 * 解析指定file所用编码,返回或然率最高的编码类型数值
	 * 
	 * @param file
	 * @return
	 */
	public int getEncodeValue(File file) {
		byte[] buffer;
		try {
			buffer = read(new FileInputStream(file));
		} catch (FileNotFoundException e) {
			buffer = null;
		}
		return getEncodeValue(buffer);
	}

	/** */
	/**
	 * 将inputstream转为byte[]
	 * 
	 * @param inputStream
	 * @return
	 */
	private final byte[] read(final InputStream inputStream) {
		byte[] arrayByte = null;
		ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
		byte[] bytes = new byte[8192];
		try {
			bytes = new byte[inputStream.available()];
			int read;
			while ((read = inputStream.read(bytes)) >= 0) {
				byteArrayOutputStream.write(bytes, 0, read);
			}
			arrayByte = byteArrayOutputStream.toByteArray();
		} catch (IOException e) {
			return null;
		}
		return arrayByte;
	}

	/** */
	/**
	 * 解析指定byte[]所用编码,返回或然率最高的数值类型
	 * 
	 * @param content
	 * @return
	 */
	public int getEncodeValue(byte[] content) {
		if (content == null)
			return -1;
		int[] scores;
		int index, maxscore = 0;
		int encoding = UNKNOWN;
		scores = new int[TOTALT];
		// 分配或然率
		scores[GB2312] = gb2312probability(content);
		scores[GBK] = gbkprobability(content);
		scores[BIG5] = big5probability(content);
		scores[UTF8] = utf8probability(content);
		scores[UNICODE] = utf16probability(content);
		scores[EUC_KR] = euc_krprobability(content);
		scores[ASCII] = asciiprobability(content);
		scores[SJIS] = sjisprobability(content);
		scores[EUC_JP] = euc_jpprobability(content);
		scores[UNKNOWN] = 0;

		// 概率比较
		for (index = 0; index < TOTALT; index++) {
			if (scores[index] > maxscore) {
				// 索引
				encoding = index;
				// 最大几率
				maxscore = scores[index];
			}
		}
		// 返回或然率大于50%的数据
		if (maxscore <= 50) {
			encoding = UNKNOWN;
		}
		return encoding;
	}

	/** */
	/**
	 * gb2312数据或然率计算
	 * 
	 * @param content
	 * @return
	 */
	private int gb2312probability(byte[] content) {
		int i, rawtextlen = 0;

		int dbchars = 1, gbchars = 1;
		long gbformat = 0, totalformat = 1;
		float rangeval = 0, formatval = 0;
		int row, column;

		// 检查是否在亚洲汉字范围内
		rawtextlen = content.length;
		for (i = 0; i < rawtextlen - 1; i++) {
			if (content[i] >= 0) {
			} else {
				dbchars++;
				// 汉字GB码由两个字节组成,每个字节的范围是0xA1 ~ 0xFE
				if ((byte) 0xA1 <= content[i] && content[i] <= (byte) 0xF7
						&& (byte) 0xA1 <= content[i + 1]
						&& content[i + 1] <= (byte) 0xFE) {
					gbchars++;
					totalformat += 500;
					row = content[i] + 256 - 0xA1;
					column = content[i + 1] + 256 - 0xA1;
					if (GB2312format[row][column] != 0) {
						gbformat += GB2312format[row][column];
					} else if (15 <= row && row < 55) {
						// 在gb编码范围
						gbformat += 200;
					}

				}
				i++;
			}
		}
		rangeval = 50 * ((float) gbchars / (float) dbchars);
		formatval = 50 * ((float) gbformat / (float) totalformat);

		return (int) (rangeval + formatval);
	}

	/** */
	/**
	 * gb2312或然率计算
	 * 
	 * @param content
	 * @return
	 */
	private int gbkprobability(byte[] content) {
		int i, rawtextlen = 0;

		int dbchars = 1, gbchars = 1;
		long gbformat = 0, totalformat = 1;
		float rangeval = 0, formatval = 0;
		int row, column;
		rawtextlen = content.length;
		for (i = 0; i < rawtextlen - 1; i++) {
			if (content[i] >= 0) {
			} else {
				dbchars++;
				if ((byte) 0xA1 <= content[i] && content[i] <= (byte) 0xF7
						&& // gb范围
						(byte) 0xA1 <= content[i + 1]
						&& content[i + 1] <= (byte) 0xFE) {
					gbchars++;
					totalformat += 500;
					row = content[i] + 256 - 0xA1;
					column = content[i + 1] + 256 - 0xA1;
					if (GB2312format[row][column] != 0) {
						gbformat += GB2312format[row][column];
					} else if (15 <= row && row < 55) {
						gbformat += 200;
					}

				} else if ((byte) 0x81 <= content[i]
						&& content[i] <= (byte) 0xFE && // gb扩展区域
						(((byte) 0x80 <= content[i + 1] && content[i + 1] <= (byte) 0xFE) || ((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7E))) {
					gbchars++;
					totalformat += 500;
					row = content[i] + 256 - 0x81;
					if (0x40 <= content[i + 1] && content[i + 1] <= 0x7E) {
						column = content[i + 1] - 0x40;
					} else {
						column = content[i + 1] + 256 - 0x40;
					}
					if (GBKformat[row][column] != 0) {
						gbformat += GBKformat[row][column];
					}
				}
				i++;
			}
		}
		rangeval = 50 * ((float) gbchars / (float) dbchars);
		formatval = 50 * ((float) gbformat / (float) totalformat);
		return (int) (rangeval + formatval) - 1;
	}

	/** */
	/**
	 * 解析为big5的或然率
	 * 
	 * @param content
	 * @return
	 */
	private int big5probability(byte[] content) {
		int i, rawtextlen = 0;
		int dbchars = 1, bfchars = 1;
		float rangeval = 0, formatval = 0;
		long bfformat = 0, totalformat = 1;
		int row, column;
		rawtextlen = content.length;
		for (i = 0; i < rawtextlen - 1; i++) {
			if (content[i] >= 0) {
			} else {
				dbchars++;
				if ((byte) 0xA1 <= content[i]
						&& content[i] <= (byte) 0xF9
						&& (((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7E) || ((byte) 0xA1 <= content[i + 1] && content[i + 1] <= (byte) 0xFE))) {
					bfchars++;
					totalformat += 500;
					row = content[i] + 256 - 0xA1;
					if (0x40 <= content[i + 1] && content[i + 1] <= 0x7E) {
						column = content[i + 1] - 0x40;
					} else {
						column = content[i + 1] + 256 - 0x61;
					}
					if (Big5format[row][column] != 0) {
						bfformat += Big5format[row][column];
					} else if (3 <= row && row <= 37) {
						bfformat += 200;
					}
				}
				i++;
			}
		}
		rangeval = 50 * ((float) bfchars / (float) dbchars);
		formatval = 50 * ((float) bfformat / (float) totalformat);

		return (int) (rangeval + formatval);
	}

	/** */
	/**
	 * 在utf-8中的或然率
	 * 
	 * @param content
	 * @return
	 */
	private int utf8probability(byte[] content) {
		int score = 0;
		int i, rawtextlen = 0;
		int goodbytes = 0, asciibytes = 0;
		// 检查是否为汉字可接受范围
		rawtextlen = content.length;
		for (i = 0; i < rawtextlen; i++) {
			if ((content[i] & (byte) 0x7F) == content[i]) {
				asciibytes++;
			} else if (-64 <= content[i] && content[i] <= -33
					&& i + 1 < rawtextlen && -128 <= content[i + 1]
					&& content[i + 1] <= -65) {
				goodbytes += 2;
				i++;
			} else if (-32 <= content[i] && content[i] <= -17
					&& i + 2 < rawtextlen && -128 <= content[i + 1]
					&& content[i + 1] <= -65 && -128 <= content[i + 2]
					&& content[i + 2] <= -65) {
				goodbytes += 3;
				i += 2;
			}
		}

		if (asciibytes == rawtextlen) {
			return 0;
		}

		score = (int) (100 * ((float) goodbytes / (float) (rawtextlen - asciibytes)));
		// 如果不高于98则减少到零
		if (score > 98) {
			return score;
		} else if (score > 95 && goodbytes > 30) {
			return score;
		} else {
			return 0;
		}

	}

	/** */
	/**
	 * 检查为utf-16的或然率
	 * 
	 * @param content
	 * @return
	 */
	private int utf16probability(byte[] content) {

		if (content.length > 1
				&& ((byte) 0xFE == content[0] && (byte) 0xFF == content[1])
				|| ((byte) 0xFF == content[0] && (byte) 0xFE == content[1])) {
			return 100;
		}
		return 0;
	}

	/** */
	/**
	 * 检查为ascii的或然率
	 * 
	 * @param content
	 * @return
	 */
	private int asciiprobability(byte[] content) {
		int score = 75;
		int i, rawtextlen;

		rawtextlen = content.length;

		for (i = 0; i < rawtextlen; i++) {
			if (content[i] < 0) {
				score = score - 5;
			} else if (content[i] == (byte) 0x1B) { // ESC (used by ISO 2022)
				score = score - 5;
			}
			if (score <= 0) {
				return 0;
			}
		}
		return score;
	}

	/** */
	/**
	 * 检查为euc_kr的或然率
	 * 
	 * @param content
	 * @return
	 */
	private int euc_krprobability(byte[] content) {
		int i, rawtextlen = 0;

		int dbchars = 1, krchars = 1;
		long krformat = 0, totalformat = 1;
		float rangeval = 0, formatval = 0;
		int row, column;
		rawtextlen = content.length;
		for (i = 0; i < rawtextlen - 1; i++) {
			if (content[i] >= 0) {
			} else {
				dbchars++;
				if ((byte) 0xA1 <= content[i] && content[i] <= (byte) 0xFE
						&& (byte) 0xA1 <= content[i + 1]
						&& content[i + 1] <= (byte) 0xFE) {
					krchars++;
					totalformat += 500;
					row = content[i] + 256 - 0xA1;
					column = content[i + 1] + 256 - 0xA1;
					if (EUC_KRformat[row][column] != 0) {
						krformat += EUC_KRformat[row][column];
					} else if (15 <= row && row < 55) {
						krformat += 0;
					}

				}
				i++;
			}
		}
		rangeval = 50 * ((float) krchars / (float) dbchars);
		formatval = 50 * ((float) krformat / (float) totalformat);

		return (int) (rangeval + formatval);
	}

	private int euc_jpprobability(byte[] content) {
		int i, rawtextlen = 0;

		int dbchars = 1, jpchars = 1;
		long jpformat = 0, totalformat = 1;
		float rangeval = 0, formatval = 0;
		int row, column;

		rawtextlen = content.length;
		for (i = 0; i < rawtextlen - 1; i++) {
			if (content[i] >= 0) {
			} else {
				dbchars++;
				if ((byte) 0xA1 <= content[i] && content[i] <= (byte) 0xFE
						&& (byte) 0xA1 <= content[i + 1]
						&& content[i + 1] <= (byte) 0xFE) {
					jpchars++;
					totalformat += 500;
					row = content[i] + 256 - 0xA1;
					column = content[i + 1] + 256 - 0xA1;
					if (JPformat[row][column] != 0) {
						jpformat += JPformat[row][column];
					} else if (15 <= row && row < 55) {
						jpformat += 0;
					}

				}
				i++;
			}
		}
		rangeval = 50 * ((float) jpchars / (float) dbchars);
		formatval = 50 * ((float) jpformat / (float) totalformat);

		return (int) (rangeval + formatval);
	}

	private int sjisprobability(byte[] content) {
		int i, rawtextlen = 0;

		int dbchars = 1, jpchars = 1;
		long jpformat = 0, totalformat = 1;
		float rangeval = 0, formatval = 0;
		int row, column, adjust;

		rawtextlen = content.length;
		for (i = 0; i < rawtextlen - 1; i++) {
			if (content[i] >= 0) {
			} else {
				dbchars++;
				if (i + 1 < content.length
						&& (((byte) 0x81 <= content[i] && content[i] <= (byte) 0x9F) || ((byte) 0xE0 <= content[i] && content[i] <= (byte) 0xEF))
						&& (((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7E) || ((byte) 0x80 <= content[i + 1] && content[i + 1] <= (byte) 0xFC))) {
					jpchars++;
					totalformat += 500;
					row = content[i] + 256;
					column = content[i + 1] + 256;
					if (column < 0x9f) {
						adjust = 1;
						if (column > 0x7f) {
							column -= 0x20;
						} else {
							column -= 0x19;
						}
					} else {
						adjust = 0;
						column -= 0x7e;
					}
					if (row < 0xa0) {
						row = ((row - 0x70) << 1) - adjust;
					} else {
						row = ((row - 0xb0) << 1) - adjust;
					}

					row -= 0x20;
					column = 0x20;
					if (row < JPformat.length && column < JPformat[row].length
							&& JPformat[row][column] != 0) {
						jpformat += JPformat[row][column];
					}
					i++;
				} else if ((byte) 0xA1 <= content[i]
						&& content[i] <= (byte) 0xDF) {
				}

			}
		}
		rangeval = 50 * ((float) jpchars / (float) dbchars);
		formatval = 50 * ((float) jpformat / (float) totalformat);

		return (int) (rangeval + formatval) - 1;
	}

}


package com.poi.examples;

public class EncodingTest{
    public static void main(String argc[])  {
        ParseEncoding parse;

        parse = new ParseEncoding();
        
         System.out.println("中国大陆:");
         System.out.println("测试字符串,编码格式="+parse.getEncoding("百度".getBytes()));
         System.out.println("测试站点,编码格式="+parse.getEncoding("http://www.baidu.com"));
         System.out.println();
         System.out.println("中国台湾:");
         System.out.println("测试字符串,编码格式="+parse.getEncoding("い地チ瓣".getBytes()));
         System.out.println("测试站点,编码格式="+parse.getEncoding("http://tw.yahoo.com/"));
         System.out.println("测试站点(繁体字,UTF编码),编码格式="+parse.getEncoding("http://www.javaworld.com.tw/jute"));
         System.out.println();
         System.out.println("日本:");
         System.out.println("测试字符串,编码格式="+parse.getEncoding("その機能".getBytes()));
         System.out.println("测试站点,编码格式="+parse.getEncoding("http://www.4gamer.net"));
         System.out.println();
         System.out.println("自称蚩尤后代那群……:");
         System.out.println("测试站点,编码格式="+parse.getEncoding("http://www.easyjava.co.kr/"));
        
    }
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值