朋友让我帮他写个gb2312->utf-8的字符转换程序,找了半天没有在网上找到合适的,于是自己动手写了一个,呵呵。把它贴在这里,免得以后忘记了 ^_^
实现思路大致如下:
- 取得一个汉字的Unicode码
- 把Unicode码分解为两个16进制数据字符串(丢弃前两个字节)
- 把这两个16进制数据字符串转换成二进制数据字符串
- 把二进制数据字符串分解为三个串,第一个串为4(0~4)个位,在高位加上标记位“1110”,第二(4~10)、三个(10~16)串均为6个位,分别在高位加上“10”标记位
- 把这三个二进制串分别转换为10进制数据并赋值给字节型数组
- 根据这个字节型数组构造UTF-8字符
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.UnsupportedEncodingException;
/**
 * Demonstrates encoding Java strings to UTF-8 by hand, one Unicode code point at a time.
 *
 * 2007-8-10 jyin at gomez dot com
 */
public class CharsetConvertor {

    /**
     * Converts a sample string with {@link #gbToUtf8(String)} and writes its UTF-8 bytes to
     * {@code D:/test.txt}. Any failure is reported by printing the stack trace.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String str = "This is a test for *中网!@#$。,?";
        try {
            File f = new File("D:/test.txt");
            FileOutputStream fio = new FileOutputStream(f);
            try {
                String s = gbToUtf8(str);
                fio.write(s.getBytes("UTF-8"));
            } finally {
                // Close even when the conversion or the write fails.
                fio.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Round-trips every non-ASCII code point of {@code str} through a hand-built UTF-8 byte
     * sequence and returns the decoded result.
     *
     * <p>The encoding works directly on code point values, so it does not depend on the byte
     * order produced by any charset encoder. Code points below U+0080 and unpaired surrogates
     * (which have no UTF-8 representation) are copied through unchanged.
     *
     * @param str the text to convert; must not be {@code null}
     * @return the text rebuilt from the generated UTF-8 byte sequences
     * @throws UnsupportedEncodingException if the runtime does not provide the UTF-8 charset
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String gbToUtf8(String str) throws UnsupportedEncodingException {
        StringBuilder sb = new StringBuilder(str.length());
        int i = 0;
        while (i < str.length()) {
            int codePoint = str.codePointAt(i);
            boolean unpairedSurrogate =
                    codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE;
            if (codePoint < 0x80 || unpairedSurrogate) {
                sb.append(str.charAt(i));
            } else {
                sb.append(new String(encodeCodePoint(codePoint), "UTF-8"));
            }
            // A supplementary code point occupies two chars (a surrogate pair).
            i += Character.charCount(codePoint);
        }
        return sb.toString();
    }

    /**
     * Builds the UTF-8 byte sequence for a single code point.
     *
     * <p>The lead byte carries the length marker ({@code 110}, {@code 1110} or {@code 11110})
     * followed by the highest payload bits; each continuation byte carries the marker
     * {@code 10} followed by the next six payload bits.
     */
    private static byte[] encodeCodePoint(int codePoint) {
        if (codePoint < 0x80) {
            return new byte[] {(byte) codePoint};
        }
        if (codePoint < 0x800) {
            return new byte[] {
                (byte) (0xC0 | (codePoint >> 6)),
                (byte) (0x80 | (codePoint & 0x3F))
            };
        }
        if (codePoint < 0x10000) {
            return new byte[] {
                (byte) (0xE0 | (codePoint >> 12)),
                (byte) (0x80 | ((codePoint >> 6) & 0x3F)),
                (byte) (0x80 | (codePoint & 0x3F))
            };
        }
        return new byte[] {
            (byte) (0xF0 | (codePoint >> 18)),
            (byte) (0x80 | ((codePoint >> 12) & 0x3F)),
            (byte) (0x80 | ((codePoint >> 6) & 0x3F)),
            (byte) (0x80 | (codePoint & 0x3F))
        };
    }
}
参见java.util.Properties。
用java程序将GBK字符转成UTF-8编码格式 (转)
原地址:http://blog.csdn.net/wangjichen_1/archive/2006/08/04/1019830.aspx
上网查找一下 用java程序将GBK字符转成UTF-8编码格式的代码,发现全是C++ 写的,而且很烦琐,
现在自己写了一个java的,以供参考。
package com.lang.string;
/**
 * Encodes Java strings as UTF-8 bytes by hand, without using a charset encoder.
 */
public class ConverFromGBKToUTF8 {

    /**
     * Encodes a short Chinese sample string with {@link #gbk2utf8(String)}, decodes the bytes
     * as UTF-8 and prints the result. Any failure is reported by printing the stack trace.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            ConverFromGBKToUTF8 convert = new ConverFromGBKToUTF8();
            // Sample text U+4E2D U+6587; escapes keep this source file ASCII-only.
            String chenese = "\u4e2d\u6587";
            byte[] fullByte = convert.gbk2utf8(chenese);
            String fullStr = new String(fullByte, "UTF-8");
            System.out.println("string from GBK to UTF-8 byte: " + fullStr);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Encodes {@code chenese} as UTF-8.
     *
     * <p>Each code point is written with the shortest valid form: one byte below U+0080, two
     * bytes below U+0800, three bytes for the rest of the Basic Multilingual Plane and four
     * bytes for supplementary code points (surrogate pairs). An unpaired surrogate has no
     * UTF-8 representation and is written as a single {@code '?'} byte.
     *
     * @param chenese the text to encode; must not be {@code null}
     * @return a new array holding exactly the encoded bytes (empty for an empty string)
     * @throws NullPointerException if {@code chenese} is {@code null}
     */
    public byte[] gbk2utf8(String chenese) {
        int length = chenese.length();
        // Worst case is three bytes per char; a surrogate pair (two chars) needs only four.
        byte[] buffer = new byte[3 * length];
        int used = 0;
        int i = 0;
        while (i < length) {
            int codePoint = chenese.codePointAt(i);
            i += Character.charCount(codePoint);
            if (codePoint < 0x80) {
                buffer[used++] = (byte) codePoint;
            } else if (codePoint < 0x800) {
                buffer[used++] = (byte) (0xC0 | (codePoint >> 6));
                buffer[used++] = (byte) (0x80 | (codePoint & 0x3F));
            } else if (codePoint >= Character.MIN_SURROGATE
                    && codePoint <= Character.MAX_SURROGATE) {
                // codePointAt returns a surrogate value only when it is unpaired.
                buffer[used++] = (byte) '?';
            } else if (codePoint < 0x10000) {
                buffer[used++] = (byte) (0xE0 | (codePoint >> 12));
                buffer[used++] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
                buffer[used++] = (byte) (0x80 | (codePoint & 0x3F));
            } else {
                buffer[used++] = (byte) (0xF0 | (codePoint >> 18));
                buffer[used++] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
                buffer[used++] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
                buffer[used++] = (byte) (0x80 | (codePoint & 0x3F));
            }
        }
        byte[] fullByte = new byte[used];
        System.arraycopy(buffer, 0, fullByte, 0, used);
        return fullByte;
    }
}
UTF-8的编码原理和特性:
U+0000~U+007F 0 _ _ _ _ _ _ _ (7bits)
U+0080~U+07FF 1 1 0_ _ _ _ _ 1 0_ _ _ _ _ _ (11bits)
U+0800~U+FFFF 1 1 1 0 _ _ _ _ 1 0 _ _ _ _ _ _ 1 0 _ _ _ _ _ _ (16bits)