Unicode 最常见的编码方式之一是 UTF-16(UTF:UCS Transformation Format)。后来为了让双字节的 Unicode 能够在现存的处理单字节的系统上正确传输,出现了 UTF-8。UTF-8 以 8 bit 为编码单元:ASCII 字符不作变换,其他字符做变长编码,每个字符占 1-4 个字节。其中英文字母用一个字节表示,常用中文字符使用三个字节。
Java 内部采用 Unicode(UTF-16)编码,char 类型用 2 个字节(16 位)来表示一个字符;基本多文种平面(BMP)之外的字符需要用两个 char(代理对)来表示。
/**
 * Small demo that prints the byte sequences produced when the single Chinese
 * character U+6C49 is encoded with several charsets, plus the raw 16-bit value
 * of the corresponding {@code char}.
 */
public class Unicode {

    /** Upper-case hexadecimal digits, indexed by nibble value (0-15). */
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    /**
     * Encodes the sample character with each charset and prints the byte count
     * and hex dump of every result, one line per charset.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String str = "汉";
        char x = '汉';
        try {
            // No charset argument: uses the platform default charset, so this
            // line's output depends on the machine/JVM configuration.
            report("默认 bytes", "bytes", str.getBytes());
            report("unicode_byte", "unicode_byte", str.getBytes("unicode"));
            report("utf16_byte", "utf16_byte", str.getBytes("utf-16"));
            report("utf8_byte", "utf8_byte", str.getBytes("utf-8"));
            report("gb2312_byte", "gb2312_byte", str.getBytes("GB2312"));
            // Raw UTF-16 code unit of the char, high byte first, no byte-order mark.
            report("bytes1 len", "bytes1", charToBytes(x));
        } catch (UnsupportedEncodingException e) {
            // Stop here instead of continuing with missing results; the original
            // flow went on to dereference null arrays after a failure.
            System.err.println("Unsupported charset: " + e.getMessage());
        }
    }

    /**
     * Prints one result line in the form
     * {@code <lengthLabel> = <byte count>,<hexLabel> = <hex dump>}.
     */
    private static void report(String lengthLabel, String hexLabel, byte[] data) {
        System.out.println(lengthLabel + " = " + data.length + "," + hexLabel + " = " + bytes2HexString(data));
    }

    /**
     * Splits a {@code char} into its two bytes, most significant byte first.
     *
     * @param c the character to split
     * @return a new 2-element array: {@code [high byte, low byte]}
     */
    public static byte[] charToBytes(char c) {
        return new byte[] {(byte) (c >>> 8), (byte) c};
    }

    /**
     * Formats a byte array as an upper-case hexadecimal string prefixed with
     * {@code "0X"}. Every byte is rendered as exactly two hex digits, so values
     * below 0x10 keep their leading zero (e.g. {@code {0x0A}} gives {@code "0X0A"}).
     *
     * @param b the bytes to format; must not be {@code null}
     * @return {@code "0X"} followed by two hex digits per byte ({@code "0X"} for an empty array)
     */
    public static String bytes2HexString(byte[] b) {
        StringBuilder hex = new StringBuilder(2 + 2 * b.length);
        hex.append("0X");
        for (byte value : b) {
            // Mask after shifting so negative bytes do not sign-extend into the index.
            hex.append(HEX_DIGITS[(value >> 4) & 0x0F]);
            hex.append(HEX_DIGITS[value & 0x0F]);
        }
        return hex.toString();
    }
}
运行的结果为(默认 bytes 一行取决于平台默认编码,此处与 GB2312 的结果相同):
默认 bytes = 2,bytes = 0XBABA
unicode_byte = 4,unicode_byte = 0XFEFF6C49
utf16_byte = 4,utf16_byte = 0XFEFF6C49
utf8_byte = 3,utf8_byte = 0XE6B189
gb2312_byte = 2,gb2312_byte = 0XBABA
bytes1 len = 2,bytes1 = 0X6C49