关于字符编码以及如何在程序中处理unicode,本想写篇文章总结学习一下,但发现前人已经论述得很完善了,不再重复,可以参考:http://www.regexlab.com/zh/encoding.htm。
在C++中,字符有char和wchar_t之分,相应的,字符串有string和wstring两种。C#中,string是一个unicode(UTF-16)字符串,相应的,每个char都是16位。
源文件中出现的字符串常量,都会被自动转换为unicode编码(utf16),利用System.Text.Encoding,可以实现不同编码间的转换。
using System;
using System.Text;

namespace test
{
    /// <summary>
    /// Encoding demo: encodes one string as UTF-16, UTF-8, GBK and Big5, dumps the
    /// resulting bytes, decodes every byte array back into a string, and dumps the
    /// UTF-16 code units of each decoded string.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Runs the demo and waits for a key press before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string u16s = "忘記了啊abc";

            // NOTE(review): on .NET Core / .NET 5+ the "gbk" and "big5" code pages are
            // presumably only available after registering CodePagesEncodingProvider;
            // on .NET Framework they are built in. Confirm for the target runtime.
            Encoding utf8 = Encoding.UTF8;
            Encoding utf16 = Encoding.Unicode;
            Encoding gb = Encoding.GetEncoding("gbk");
            Encoding b5 = Encoding.GetEncoding("big5");

            // Encode once as UTF-16, then transcode those bytes into the other encodings.
            byte[] u16bytes = utf16.GetBytes(u16s);
            byte[] u8bytes = Encoding.Convert(utf16, utf8, u16bytes);
            byte[] gbytes = Encoding.Convert(utf16, gb, u16bytes);
            byte[] bbytes = Encoding.Convert(utf16, b5, u16bytes);

            // Raw bytes produced by each encoding.
            Console.WriteLine("unicode: " + ToHex(u16bytes));
            Console.WriteLine("utf8: " + ToHex(u8bytes));
            Console.WriteLine("gbk: " + ToHex(gbytes));
            Console.WriteLine("big5: " + ToHex(bbytes));

            // Decode each byte array with the encoding that produced it.
            string u8s = utf8.GetString(u8bytes);
            string gs = gb.GetString(gbytes);
            string bs = b5.GetString(bbytes);

            // Each line reports the length of the string it prints (the utf8 line
            // previously reported u16s.Length by mistake).
            Console.WriteLine("unicode: " + u16s + " " + u16s.Length);
            Console.WriteLine("utf8: " + u8s + " " + u8s.Length);
            Console.WriteLine("gbk: " + gs + " " + gs.Length);
            Console.WriteLine("big5: " + bs + " " + bs.Length);

            // UTF-16 code units held by each string.
            Console.WriteLine("unicode: " + ToHex(u16s));
            Console.WriteLine("utf8: " + ToHex(u8s));
            // The label reads "gb2312" although gs was decoded with the "gbk" encoding;
            // it is kept unchanged so the program's output stays the same.
            Console.WriteLine("gb2312: " + ToHex(gs));
            Console.WriteLine("big5: " + ToHex(bs));

            Console.ReadKey();
        }

        /// <summary>
        /// Formats every byte as lowercase hexadecimal (no zero padding), each value
        /// followed by a single space.
        /// </summary>
        /// <param name="data">Bytes to format.</param>
        /// <returns>The space-separated hex dump, including a trailing space.</returns>
        static string ToHex(byte[] data)
        {
            StringBuilder dump = new StringBuilder(data.Length * 3);
            foreach (byte b in data)
            {
                dump.Append(((int)b).ToString("x")).Append(' ');
            }
            return dump.ToString();
        }

        /// <summary>
        /// Formats every char (UTF-16 code unit) of a string as lowercase hexadecimal
        /// (no zero padding), each value followed by a single space.
        /// </summary>
        /// <param name="text">String whose chars are formatted.</param>
        /// <returns>The space-separated hex dump, including a trailing space.</returns>
        static string ToHex(string text)
        {
            StringBuilder dump = new StringBuilder(text.Length * 5);
            foreach (char c in text)
            {
                dump.Append(((int)c).ToString("x")).Append(' ');
            }
            return dump.ToString();
        }
    }
}
//以上程序的输出结果:
//以下是4种编码的字节串
unicode: d8 5f 18 8a 86 4e 4a 55 61 0 62 0 63 0
utf8: e5 bf 98 e8 a8 98 e4 ba 86 e5 95 8a 61 62 63
gbk: cd fc d3 9b c1 cb b0 a1 61 62 63
big5: a7 d1 b0 4f a4 46 b0 da 61 62 63
//通过GetString()方法得到的4个string
unicode: 忘記了啊abc 7
utf8: 忘記了啊abc 7
gbk: 忘記了啊abc 7
big5: 忘記了啊abc 7
//4个string中的字符编码
unicode: 5fd8 8a18 4e86 554a 61 62 63
utf8: 5fd8 8a18 4e86 554a 61 62 63
gb2312: 5fd8 8a18 4e86 554a 61 62 63
big5: 5fd8 8a18 4e86 554a 61 62 63
可以看到,使用GetString方法,得到的string都是unicode编码的,也就是说,它的作用是把各种编码的byte数组“解码”为一个unicode字符串。