UTF-8, Unicode, GB2312三种编码方式解析, 深入研究汉字编码

// Code highlighting produced by Actipro CodeHighlighter (freeware) http://www.CodeHighlighter.com/
using System;
using System.Collections.Generic;
using System.Globalization;

namespace Encoding
{
    internal class Program
    {
        /// <summary>
        /// Console demo: round-trips a sample string through three encodings, exercises the
        /// helper methods of this class, prints the GB2312 level-1 character table and the
        /// characters produced by <c>ChineseLetterFromCode</c> for 0x4E00..0x9FFF, then waits
        /// for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        private static void Main(string[] args)
        {
            const string sample = "测试";

            // Encode and decode the sample with UTF-16 ("Unicode"), UTF-8 and GB2312 in turn.
            var utf16 = System.Text.Encoding.GetEncoding("Unicode");
            var utf16Bytes = utf16.GetBytes(sample);
            Console.WriteLine(utf16.GetString(utf16Bytes));

            var utf8 = System.Text.Encoding.GetEncoding("UTF-8");
            var utf8Bytes = utf8.GetBytes(sample);
            Console.WriteLine(utf8.GetString(utf8Bytes));

            var gb2312 = System.Text.Encoding.GetEncoding("GB2312");
            var gb2312Bytes = gb2312.GetBytes(sample);
            Console.WriteLine(gb2312.GetString(gb2312Bytes));

            #region Test code

            Console.WriteLine(IsChineseLetter(sample, 0));
            Console.WriteLine(ChineseLetterCode(sample, 0));

            var firstCode = ChineseLetterCode(sample, 0);
            Console.WriteLine(ChineseLetterFromCode(firstCode));

            // Convert UTF-8 bytes to UTF-16 bytes by hand, then decode them with the UTF-16 encoding.
            var converted = Utf8_2_Unicode(utf8.GetBytes("测试test"));
            Console.WriteLine(utf16.GetString(converted));

            // References:
            // http://qkzz.net/article/3d697483-a5ae-4b50-9ae9-45dc6dd26141.htm
            // http://topic.csdn.net/u/20090617/18/1907627e-ce38-4ae5-9755-1cc349a4ed1a.html
            // There are 3755 level-1 Chinese characters; 40 * 94 = 3760 positions, of which the
            // five positions d7fe, d7fd, d7fc, d7fb, d7fa are empty.
            for (var row = 0xb0; row <= 0xd7; row++)
            {
                // The last row (0xd7) stops five cells early.
                var lastCell = row == 0xd7 ? 0xf9 : 0xfe;
                for (var cell = 0xa1; cell <= lastCell; cell++)
                {
                    Console.Write(gb2312.GetString(new[] {(byte) row, (byte) cell}));

                    // Break the output line after cells 0xc7 and 0xee, and at the end of the row.
                    var endOfLine = cell == 0xc7 || cell == 0xee || cell == lastCell;
                    if (endOfLine)
                    {
                        Console.WriteLine();
                    }
                }
            }

            Console.WriteLine(GetChineseLetterFromGb2312(0));
            Console.WriteLine(GetChineseLetterFromGb2312(3754));

            // Unicode code range of Chinese characters (0x4e00..0x9fff).
            for (var codePoint = 0x4e00; codePoint <= 0x9fff; codePoint++)
            {
                Console.Write(ChineseLetterFromCode(codePoint));
            }

            #endregion

            Console.Read();
        }

        /// <summary>
        /// Returns the GB2312 level-1 Chinese character at the given zero-based position,
        /// counting 94 cells per row starting from the byte pair 0xB0 0xA1.
        /// </summary>
        /// <param name="rNum">Zero-based position, from 0 to 3754 inclusive.</param>
        /// <returns>The string decoded from the two-byte GB2312 code for that position.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="rNum"/> is negative or greater than 3754.
        /// </exception>
        public static string GetChineseLetterFromGb2312(int rNum)
        {
            const int lastPosition = 3754;
            const int cellsPerRow = 94;

            if (rNum < 0 || rNum > lastPosition)
            {
                throw new ArgumentOutOfRangeException("rNum", "超出一级汉字的范围!");
            }

            var encoding = System.Text.Encoding.GetEncoding("GB2312");

            // Split the position into a row offset (first byte) and a cell offset (second byte).
            int cell;
            var row = Math.DivRem(rNum, cellsPerRow, out cell);

            var code = new byte[2];
            code[0] = (byte) (0xb0 + row);
            code[1] = (byte) (0xa1 + cell);
            return encoding.GetString(code);
        }

        /// <summary>
        /// Converts a UTF-8 byte stream into a UTF-16 little-endian ("Unicode") byte stream,
        /// emitting two bytes (low byte first) for every decoded character.
        /// </summary>
        /// <remarks>
        /// Only one-, two- and three-byte UTF-8 sequences are handled, i.e. characters that fit
        /// into a single 16-bit code unit. Trailing bytes are not checked for the 10xxxxxx form,
        /// and any byte below 0xC0 is copied through unchanged as a single-byte character.
        /// </remarks>
        /// <param name="input">The UTF-8 encoded bytes to convert. Must not be null.</param>
        /// <returns>The converted bytes, two per input character.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        /// <exception cref="NotSupportedException">
        /// A lead byte of 0xF0 or above (a four-byte sequence) was found.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// The input ends in the middle of a two- or three-byte sequence.
        /// </exception>
        /// <seealso href="http://hi.baidu.com/hyqsoft/blog/item/263795a164d1728346106464.html"/>
        public static byte[] Utf8_2_Unicode(byte[] input)
        {
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            var ret = new List<byte>();
            for (var i = 0; i < input.Length; i++)
            {
                var lead = input[i];

                if (lead >= 0xF0) // 11110xxx
                {
                    throw new NotSupportedException("四字节的 UTF-8 字符不能转换成两字节的 Unicode 字符!");
                }

                if (lead >= 0xE0) // 1110xxxx 10yyyyyy 10zzzzzz
                {
                    // Both trailing bytes must exist; otherwise the reads below would run off the array.
                    if (i + 2 >= input.Length)
                    {
                        throw new ArgumentException("UTF-8 字节流在三字节字符中间被截断!", "input");
                    }

                    var second = input[i + 1];
                    var third = input[i + 2];

                    // Low byte: yy zzzzzz. High byte: xxxx yyyy.
                    ret.Add((byte) ((third & 0x3F) | ((second & 0x03) << 6)));
                    ret.Add((byte) (((lead & 0x0F) << 4) | ((second & 0x3C) >> 2)));
                    i += 2;
                }
                else if (lead >= 0xC0) // 110xxxxx 10yyyyyy
                {
                    if (i + 1 >= input.Length)
                    {
                        throw new ArgumentException("UTF-8 字节流在两字节字符中间被截断!", "input");
                    }

                    var second = input[i + 1];

                    // Low byte: xx yyyyyy. High byte: the remaining top three payload bits of the lead byte.
                    ret.Add((byte) ((second & 0x3F) | ((lead & 0x03) << 6)));
                    ret.Add((byte) ((lead & 0x1C) >> 2));
                    i += 1;
                }
                else
                {
                    // Single byte: the value is the low byte, the high byte is zero.
                    ret.Add(lead);
                    ret.Add(0);
                }
            }
            return ret.ToArray();
        }

        #region 汉字与Unicode编码

        /// <summary>
        /// Tells whether the UTF-16 code unit at <paramref name="index"/> in
        /// <paramref name="input"/> lies in the range U+4E00..U+9FA5.
        /// </summary>
        /// <param name="input">The string to inspect; null or empty yields <c>false</c>.</param>
        /// <param name="index">Zero-based position of the character to test.</param>
        /// <returns>
        /// <c>true</c> when the code unit is within U+4E00..U+9FA5; otherwise <c>false</c>.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="input"/> is non-empty and <paramref name="index"/> is outside it.
        /// </exception>
        public static bool IsChineseLetter(string input, int index)
        {
            // Inclusive bounds of the accepted range.
            const int chfrom = 0x4e00;
            const int chend = 0x9fa5;

            // Null is treated like the empty string instead of crashing.
            if (string.IsNullOrEmpty(input))
            {
                return false;
            }

            if (index < 0 || index >= input.Length)
            {
                throw new ArgumentOutOfRangeException("index");
            }

            // A .NET char is already the UTF-16 code unit, so no encoder round trip is needed.
            int code = input[index];
            return code >= chfrom && code <= chend;
        }

        /// <summary>
        /// Returns the code of the character at <paramref name="index"/> in
        /// <paramref name="input"/> when it lies in the range U+4E00..U+9FA5.
        /// </summary>
        /// <param name="input">The string to inspect; null or empty yields 0.</param>
        /// <param name="index">Zero-based position of the character to read.</param>
        /// <returns>
        /// The character code when it is within U+4E00..U+9FA5; otherwise 0. Surrogate code
        /// units (either half of a pair, or unpaired) also yield 0.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="input"/> is non-empty and <paramref name="index"/> is outside it.
        /// </exception>
        public static int ChineseLetterCode(string input, int index)
        {
            // Inclusive bounds of the accepted range.
            const int chfrom = 0x4e00;
            const int chend = 0x9fa5;

            // Null is treated like the empty string instead of crashing.
            if (string.IsNullOrEmpty(input))
            {
                return 0;
            }

            if (index < 0 || index >= input.Length)
            {
                throw new ArgumentOutOfRangeException("index");
            }

            // The accepted range contains no surrogates, so reading the single UTF-16 code unit
            // gives the same answer as decoding a full code point, without throwing when the
            // index lands on a low or unpaired surrogate.
            int code = input[index];
            return code >= chfrom && code <= chend ? code : 0;
        }

        /// <summary>
        /// Formats the value that <c>ChineseLetterCode</c> reports for the character at
        /// <paramref name="index"/> as upper-case hexadecimal with at least four digits.
        /// </summary>
        /// <param name="input">The string to inspect.</param>
        /// <param name="index">Zero-based position of the character to read.</param>
        /// <returns>
        /// The hexadecimal text, or an empty string when <c>ChineseLetterCode</c> returns 0.
        /// </returns>
        public static string ChineseLetterHexCode(string input, int index)
        {
            var value = ChineseLetterCode(input, index);
            if (value == 0)
            {
                return string.Empty;
            }

            return value.ToString("X4");
        }

        /// <summary>
        /// Builds the one-character string for a code in the range 0x4E00..0x9FA5 by decoding
        /// its two bytes (low byte first) with the "Unicode" encoding.
        /// </summary>
        /// <param name="code">The character code to convert.</param>
        /// <returns>
        /// The decoded string, or an empty string when <paramref name="code"/> is outside
        /// 0x4E00..0x9FA5.
        /// </returns>
        public static string ChineseLetterFromCode(int code)
        {
            // Inclusive bounds of the accepted range.
            const int firstCode = 0x4e00;
            const int lastCode = 0x9fa5;

            if (code < firstCode || code > lastCode)
            {
                return string.Empty;
            }

            // Low byte first, then high byte.
            var lowByte = (byte) (code & 0xff);
            var highByte = (byte) (code >> 8);

            var utf16 = System.Text.Encoding.GetEncoding("Unicode");
            return utf16.GetString(new[] {lowByte, highByte});
        }

        /// <summary>
        /// Parses <paramref name="hexCode"/> as a hexadecimal number and passes the result to
        /// <c>ChineseLetterFromCode</c>.
        /// </summary>
        /// <param name="hexCode">Hexadecimal digits of the character code, e.g. "6D4B".</param>
        /// <returns>Whatever <c>ChineseLetterFromCode</c> returns for the parsed value.</returns>
        public static string ChineseLetterFromHexCode(string hexCode)
        {
            return ChineseLetterFromCode(int.Parse(hexCode, NumberStyles.HexNumber));
        }

        #endregion
    }
}
包含了处理汉字与 Unicode 编码转换的多种方法,以及从 UTF-8 字节流得到 Unicode 字节流的方法(从而可以用 Unicode 编码的处理方法来处理 UTF-8 编码)。研究 GB2312 编码规范,得到最简单的根据一个整数得到一个一级汉字的方法;根据这个方法可以写出最简单的生成一级汉字验证码的程序。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值