// Code highlighting produced by Actipro CodeHighlighter (freeware) http://www.CodeHighlighter.com/
using System;
using System.Collections.Generic;
using System.Globalization;
namespace Encoding
{
internal class Program
{
/// <summary>
/// Console demo: round-trips a sample string through three encodings, exercises the
/// helper methods of this class, then prints the GB2312 level-1 table and the
/// characters produced by <c>ChineseLetterFromCode</c> for codes 0x4E00..0x9FFF.
/// Waits for console input before exiting.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
private static void Main(string[] args)
{
    const string sample = "测试";

    // Encode and immediately decode the sample with UTF-16 ("Unicode"), UTF-8 and GB2312.
    var utf16 = System.Text.Encoding.GetEncoding("Unicode");
    Console.WriteLine(utf16.GetString(utf16.GetBytes(sample)));

    var utf8 = System.Text.Encoding.GetEncoding("UTF-8");
    Console.WriteLine(utf8.GetString(utf8.GetBytes(sample)));

    var gb2312 = System.Text.Encoding.GetEncoding("GB2312");
    Console.WriteLine(gb2312.GetString(gb2312.GetBytes(sample)));

    #region Test code

    Console.WriteLine(IsChineseLetter(sample, 0));

    var firstCode = ChineseLetterCode(sample, 0);
    Console.WriteLine(firstCode);
    Console.WriteLine(ChineseLetterFromCode(firstCode));

    // Convert UTF-8 bytes to UTF-16 bytes by hand, then decode them as UTF-16.
    var utf8Mixed = utf8.GetBytes("测试test");
    Console.WriteLine(utf16.GetString(Utf8_2_Unicode(utf8Mixed)));

    // References:
    // http://qkzz.net/article/3d697483-a5ae-4b50-9ae9-45dc6dd26141.htm
    // http://topic.csdn.net/u/20090617/18/1907627e-ce38-4ae5-9755-1cc349a4ed1a.html
    // There are 3755 level-1 characters. 40 rows * 94 cells = 3760 slots, of which the
    // five positions d7fa, d7fb, d7fc, d7fd and d7fe are empty.
    for (int lead = 0xb0; lead <= 0xd7; lead++)
    {
        // The final row (0xd7) stops five cells early.
        int lastTrail = lead == 0xd7 ? 0xf9 : 0xfe;
        for (int trail = 0xa1; trail <= lastTrail; trail++)
        {
            Console.Write(gb2312.GetString(new[] {(byte) lead, (byte) trail}));

            // Break the output line after trail bytes 0xc7 and 0xee and at the end of the row.
            if (trail == 0xc7 || trail == 0xee || trail == lastTrail)
            {
                Console.WriteLine();
            }
        }
    }

    // First and last level-1 characters.
    Console.WriteLine(GetChineseLetterFromGb2312(0));
    Console.WriteLine(GetChineseLetterFromGb2312(3754));

    // Unicode code range of Chinese characters (19968..40959).
    for (var code = 0x4e00; code <= 0x9fff; code++)
    {
        Console.Write(ChineseLetterFromCode(code));
    }

    #endregion

    Console.Read();
}
/// <summary>
/// Returns the GB2312 level-1 Chinese character at the given zero-based position.
/// </summary>
/// <param name="rNum">Zero-based position, 0 to 3754 inclusive.</param>
/// <returns>The string decoded from the two GB2312 bytes computed from <paramref name="rNum"/>.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="rNum"/> is outside 0..3754.</exception>
public static string GetChineseLetterFromGb2312(int rNum)
{
    var inRange = rNum >= 0 && rNum <= 3754;
    if (!inRange)
    {
        throw new ArgumentOutOfRangeException("rNum", "超出一级汉字的范围!");
    }

    // 94 cells per row; row 0 uses lead byte 0xb0 and cell 0 uses trail byte 0xa1.
    var row = rNum/94;
    var cell = rNum%94;

    var pair = new byte[2];
    pair[0] = (byte) (0xb0 + row);
    pair[1] = (byte) (0xa1 + cell);

    return System.Text.Encoding.GetEncoding("GB2312").GetString(pair);
}
/// <summary>
/// Converts a UTF-8 byte stream into a UTF-16 little-endian ("Unicode") byte stream.
/// </summary>
/// <remarks>
/// Only one-, two- and three-byte UTF-8 sequences (code points up to U+FFFF) are handled.
/// Continuation bytes are not validated: their low six bits are used as-is, and a byte in
/// 0x80..0xBF found at a lead position is copied through like a single-byte character.
/// </remarks>
/// <param name="input">UTF-8 encoded bytes.</param>
/// <returns>Two bytes (low byte first) for every decoded character.</returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
/// <exception cref="NotSupportedException">
/// <paramref name="input"/> contains a lead byte of 0xF0 or above (a four-byte sequence),
/// which cannot be represented in a single two-byte unit.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="input"/> ends in the middle of a two- or three-byte sequence.
/// </exception>
/// <seealso href="http://hi.baidu.com/hyqsoft/blog/item/263795a164d1728346106464.html"/>
public static byte[] Utf8_2_Unicode(byte[] input)
{
    if (input == null)
    {
        throw new ArgumentNullException("input");
    }

    var ret = new List<byte>(input.Length);
    for (var i = 0; i < input.Length; i++)
    {
        var lead = input[i];

        if (lead >= 0xF0) // 11110xxx
        {
            throw new NotSupportedException("四字节的 UTF-8 字符不能转换成两字节的 Unicode 字符!");
        }

        if (lead >= 0xE0) // 1110xxxx 10xxxxxx 10xxxxxx
        {
            if (i + 2 >= input.Length)
            {
                throw new ArgumentException(
                    "Truncated three-byte UTF-8 sequence at offset " + i + ".", "input");
            }

            var second = input[i + 1];
            var third = input[i + 2];

            // Low byte: 2 low payload bits of the second byte + 6 payload bits of the third.
            ret.Add((byte) ((third & 0x3F) | ((second & 0x03) << 6)));
            // High byte: 4 payload bits of the lead byte + 4 high payload bits of the second.
            // The lead byte is masked explicitly so the cast never truncates.
            ret.Add((byte) (((lead & 0x0F) << 4) | ((second & 0x3C) >> 2)));
            i += 2;
        }
        else if (lead >= 0xC0) // 110xxxxx 10xxxxxx
        {
            if (i + 1 >= input.Length)
            {
                throw new ArgumentException(
                    "Truncated two-byte UTF-8 sequence at offset " + i + ".", "input");
            }

            var second = input[i + 1];

            // Low byte: 2 low payload bits of the lead byte + 6 payload bits of the second.
            ret.Add((byte) ((second & 0x3F) | ((lead & 0x03) << 6)));
            // High byte: the remaining 3 payload bits of the lead byte.
            ret.Add((byte) ((lead & 0x1C) >> 2));
            i += 1;
        }
        else
        {
            // Single byte: widen to two bytes with a zero high byte.
            ret.Add(lead);
            ret.Add(0);
        }
    }

    return ret.ToArray();
}
#region 汉字与Unicode编码
/// <summary>
/// Tests whether the UTF-16 code unit at <paramref name="index"/> lies in the range
/// U+4E00..U+9FA5.
/// </summary>
/// <param name="input">Text to inspect; null or empty yields <c>false</c>.</param>
/// <param name="index">Zero-based position of the code unit to test.</param>
/// <returns><c>true</c> when the code unit is within U+4E00..U+9FA5; otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="input"/> is non-empty and <paramref name="index"/> is outside it.
/// </exception>
public static bool IsChineseLetter(string input, int index)
{
    // Inclusive bounds of the accepted range.
    const int chfrom = 0x4e00;
    const int chend = 0x9fa5;

    if (string.IsNullOrEmpty(input))
    {
        return false;
    }

    if (index < 0 || index >= input.Length)
    {
        throw new ArgumentOutOfRangeException("index");
    }

    // A char already is the UTF-16 code unit, so no encoding round trip is needed.
    // Surrogate code units (0xD800..0xDFFF) fall outside the range and give false.
    int code = input[index];
    return code >= chfrom && code <= chend;
}
/// <summary>
/// Returns the code of the character at <paramref name="index"/> when it lies in the range
/// U+4E00..U+9FA5, and 0 otherwise.
/// </summary>
/// <param name="input">Text to inspect; null or empty yields 0.</param>
/// <param name="index">Zero-based position of the character.</param>
/// <returns>The code (0x4E00..0x9FA5), or 0 when the character is outside that range.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="input"/> is non-empty and <paramref name="index"/> is outside it.
/// </exception>
public static int ChineseLetterCode(string input, int index)
{
    // Inclusive bounds of the accepted range.
    const int chfrom = 0x4e00;
    const int chend = 0x9fa5;

    if (string.IsNullOrEmpty(input))
    {
        return 0;
    }

    if (index < 0 || index >= input.Length)
    {
        throw new ArgumentOutOfRangeException("index");
    }

    // The accepted range is entirely inside the BMP, so the single UTF-16 code unit is
    // enough. Surrogate code units (paired or not) are outside the range and yield 0,
    // which keeps index-by-index scans from failing on supplementary characters.
    int code = input[index];
    return code >= chfrom && code <= chend ? code : 0;
}
/// <summary>
/// Formats the result of <c>ChineseLetterCode</c> for the character at
/// <paramref name="index"/> as upper-case hexadecimal with at least four digits.
/// </summary>
/// <param name="input">Text to inspect.</param>
/// <param name="index">Zero-based position of the character.</param>
/// <returns>The hexadecimal code, or an empty string when <c>ChineseLetterCode</c> returns 0.</returns>
public static string ChineseLetterHexCode(string input, int index)
{
    var codePoint = ChineseLetterCode(input, index);
    if (codePoint == 0)
    {
        return string.Empty;
    }

    return codePoint.ToString("X4");
}
/// <summary>
/// Returns the character whose code is <paramref name="code"/> when the code lies in the
/// range 0x4E00..0x9FA5.
/// </summary>
/// <param name="code">Character code to convert.</param>
/// <returns>A one-character string, or an empty string when the code is out of range.</returns>
public static string ChineseLetterFromCode(int code)
{
    // Inclusive bounds of the accepted range.
    const int first = 0x4e00;
    const int last = 0x9fa5;

    if (code < first || code > last)
    {
        return string.Empty;
    }

    // Every value in the accepted range is a single non-surrogate UTF-16 code unit.
    return new string((char) code, 1);
}
/// <summary>
/// Parses <paramref name="hexCode"/> as a hexadecimal number and passes the value to
/// <c>ChineseLetterFromCode</c>.
/// </summary>
/// <param name="hexCode">Hexadecimal digits, parsed with <c>NumberStyles.HexNumber</c>.</param>
/// <returns>Whatever <c>ChineseLetterFromCode</c> returns for the parsed value.</returns>
public static string ChineseLetterFromHexCode(string hexCode)
{
    return ChineseLetterFromCode(int.Parse(hexCode, NumberStyles.HexNumber));
}
#endregion
}
}
包含了处理汉字与 Unicode 编码转换的多种方法，以及从 UTF-8 字节流得到 Unicode 字节流的方法（从而可以用 Unicode 编码的处理方法来处理 UTF-8 编码）。通过研究 GB2312 编码规范，得到了根据一个整数取得一个一级汉字的最简单方法；利用这个方法，可以写出最简单的一级汉字验证码生成程序。
UTF-8, Unicode, GB2312三种编码方式解析, 深入研究汉字编码
最新推荐文章于 2019-11-19 17:01:13 发布