于是乎,必须有一种手段将 unicode 码和汉字字模的数据对应起来。最常用的手段是做一个 unicode 码表:先在该码表数组中查找到匹配的 unicode 码,再用匹配项的 index(数组索引)值,到另一个按相同 index 顺序存放字模记录的数组中取出对应的字模数据去显示。
+-----------------+ 查表 +-----------------+ 同index  +-------------------+
| 汉字的unicode码 | ==>  | unicode码表数组 | =======> | 汉字字模数据数组  | ==> 显示输出
+-----------------+      +-----------------+          +-------------------+
本文简要介绍一下如何生成 unicode 码表,其它相关的汉字处理技术不在本文的讨论范围之内。:)
用下面两个函数可以把 unicode 码表构造出来
/*
 * Convert one UTF-16 code unit into its GB2312 double-byte code.
 *
 * pOut  - receives the two GB2312 bytes (lead byte first); must have room
 *         for at least two bytes. Both bytes are set to 0 if the
 *         conversion fails.
 * uData - the UTF-16 code unit to convert.
 */
void UnicodeToGB2312(unsigned char* pOut,unsigned short uData)
{
    /* Name code page 936 (Simplified Chinese GBK, a superset of GB2312)
     * explicitly: CP_ACP follows the system locale and only yields GB2312
     * bytes on a Simplified Chinese Windows installation. */
    const unsigned int gb2312_code_page = 936;
    int written;

    /* Clear first so a one-byte or failed conversion leaves no stale data. */
    pOut[0] = 0;
    pOut[1] = 0;

    /* dwFlags is a DWORD, so pass 0 rather than the pointer constant NULL. */
    written = WideCharToMultiByte(gb2312_code_page, 0, &uData, 1,
                                  (char *)pOut, 2, NULL, NULL);
    if (written == 0) {
        pOut[0] = 0;
        pOut[1] = 0;
    }
}
/*
 * Convert one GB2312 double-byte code into a UTF-16 code unit.
 *
 * pOut     - receives the UTF-16 code unit; set to 0 if the conversion fails.
 * gbBuffer - points at the two GB2312 bytes (lead byte first).
 */
void Gb2312ToUnicode(unsigned short* pOut,unsigned char *gbBuffer)
{
    /* Name code page 936 (Simplified Chinese GBK, a superset of GB2312)
     * explicitly: CP_ACP follows the system locale and only decodes GB2312
     * correctly on a Simplified Chinese Windows installation. */
    const unsigned int gb2312_code_page = 936;
    unsigned short wide = 0;

    if (MultiByteToWideChar(gb2312_code_page, MB_PRECOMPOSED,
                            (const char *)gbBuffer, 2, &wide, 1) == 0) {
        wide = 0;   /* never hand an indeterminate value back to the caller */
    }
    *pOut = wide;
}
一个简单的例子如下
/*-----------------------------------------------*
 | GB2312 unicode table constructor              |
 | author: Spark Song                            |
 | file  : build_uni_table.c                     |
 | date  : 2005-11-18                            |
 *-----------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
/* Converts one UTF-16 code unit (uData) to multibyte bytes written to pOut. */
void UnicodeToGB2312(unsigned char* pOut,unsigned short uData);
/* Converts the two bytes at gbBuffer to one UTF-16 code unit stored in *pOut. */
void Gb2312ToUnicode(unsigned short* pOut,unsigned char *gbBuffer);
/* Builds the sorted code table and prints it to stdout as a C array. */
void construct_unicode_table();
/*
 * Program entry point: generates the table on stdout and exits with 0.
 * Command-line arguments are accepted but not used.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    construct_unicode_table();

    return 0;
}
#define GB2312_MATRIX (94)                      /* positions per GB2312 row  */
#define DELTA (0xA0)                            /* row/col number -> byte    */
#define FONT_ROW_BEGIN (16 + DELTA)             /* first row that is scanned */
#define FONT_ROW_END (87 + DELTA)               /* last row that is scanned  */
#define FONT_COL_BEGIN (1 + DELTA)
#define FONT_COL_END (GB2312_MATRIX + DELTA)
#define FONT_TOTAL (72 * GB2312_MATRIX)         /* rows 16..87 = 72 rows     */

/* qsort comparator: orders two unsigned short values ascending. */
static int compare_unicode_values(const void *lhs, const void *rhs)
{
    const unsigned short a = *(const unsigned short *)lhs;
    const unsigned short b = *(const unsigned short *)rhs;

    return (a > b) - (a < b);
}

/*
 * Build the Unicode code table for GB2312 rows 16..87 and print it to
 * stdout as the C array `uni_table`.
 *
 * Every (row, col) position is converted with Gb2312ToUnicode, the results
 * are sorted ascending so that the emitted table can be binary-searched,
 * and each entry is printed with a comment giving the GB2312 code and the
 * row/column it was converted back to by UnicodeToGB2312.
 */
void construct_unicode_table(void)
{
    unsigned short table[FONT_TOTAL] = {0};
    unsigned char gb[2];
    unsigned short uni;
    int count = 0;
    int row, col, i;

    /* Generate the Unicode code table, one entry per scanned position. */
    for (row = FONT_ROW_BEGIN; row <= FONT_ROW_END; row++) {
        for (col = FONT_COL_BEGIN; col <= FONT_COL_END; col++) {
            gb[0] = (unsigned char)row;
            gb[1] = (unsigned char)col;
            Gb2312ToUnicode(&uni, gb);
            table[count++] = uni;
        }
    }

    /* Sort ascending so later lookups can use binary search. */
    qsort(table, (size_t)count, sizeof table[0], compare_unicode_values);

    /* Write the table to stdout. */
    printf("const unsigned short uni_table[]={\n");
    for (i = 0; i < count; i++) {
        uni = table[i];
        UnicodeToGB2312(gb, uni);
        printf(" 0x%.4X%s /* GB2312 Code: 0x%.2X%.2X ==> Row:%.2d Col:%.2d */\n",
               (unsigned)uni,
               (i == count - 1) ? " " : ",",   /* no comma after the last entry */
               (unsigned)gb[0],
               (unsigned)gb[1],
               gb[0] - DELTA,
               gb[1] - DELTA);
    }
    printf("};\n");
}
/*
 * Convert one UTF-16 code unit into its GB2312 double-byte code.
 *
 * pOut  - receives the two GB2312 bytes (lead byte first); must have room
 *         for at least two bytes. Both bytes are set to 0 if the
 *         conversion fails.
 * uData - the UTF-16 code unit to convert.
 */
void UnicodeToGB2312(unsigned char* pOut,unsigned short uData)
{
    /* Name code page 936 (Simplified Chinese GBK, a superset of GB2312)
     * explicitly: CP_ACP follows the system locale and only yields GB2312
     * bytes on a Simplified Chinese Windows installation. */
    const unsigned int gb2312_code_page = 936;
    int written;

    /* Clear first so a one-byte or failed conversion leaves no stale data. */
    pOut[0] = 0;
    pOut[1] = 0;

    /* dwFlags is a DWORD, so pass 0 rather than the pointer constant NULL. */
    written = WideCharToMultiByte(gb2312_code_page, 0, &uData, 1,
                                  (char *)pOut, 2, NULL, NULL);
    if (written == 0) {
        pOut[0] = 0;
        pOut[1] = 0;
    }
}
/*
 * Convert one GB2312 double-byte code into a UTF-16 code unit.
 *
 * pOut     - receives the UTF-16 code unit; set to 0 if the conversion fails.
 * gbBuffer - points at the two GB2312 bytes (lead byte first).
 */
void Gb2312ToUnicode(unsigned short* pOut,unsigned char *gbBuffer)
{
    /* Name code page 936 (Simplified Chinese GBK, a superset of GB2312)
     * explicitly: CP_ACP follows the system locale and only decodes GB2312
     * correctly on a Simplified Chinese Windows installation. */
    const unsigned int gb2312_code_page = 936;
    unsigned short wide = 0;

    if (MultiByteToWideChar(gb2312_code_page, MB_PRECOMPOSED,
                            (const char *)gbBuffer, 2, &wide, 1) == 0) {
        wide = 0;   /* never hand an indeterminate value back to the caller */
    }
    *pOut = wide;
}
用 VC 编译后,在 DOS 中执行:
build_uni_table.exe > report.txt
可以得到如下的txt文件:
const unsigned short uni_table[]={
 0x4E00, /* GB2312 Code: 0xD2BB ==> Row:50 Col:27 */
 0x4E01, /* GB2312 Code: 0xB6A1 ==> Row:22 Col:01 */
 0x4E03, /* GB2312 Code: 0xC6DF ==> Row:38 Col:63 */
 0x4E07, /* GB2312 Code: 0xCDF2 ==> Row:45 Col:82 */
 ... ...
 0x9F9F, /* GB2312 Code: 0xB9EA ==> Row:25 Col:74 */
 0x9FA0, /* GB2312 Code: 0xD9DF ==> Row:57 Col:63 */
 0xE810, /* GB2312 Code: 0xD7FA ==> Row:55 Col:90 */
 0xE811, /* GB2312 Code: 0xD7FB ==> Row:55 Col:91 */
 0xE812, /* GB2312 Code: 0xD7FC ==> Row:55 Col:92 */
 0xE813, /* GB2312 Code: 0xD7FD ==> Row:55 Col:93 */
 0xE814  /* GB2312 Code: 0xD7FE ==> Row:55 Col:94 */
};