（c语言）gb2312和utf8转换

最新推荐文章于 2024-02-18 14:38:52 发布

4444ge

最新推荐文章于 2024-02-18 14:38:52 发布

阅读量1.1w

点赞数 1

分类专栏： c语言文章标签： c语言

本文链接：https://blog.csdn.net/I_study_ing/article/details/62091488

版权

c语言专栏收录该内容

1 篇文章 0 订阅

订阅专栏

（c语言）unicode和utf8转换

unicode和utf8转换规则

unicode与gb2312之间有现成的转换表（码表），两者可以通过查表直接互相转换

所以，gb2312与utf8互转时，先查表得到对应的编码，剩下的只需要在unicode和utf8之间按规则进行转换即可

一、gb2312转utf8

/*
 * Convert a GB2312-encoded file to UTF-8.
 *
 * The whole input file is read into memory, converted, and written to the
 * output file in one go.
 *
 * input_file  - path of the GB2312 file to read.
 * output_file - path of the UTF-8 file to create/overwrite.
 *
 * Conversion rules:
 *  - A byte below 0x80 is ASCII in GB2312 (EUC-CN) and is copied unchanged.
 *  - Any other byte is taken as the lead byte of a two-byte character; the
 *    pair (lead byte in the high 8 bits) is looked up with
 *    SearchCodeTable_GB2312(), which returns 0 when there is no mapping.
 *  - When the lookup fails (or the lead byte is the last byte of the file),
 *    a diagnostic is printed, the byte is copied unchanged, and scanning
 *    resumes at the next byte.
 *
 * Errors are reported on stdout; the function returns nothing.
 */
void Gb2312ToUtf8(const char* input_file, const char *output_file)
{
    printf("gb2312->utf8: \n");

    FILE *fpIn = fopen(input_file, "rb");
    if (fpIn == NULL) {
        printf("Unable to open the input file!\n");
        return;
    }

    /* Determine the file size by seeking to the end, then rewind. */
    long len = -1;
    if (fseek(fpIn, 0L, SEEK_END) == 0) {
        len = ftell(fpIn);
    }
    if (len < 0 || fseek(fpIn, 0L, SEEK_SET) != 0) {
        printf("Unable to determine the input file size!\n");
        fclose(fpIn);
        return;
    }
    printf("intput file size: %ldB\n", len);

    FILE *fpOut = fopen(output_file, "wb");
    if (fpOut == NULL) {
        printf("Unable to open the output file!\n");
        fclose(fpIn);   /* do not leak the already-opened input stream */
        return;
    }

    /*
     * Worst case growth: a 2-byte GB2312 character becomes a 3-byte UTF-8
     * sequence (x1.5), so twice the input size is always enough.
     */
    u8 *gb = new u8[len];
    u8 *temp = new u8[len * 2];

    /* Only convert what was actually read. */
    len = (long)fread(gb, sizeof(u8), (size_t)len, fpIn);

    long i = 0;     /* read position in gb   */
    long j = 0;     /* write position in temp */
    int count = 0;  /* number of bytes that could not be converted */

    while (i < len) {
        /* ASCII passes through untouched. */
        if (gb[i] < 0x80) {
            temp[j++] = gb[i++];
            continue;
        }

        /*
         * Build the table key explicitly (lead byte high, trail byte low)
         * instead of memcpy + byte swap, so the result does not depend on
         * the host byte order.
         */
        u16 gbKey = gb[i];
        u16 unicodeKey = 0;
        if (i + 1 < len) {
            gbKey = (u16)((gb[i] << 8) | gb[i + 1]);
            unicodeKey = SearchCodeTable_GB2312(gbKey);
        }

        if (unicodeKey == 0) {
            printf("fail:table can not find the key: 0x%x \n", gbKey);
            count++;
            temp[j++] = gb[i++];
            continue;
        }

        /*
         * unicodeKey -> UTF-8. The code point is 16 bits wide, so only the
         * 1-, 2- and 3-byte UTF-8 forms can ever be needed.
         */
        if (unicodeKey <= 0x007F) {
            temp[j++] = (u8)unicodeKey;
        } else if (unicodeKey <= 0x07FF) {
            temp[j++] = (u8)(0xC0 | ((unicodeKey >> 6) & 0x1F));
            temp[j++] = (u8)(0x80 | (unicodeKey & 0x3F));
        } else {
            temp[j++] = (u8)(0xE0 | ((unicodeKey >> 12) & 0x0F));
            temp[j++] = (u8)(0x80 | ((unicodeKey >> 6) & 0x3F));
            temp[j++] = (u8)(0x80 | (unicodeKey & 0x3F));
        }

        /* A successfully converted character consumed both input bytes. */
        i += 2;
    }

    printf("There are %d wrong!", count);

    if (fwrite(temp, sizeof(u8), (size_t)j, fpOut) != (size_t)j) {
        printf("Unable to write the output file!\n");
    }

    delete []gb;
    delete []temp;
    fclose(fpIn);
    if (fclose(fpOut) != 0) {
        printf("Unable to write the output file!\n");
    }
}

二、utf8转gb2312

/*
 * Convert a UTF-8 encoded file to GB2312.
 *
 * The whole input file is read into memory, converted, and written to the
 * output file in one go.
 *
 * input_file  - path of the UTF-8 file to read (a leading BOM is optional).
 * output_file - path of the GB2312 file to create/overwrite.
 *
 * Conversion rules:
 *  - The sequence length of each character is taken from
 *    GetUtf8ByteNumForWord(); a result of 0 means a single ASCII byte,
 *    which is copied unchanged.
 *  - 2- and 3-byte sequences are decoded to a 16-bit code point and looked
 *    up with SearchCodeTable(), which returns 0 when there is no mapping.
 *    A hit is written as two bytes, high byte first.
 *  - Characters with no GB2312 mapping (including every 4- to 6-byte
 *    sequence, which does not fit the 16-bit table key) are replaced by '?'.
 *  - A byte with an unexpected sequence length, or a sequence cut off by the
 *    end of the file, is reported and replaced by '?'.
 *
 * Errors are reported on stdout; the function returns nothing.
 */
void Utf8ToGb2312(const char* input_file, const char *output_file)
{
    printf("utf8->unicode: \n");

    FILE *fpIn = fopen(input_file, "rb");
    if (fpIn == NULL) {
        printf("Unabile to open the input file!\n");
        return;
    }

    /* Determine the file size by seeking to the end, then rewind. */
    long len = -1;
    if (fseek(fpIn, 0L, SEEK_END) == 0) {
        len = ftell(fpIn);
    }
    if (len < 0 || fseek(fpIn, 0L, SEEK_SET) != 0) {
        printf("Unable to determine the input file size!\n");
        fclose(fpIn);
        return;
    }
    printf("intput file size: %ldB\n", len);

    FILE *fpOut = fopen(output_file, "wb");
    if (fpOut == NULL) {
        printf("Unabile to open the output file!\n");
        fclose(fpIn);   /* do not leak the already-opened input stream */
        return;
    }

    /*
     * The output never grows: every input sequence of N bytes produces at
     * most min(N, 2) output bytes, so a buffer of the input size suffices.
     */
    u8 *utf8 = new u8[len];
    u8 *temp = new u8[len];

    /* Only convert what was actually read. */
    len = (long)fread(utf8, sizeof(u8), (size_t)len, fpIn);

    long i = 0;     /* read position in utf8  */
    long j = 0;     /* write position in temp */

    /* Skip the UTF-8 byte order mark only when it is really there. */
    if (len >= 3 && utf8[0] == 0xEF && utf8[1] == 0xBB && utf8[2] == 0xBF) {
        i = 3;
    }

    while (i < len) {
        int byteCount = GetUtf8ByteNumForWord((u8)utf8[i]);

        /* Single ASCII byte: copy as is. */
        if (byteCount == 0) {
            temp[j++] = utf8[i++];
            continue;
        }

        /* Not a valid lead byte: replace it and resynchronise on the next byte. */
        if (byteCount < 2 || byteCount > 6) {
            printf("the len is more than 6\n");
            temp[j++] = '?';
            i++;
            continue;
        }

        /* Sequence runs past the end of the data: nothing more to decode. */
        if (i + byteCount > len) {
            printf("the len is more than 6\n");
            temp[j++] = '?';
            break;
        }

        /*
         * UTF-8 -> Unicode. The code point is assembled arithmetically, so
         * the result does not depend on the host byte order.
         */
        u16 unicodeKey = 0;
        u16 gbKey = 0;
        if (byteCount == 2) {
            unicodeKey = (u16)(((utf8[i] & 0x1F) << 6) | (utf8[i + 1] & 0x3F));
        } else if (byteCount == 3) {
            unicodeKey = (u16)(((utf8[i] & 0x0F) << 12)
                             | ((utf8[i + 1] & 0x3F) << 6)
                             | (utf8[i + 2] & 0x3F));
        }

        /* Look up the GB2312 code; longer sequences cannot be in the table. */
        if (byteCount <= 3) {
            gbKey = SearchCodeTable(unicodeKey);
        }

        if (gbKey != 0) {
            /* Non-zero means found: emit high byte first, then low byte. */
            temp[j++] = (u8)(gbKey >> 8);
            temp[j++] = (u8)(gbKey & 0xFF);
        } else {
            if (byteCount <= 3) {
                printf("fail:table can not find the key: 0x%x \n", unicodeKey);
            }
            temp[j++] = '?';
        }

        i += byteCount;
    }

    if (fwrite(temp, sizeof(u8), (size_t)j, fpOut) != (size_t)j) {
        printf("Unable to write the output file!\n");
    }

    delete []utf8;
    delete []temp;
    fclose(fpIn);
    if (fclose(fpOut) != 0) {
        printf("Unable to write the output file!\n");
    }
}

4444ge

关注

1
点赞
踩
12

收藏

觉得还不错? 一键收藏
4
评论
（c语言）gb2312和utf8转换

（c语言）unicode和utf8转换unicode和utf8转换规则unicode与gb2312有着转换表所以，只需要unicode和utf8之间进行转换即可一、所以gb2312转utf8void Gb2312ToUtf8(const char* input_file, const char *output_file){printf("
复制链接

扫一扫