C语言实现windows1251编码转utf-8编码

Windows-1251 是一种面向西里尔字母(俄语等)的单字节编码,并不通用。Mini-XML 似乎无法解析这种编码,但客户提供的 URL 指向的恰恰是一个使用该编码的文件,需要解析其中的数据。
开源的编码转换库又太大,只好用C语言写一个。有些字符转换没什么规律,只能一一对应,很耿直的转换方式,哈哈~~,不过以后如果遇到相同的问题,不用再写一次,节省时间。
这个程序实现将一个windows1251编码的文件转换成utf-8编码的文件。


#include <stdio.h>
#include <string.h>

// Convert a character whose code is below 0x80 to UTF-8.
// The byte is stored unchanged as the single output byte out[0].
void win1251char_utf8_1(unsigned char char_in, unsigned char* out)
{
	*out = char_in;
}

// Convert a character whose code is in 0x80-0xBF to UTF-8. These 64 codes
// follow no arithmetic pattern, so each one is looked up in a table.
//
// char_in: the input byte.
// out:     receives 2 or 3 bytes starting at out[0]; bytes beyond the
//          sequence are left untouched. Nothing is written for 0x98 (which
//          has no entry) or for a byte outside 0x80-0xBF.
void win1251char_utf8_2(unsigned char char_in, unsigned char* out)
{
	// One row per input code, starting at 0x80. A zero byte ends the
	// sequence early (no UTF-8 byte stored here is zero).
	static const unsigned char utf8_seq[64][3] = {
		/* 0x80 */ {0xd0, 0x82, 0x00},
		/* 0x81 */ {0xd0, 0x83, 0x00},
		/* 0x82 */ {0xe2, 0x80, 0x9a},
		/* 0x83 */ {0xd1, 0x93, 0x00},
		/* 0x84 */ {0xe2, 0x80, 0x9e},
		/* 0x85 */ {0xe2, 0x80, 0xa6},
		/* 0x86 */ {0xe2, 0x80, 0xa0},
		/* 0x87 */ {0xe2, 0x80, 0xa1},
		/* 0x88 */ {0xe2, 0x82, 0xac},
		/* 0x89 */ {0xe2, 0x80, 0xb0},
		/* 0x8a */ {0xd0, 0x89, 0x00},
		/* 0x8b */ {0xe2, 0x80, 0xb9},
		/* 0x8c */ {0xd0, 0x8a, 0x00},
		/* 0x8d */ {0xd0, 0x8c, 0x00},
		/* 0x8e */ {0xd0, 0x8b, 0x00},
		/* 0x8f */ {0xd0, 0x8f, 0x00},
		/* 0x90 */ {0xd1, 0x92, 0x00},
		/* 0x91 */ {0xe2, 0x80, 0x98},
		/* 0x92 */ {0xe2, 0x80, 0x99},
		/* 0x93 */ {0xe2, 0x80, 0x9c},
		/* 0x94 */ {0xe2, 0x80, 0x9d},
		/* 0x95 */ {0xe2, 0x80, 0xa2},
		/* 0x96 */ {0xe2, 0x80, 0x93},
		/* 0x97 */ {0xe2, 0x80, 0x94},
		/* 0x98 */ {0x00, 0x00, 0x00},	// empty: nothing is emitted
		/* 0x99 */ {0xe2, 0x84, 0xa2},
		/* 0x9a */ {0xd1, 0x99, 0x00},
		/* 0x9b */ {0xe2, 0x80, 0xba},
		/* 0x9c */ {0xd1, 0x9a, 0x00},
		/* 0x9d */ {0xd1, 0x9c, 0x00},
		/* 0x9e */ {0xd1, 0x9b, 0x00},
		/* 0x9f */ {0xd1, 0x9f, 0x00},
		/* 0xa0 */ {0xc2, 0xa0, 0x00},
		/* 0xa1 */ {0xd0, 0x8e, 0x00},
		/* 0xa2 */ {0xd1, 0x9e, 0x00},
		/* 0xa3 */ {0xd0, 0x88, 0x00},
		/* 0xa4 */ {0xc2, 0xa4, 0x00},
		/* 0xa5 */ {0xd2, 0x90, 0x00},
		/* 0xa6 */ {0xc2, 0xa6, 0x00},
		/* 0xa7 */ {0xc2, 0xa7, 0x00},
		/* 0xa8 */ {0xd0, 0x81, 0x00},
		/* 0xa9 */ {0xc2, 0xa9, 0x00},
		/* 0xaa */ {0xd0, 0x84, 0x00},
		/* 0xab */ {0xc2, 0xab, 0x00},
		/* 0xac */ {0xc2, 0xac, 0x00},
		/* 0xad */ {0xc2, 0xad, 0x00},
		/* 0xae */ {0xc2, 0xae, 0x00},
		/* 0xaf */ {0xd0, 0x87, 0x00},
		/* 0xb0 */ {0xc2, 0xb0, 0x00},
		/* 0xb1 */ {0xc2, 0xb1, 0x00},
		/* 0xb2 */ {0xd0, 0x86, 0x00},
		/* 0xb3 */ {0xd1, 0x96, 0x00},
		/* 0xb4 */ {0xd2, 0x91, 0x00},
		/* 0xb5 */ {0xc2, 0xb5, 0x00},
		/* 0xb6 */ {0xc2, 0xb6, 0x00},
		/* 0xb7 */ {0xc2, 0xb7, 0x00},
		/* 0xb8 */ {0xd1, 0x91, 0x00},
		/* 0xb9 */ {0xe2, 0x84, 0x96},
		/* 0xba */ {0xd1, 0x94, 0x00},
		/* 0xbb */ {0xc2, 0xbb, 0x00},
		/* 0xbc */ {0xd1, 0x98, 0x00},
		/* 0xbd */ {0xd0, 0x85, 0x00},
		/* 0xbe */ {0xd1, 0x95, 0x00},
		/* 0xbf */ {0xd1, 0x97, 0x00}
	};
	const unsigned char* seq;
	int n;

	// Bytes outside the table's range produce no output at all.
	if(char_in < 0x80 || char_in > 0xbf)
	{
		return;
	}

	seq = utf8_seq[char_in - 0x80];
	for(n = 0; n < 3 && seq[n] != 0x00; n++)
	{
		out[n] = seq[n];
	}
}

// Convert a character whose code is in 0xC0-0xFF to UTF-8.
// Always writes exactly two bytes: out[0] is the lead byte (0xd0 for input
// up to 0xef, 0xd1 above that) and out[1] is the input shifted down by a
// fixed offset (0x30 or 0x70 respectively).
void win1251char_utf8_3(unsigned char char_in, unsigned char* out)
{
	const int low_half = (char_in < 0xf0);

	out[0] = low_half ? 0xd0 : 0xd1;
	out[1] = (unsigned char)(char_in - (low_half ? 0x30 : 0x70));
}

// Convert the file "1251.xml" (Windows-1251) into "utf8.xml" (UTF-8).
//
// The input is read in fixed-size chunks; every input byte expands to at
// most three output bytes, so the output buffer is three times the chunk
// size. Bytes for which the conversion helpers produce no output (0x98)
// are dropped.
//
// Returns 0 on success and -1 if a file cannot be opened or a read/write
// error occurs.
int main()
{
	enum { CHUNK_SIZE = 100, MAX_UTF8_LEN = 3 };
	const char* file_in = "1251.xml";
	const char* file_out = "utf8.xml";
	unsigned char buf_in[CHUNK_SIZE];
	unsigned char buf_out[CHUNK_SIZE * MAX_UTF8_LEN];
	unsigned char buf[MAX_UTF8_LEN + 1];
	size_t real_len = 0;
	size_t i = 0, k = 0, len = 0;
	int rc = 0;
	FILE* fp_in = NULL;
	FILE* fp_out = NULL;

	// Open the input first so a missing input file does not create or
	// truncate the output file.
	fp_in = fopen(file_in, "rb");
	if(fp_in == NULL)
	{
		printf("Open file failed!\n");
		return -1;
	}

	// Binary mode: the input is read as raw bytes, so the output must not
	// go through text-mode newline translation either.
	fp_out = fopen(file_out, "wb");
	if(fp_out == NULL)
	{
		printf("Open file failed!\n");
		fclose(fp_in);
		return -1;
	}

	while((real_len = fread(buf_in, 1, sizeof(buf_in), fp_in)) != 0)
	{
		k = 0;
		for(i = 0; i < real_len; i++)
		{
			unsigned char c = buf_in[i];

			if(c < 0x80)
			{
				// Codes below 0x80 are copied through unchanged.
				buf_out[k++] = c;
				continue;
			}

			// The helpers do not report how many bytes they wrote, so the
			// scratch buffer is zeroed first and the length is taken from
			// the first zero byte (the sequences contain no zero bytes).
			memset(buf, 0, sizeof(buf));
			if(c >= 0xc0)
			{
				win1251char_utf8_3(c, buf);
			}
			else
			{
				win1251char_utf8_2(c, buf);
			}
			len = strlen((const char*)buf);
			memcpy(buf_out + k, buf, len);
			k += len;
		}

		if(fwrite(buf_out, 1, k, fp_out) != k)
		{
			printf("Write file failed!\n");
			rc = -1;
			break;
		}
	}

	// fread returning 0 means either end of file or an error; tell them apart.
	if(rc == 0 && ferror(fp_in))
	{
		printf("Read file failed!\n");
		rc = -1;
	}

	fclose(fp_in);
	// fclose flushes buffered output, so a failure here is a write failure.
	if(fclose(fp_out) != 0 && rc == 0)
	{
		printf("Write file failed!\n");
		rc = -1;
	}

	return rc;
}



代码在linux下编译执行OK,成功将1251.xml文件转换成utf8.xml文件,现在用大部分编辑器都可以打开了。


  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值