Windows-1251 是一种主要在俄罗斯等使用西里尔字母的地区使用的单字节编码,并不通用。Mini-XML 好像无法解析这种编码,但客户提供的恰恰是一个采用这种编码的文件的 URL,需要解析其中的数据。
开源的编码转换库体积又太大,只好用 C 语言自己写一个。有些字符(0x80–0xBF 这一段)的转换没有什么规律,只能逐个一一对应,是一种很耿直的转换方式,哈哈~~。不过以后如果再遇到相同的问题,就不用重新写一次了,可以节省时间。
这个程序将一个 Windows-1251 编码的文件转换成 UTF-8 编码的文件。
#include <stdio.h>
#include <string.h>
// Convert a byte below 0x80 to UTF-8.
// Such bytes are emitted unchanged, so the input byte is simply stored in
// out[0]. The value of char_in is not range-checked, and nothing beyond
// out[0] is written (no terminator is appended).
void win1251char_utf8_1(unsigned char char_in, unsigned char* out)
{
    *out = char_in;
}
// Convert a byte in the range 0x80-0xBF to UTF-8.
// These 64 bytes follow no arithmetic pattern, so each one is looked up in a
// table of Unicode code points and the code point is then encoded as a
// 2-byte or 3-byte UTF-8 sequence starting at out[0].
//
// Only the bytes of the sequence are written: out[2] is left untouched for
// 2-byte sequences and no terminator is appended, so the caller needs at
// least 3 writable bytes and its own way of telling the length.
// Nothing at all is written for byte 0x98 (it has no table entry) or for any
// char_in outside 0x80-0xBF.
void win1251char_utf8_2(unsigned char char_in, unsigned char* out)
{
    // Code point for each byte 0x80..0xBF; 0 marks "no output".
    static const unsigned short code_points[64] = {
        /* 0x80 */ 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
        /* 0x88 */ 0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
        /* 0x90 */ 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
        /* 0x98 */ 0x0000, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
        /* 0xA0 */ 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
        /* 0xA8 */ 0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
        /* 0xB0 */ 0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
        /* 0xB8 */ 0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
    };
    unsigned int cp;

    if (char_in < 0x80 || char_in > 0xbf)
    {
        return;
    }

    cp = code_points[char_in - 0x80];
    if (cp == 0)
    {
        return;
    }

    if (cp < 0x800)
    {
        // 110xxxxx 10xxxxxx
        out[0] = (unsigned char)(0xc0 | (cp >> 6));
        out[1] = (unsigned char)(0x80 | (cp & 0x3f));
    }
    else
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = (unsigned char)(0xe0 | (cp >> 12));
        out[1] = (unsigned char)(0x80 | ((cp >> 6) & 0x3f));
        out[2] = (unsigned char)(0x80 | (cp & 0x3f));
    }
}
// Convert a byte in the range 0xC0-0xFF to a 2-byte UTF-8 sequence.
// This range maps arithmetically:
//   0xC0-0xEF -> 0xD0, char_in - 0x30   (0xD0 0x90 .. 0xD0 0xBF)
//   0xF0-0xFF -> 0xD1, char_in - 0x70   (0xD1 0x80 .. 0xD1 0x8F)
// char_in is not range-checked; a byte below 0xC0 goes through the first
// formula with the subtraction wrapping to 8 bits. Exactly out[0] and out[1]
// are written, with no terminator.
void win1251char_utf8_3(unsigned char char_in, unsigned char* out)
{
    const int below_f0 = (char_in < 0xf0);

    out[0] = below_f0 ? 0xd0 : 0xd1;
    out[1] = (unsigned char)(char_in - (below_f0 ? 0x30 : 0x70));
}
// Convert the file "1251.xml" (Windows-1251) into "utf8.xml" (UTF-8).
//
// The input is read in fixed-size chunks; every input byte is converted on
// its own (each byte becomes at most 3 output bytes), and the converted chunk
// is written out before the next one is read.
//
// Returns 0 on success and -1 if a file cannot be opened or a read/write
// error occurs. Input bytes for which the conversion helpers produce no
// output are skipped; their total count is reported on stderr.
int main(void)
{
    enum { CHUNK_SIZE = 100, MAX_UTF8_PER_BYTE = 3 };

    const char *file_in = "1251.xml";
    const char *file_out = "utf8.xml";
    unsigned char buf_in[CHUNK_SIZE];
    unsigned char buf_out[CHUNK_SIZE * MAX_UTF8_PER_BYTE];
    unsigned long dropped = 0;
    size_t real_len;
    int rc = 0;
    FILE *fp_in;
    FILE *fp_out;

    // Open the input first so that a missing input file does not create or
    // truncate the output file.
    fp_in = fopen(file_in, "rb");
    if (fp_in == NULL)
    {
        fprintf(stderr, "Open file failed!\n");
        return -1;
    }

    // "wb": the output is raw UTF-8 bytes. Text mode may rewrite '\n' on
    // platforms that distinguish text and binary streams.
    fp_out = fopen(file_out, "wb");
    if (fp_out == NULL)
    {
        fprintf(stderr, "Open file failed!\n");
        fclose(fp_in);
        return -1;
    }

    while ((real_len = fread(buf_in, 1, sizeof buf_in, fp_in)) > 0)
    {
        size_t k = 0;
        size_t i;

        for (i = 0; i < real_len; i++)
        {
            const unsigned char c = buf_in[i];
            // The helpers return no length, so the sequence is built in a
            // zeroed scratch buffer and measured afterwards. UTF-8 multi-byte
            // sequences never contain a zero byte, and seq[3] is never
            // written, so strlen() is safe here.
            unsigned char seq[MAX_UTF8_PER_BYTE + 1] = {0};
            size_t seq_len;

            if (c < 0x80)
            {
                // Single-byte case; may legitimately be a zero byte, so it is
                // stored directly instead of going through strlen().
                win1251char_utf8_1(c, &buf_out[k]);
                k++;
                continue;
            }

            if (c >= 0xc0)
            {
                win1251char_utf8_3(c, seq);
            }
            else
            {
                win1251char_utf8_2(c, seq);
            }

            seq_len = strlen((const char *)seq);
            if (seq_len == 0)
            {
                // The helper produced nothing for this byte (no mapping).
                dropped++;
                continue;
            }

            memcpy(buf_out + k, seq, seq_len);
            k += seq_len;
        }

        if (fwrite(buf_out, 1, k, fp_out) != k)
        {
            fprintf(stderr, "Write file failed!\n");
            rc = -1;
            break;
        }
    }

    // fread() returning 0 means either end-of-file or a read error.
    if (rc == 0 && ferror(fp_in))
    {
        fprintf(stderr, "Read file failed!\n");
        rc = -1;
    }

    fclose(fp_in);

    // fclose() flushes buffered output, so a failure here is a write failure.
    if (fclose(fp_out) != 0 && rc == 0)
    {
        fprintf(stderr, "Write file failed!\n");
        rc = -1;
    }

    if (dropped > 0)
    {
        fprintf(stderr, "Warning: skipped %lu input byte(s) with no UTF-8 mapping\n", dropped);
    }

    return rc;
}
代码在 Linux 下编译、运行正常,成功将 1251.xml 文件转换成了 utf8.xml 文件,转换后的文件用大部分编辑器都可以正常打开了。