ANSI编码转UTF8编码 实现代码
配置表默认常以ANSI(系统本地代码页)编码保存,其中的汉字等非ASCII字符在其它语言的系统下会显示为乱码。因此许多开源库和工具都采用UTF8作为编码方式,既解决了乱码问题,又节省资源。但配置表由许多人共同维护,经常有人没有把配置表保存为UTF8格式,给显示带来不少麻烦,于是写了点代码来进行批量转换。
下面这个函数是核心转换函数:
/**
 * Converts a text file in place from the system ANSI code page (CP_ACP)
 * to UTF-8 with a leading byte-order mark (EF BB BF).
 *
 * Steps: clear the read-only attribute if set, read the whole file as raw
 * bytes, return early if it already starts with a UTF-8 BOM, convert
 * ANSI -> UTF-16 -> UTF-8 with the Win32 conversion APIs, then overwrite
 * the file with BOM + converted text.
 *
 * Limitations: only the UTF-8 BOM is recognised. Files in UTF-16/UTF-32 or
 * BOM-less UTF-8 are treated as ANSI and will be mangled.
 *
 * @param file_full_name NUL-terminated path of the file to convert.
 *        Passing a wide path to the stream constructors relies on the
 *        MSVC standard-library extension, as the original code did.
 */
void ansi_to_utf8(TCHAR file_full_name[])
{
    std::wcout << "convert file : " << file_full_name << std::endl;

    const DWORD dwAttrs = GetFileAttributes(file_full_name);
    if (dwAttrs == INVALID_FILE_ATTRIBUTES) return;
    if (dwAttrs & FILE_ATTRIBUTE_READONLY)
    {
        // Drop only the read-only bit so the other attributes are kept;
        // FILE_ATTRIBUTE_NORMAL is valid only when it is the sole attribute.
        DWORD writable_attrs = dwAttrs & ~static_cast<DWORD>(FILE_ATTRIBUTE_READONLY);
        if (writable_attrs == 0) writable_attrs = FILE_ATTRIBUTE_NORMAL;
        SetFileAttributes(file_full_name, writable_attrs);
    }

    // Read the entire file in binary mode so that no newline translation or
    // Ctrl-Z handling alters the bytes, and no fixed-size buffer truncates it.
    std::vector<char> ansi_text;
    {
        std::ifstream in_stream(file_full_name, std::ios::in | std::ios::binary);
        if (!in_stream.is_open())
        {
            std::wcout << "cannot open " << file_full_name << " for reading" << std::endl;
            return;
        }
        in_stream.seekg(0, std::ios::end);
        const std::streamoff file_size = in_stream.tellg();
        // The Win32 conversion APIs take int lengths, and UTF-8 output can be
        // up to three bytes per input byte, so cap the input accordingly.
        const std::streamoff max_input_bytes = 0x7FFFFFFF / 3 - 3;
        if (file_size < 0 || file_size > max_input_bytes)
        {
            std::wcout << file_full_name << " cannot be sized or is too large, skipped" << std::endl;
            return;
        }
        in_stream.seekg(0, std::ios::beg);
        ansi_text.resize(static_cast<size_t>(file_size));
        if (!ansi_text.empty())
        {
            in_stream.read(&ansi_text[0], static_cast<std::streamsize>(file_size));
            if (in_stream.gcount() != static_cast<std::streamsize>(file_size))
            {
                // Never overwrite a file that could not be read completely.
                std::wcout << "failed to read " << file_full_name << std::endl;
                return;
            }
        }
    }

    // Compare as unsigned char so the test does not depend on char signedness,
    // and require at least three bytes so nothing uninitialised is inspected.
    const bool has_utf8_bom = ansi_text.size() >= 3
        && static_cast<unsigned char>(ansi_text[0]) == 0xEF
        && static_cast<unsigned char>(ansi_text[1]) == 0xBB
        && static_cast<unsigned char>(ansi_text[2]) == 0xBF;
    if (has_utf8_bom) //UTF8
    {
        std::wcout << file_full_name << " is already UTF8" << std::endl;
        return;
    }

    // Output always starts with the UTF-8 BOM.
    std::vector<char> utf8_text;
    utf8_text.push_back(static_cast<char>(0xEF));
    utf8_text.push_back(static_cast<char>(0xBB));
    utf8_text.push_back(static_cast<char>(0xBF));

    // The conversion APIs reject zero-length input, so an empty file simply
    // becomes a BOM-only file (the same result the original code produced).
    if (!ansi_text.empty())
    {
        const int ansi_length = static_cast<int>(ansi_text.size());

        // Pass 1: ANSI -> UTF-16. First call queries the required length.
        const int unicode_length =
            MultiByteToWideChar(CP_ACP, 0, &ansi_text[0], ansi_length, NULL, 0);
        if (unicode_length <= 0)
        {
            std::wcout << "ANSI to Unicode conversion failed. Error is " << GetLastError() << std::endl;
            return;
        }
        std::vector<wchar_t> unicode_text(static_cast<size_t>(unicode_length));
        if (MultiByteToWideChar(CP_ACP, 0, &ansi_text[0], ansi_length,
                                &unicode_text[0], unicode_length) != unicode_length)
        {
            std::wcout << "ANSI to Unicode conversion failed. Error is " << GetLastError() << std::endl;
            return;
        }

        // Pass 2: UTF-16 -> UTF-8, written directly after the BOM.
        const int utf8_length =
            WideCharToMultiByte(CP_UTF8, 0, &unicode_text[0], unicode_length, NULL, 0, NULL, NULL);
        if (utf8_length <= 0)
        {
            std::wcout << "Unicode to UTF8 conversion failed. Error is " << GetLastError() << std::endl;
            return;
        }
        utf8_text.resize(3 + static_cast<size_t>(utf8_length));
        if (WideCharToMultiByte(CP_UTF8, 0, &unicode_text[0], unicode_length,
                                &utf8_text[3], utf8_length, NULL, NULL) != utf8_length)
        {
            std::wcout << "Unicode to UTF8 conversion failed. Error is " << GetLastError() << std::endl;
            return;
        }
    }

    // Only now, with the converted text fully in memory, truncate and rewrite.
    std::ofstream out_stream(file_full_name, std::ios::out | std::ios::binary | std::ios::trunc);
    if (!out_stream.is_open())
    {
        std::wcout << "cannot open " << file_full_name << " for writing" << std::endl;
        return;
    }
    out_stream.write(&utf8_text[0], static_cast<std::streamsize>(utf8_text.size()));
    out_stream.close();
    if (!out_stream)
    {
        std::wcout << "failed to write " << file_full_name << std::endl;
    }
}
传入一个文本文件名,函数会先去掉它的只读属性,然后依靠BOM来识别文件是ANSI格式还是UTF8格式。对于其它格式(如UTF16、UTF32)则会错误地处理,可根据需要加上判断;由于我要处理的文件里肯定没有UTF16、UTF32等格式,因此就偷懒了~~如有需要,请务必加上。读取文件内容后,先把多字节文本转成宽字节UNICODE,再把宽字节转成UTF8,所用函数均为WIN32系统API。由于时间匆忙,代码很粗糙,仅供参考其原理。
下面是我把当前目录或SYSTEM目录下所有ANSI格式的.CSV格式文件转成UTF8编码的完整程序.漏洞多多,注意只是参考,如有使用,后果自负~.~
/*
功能:将当前目录或子SYSTEM目录内的所有ANSI格式CSV文件,转换成UTF8格式
作者:qloach@foxmail.com
时间:2009-10-15
*/
#include <Windows.h>

#include <fstream>
#include <iostream>
#include <vector>

using namespace std;
/**
 * Reports whether a file name ends in ".csv" (ASCII case-insensitive).
 *
 * @param file_name NUL-terminated name held in a buffer of at most
 *                  MAX_PATH characters.
 * @return true when the last four characters are '.', 'c'/'C', 's'/'S',
 *         'v'/'V'; false for shorter names, for names with another
 *         extension, and when no terminator is found within MAX_PATH.
 */
bool is_csv_file(TCHAR file_name[])
{
    if (file_name == NULL)
    {
        return false;
    }

    // Find the terminator starting from index 0 so that names shorter than
    // four characters are not scanned past their end.
    int length = 0;
    while (length < MAX_PATH && file_name[length] != TCHAR('\0'))
    {
        ++length;
    }
    if (length == MAX_PATH || length < 4)
    {
        return false;
    }

    const TCHAR *ext = file_name + (length - 4);
    return ext[0] == TCHAR('.')
        && (ext[1] == TCHAR('c') || ext[1] == TCHAR('C'))
        && (ext[2] == TCHAR('s') || ext[2] == TCHAR('S'))
        && (ext[3] == TCHAR('v') || ext[3] == TCHAR('V'));
}
void connect_string(TCHAR des[], TCHAR buf1[], TCHAR buf2[])
{
memcpy(des,buf1,MAX_PATH);
bool bAdd = false;
for (int i = 1,j = 0; i < MAX_PATH; ++i)
{
if(bAdd == false && (des[i]==TCHAR('/0') || des[i] ==TCHAR('*')))
{
bAdd = true;
}
if (!bAdd)
{
continue;
}
des[i] = buf2[j];
if (buf2[j] == TCHAR('/0'))
{
return;
}
++j;
}
}
/**
 * Converts a text file in place from the system ANSI code page (CP_ACP)
 * to UTF-8 with a leading byte-order mark (EF BB BF).
 *
 * Steps: clear the read-only attribute if set, read the whole file as raw
 * bytes, return early if it already starts with a UTF-8 BOM, convert
 * ANSI -> UTF-16 -> UTF-8 with the Win32 conversion APIs, then overwrite
 * the file with BOM + converted text.
 *
 * Limitations: only the UTF-8 BOM is recognised. Files in UTF-16/UTF-32 or
 * BOM-less UTF-8 are treated as ANSI and will be mangled.
 *
 * @param file_full_name NUL-terminated path of the file to convert.
 *        Passing a wide path to the stream constructors relies on the
 *        MSVC standard-library extension, as the original code did.
 */
void ansi_to_utf8(TCHAR file_full_name[])
{
    std::wcout << "convert file : " << file_full_name << std::endl;

    const DWORD dwAttrs = GetFileAttributes(file_full_name);
    if (dwAttrs == INVALID_FILE_ATTRIBUTES) return;
    if (dwAttrs & FILE_ATTRIBUTE_READONLY)
    {
        // Drop only the read-only bit so the other attributes are kept;
        // FILE_ATTRIBUTE_NORMAL is valid only when it is the sole attribute.
        DWORD writable_attrs = dwAttrs & ~static_cast<DWORD>(FILE_ATTRIBUTE_READONLY);
        if (writable_attrs == 0) writable_attrs = FILE_ATTRIBUTE_NORMAL;
        SetFileAttributes(file_full_name, writable_attrs);
    }

    // Read the entire file in binary mode so that no newline translation or
    // Ctrl-Z handling alters the bytes, and no fixed-size buffer truncates it.
    std::vector<char> ansi_text;
    {
        std::ifstream in_stream(file_full_name, std::ios::in | std::ios::binary);
        if (!in_stream.is_open())
        {
            std::wcout << "cannot open " << file_full_name << " for reading" << std::endl;
            return;
        }
        in_stream.seekg(0, std::ios::end);
        const std::streamoff file_size = in_stream.tellg();
        // The Win32 conversion APIs take int lengths, and UTF-8 output can be
        // up to three bytes per input byte, so cap the input accordingly.
        const std::streamoff max_input_bytes = 0x7FFFFFFF / 3 - 3;
        if (file_size < 0 || file_size > max_input_bytes)
        {
            std::wcout << file_full_name << " cannot be sized or is too large, skipped" << std::endl;
            return;
        }
        in_stream.seekg(0, std::ios::beg);
        ansi_text.resize(static_cast<size_t>(file_size));
        if (!ansi_text.empty())
        {
            in_stream.read(&ansi_text[0], static_cast<std::streamsize>(file_size));
            if (in_stream.gcount() != static_cast<std::streamsize>(file_size))
            {
                // Never overwrite a file that could not be read completely.
                std::wcout << "failed to read " << file_full_name << std::endl;
                return;
            }
        }
    }

    // Compare as unsigned char so the test does not depend on char signedness,
    // and require at least three bytes so nothing uninitialised is inspected.
    const bool has_utf8_bom = ansi_text.size() >= 3
        && static_cast<unsigned char>(ansi_text[0]) == 0xEF
        && static_cast<unsigned char>(ansi_text[1]) == 0xBB
        && static_cast<unsigned char>(ansi_text[2]) == 0xBF;
    if (has_utf8_bom) //UTF8
    {
        std::wcout << file_full_name << " is already UTF8" << std::endl;
        return;
    }

    // Output always starts with the UTF-8 BOM.
    std::vector<char> utf8_text;
    utf8_text.push_back(static_cast<char>(0xEF));
    utf8_text.push_back(static_cast<char>(0xBB));
    utf8_text.push_back(static_cast<char>(0xBF));

    // The conversion APIs reject zero-length input, so an empty file simply
    // becomes a BOM-only file (the same result the original code produced).
    if (!ansi_text.empty())
    {
        const int ansi_length = static_cast<int>(ansi_text.size());

        // Pass 1: ANSI -> UTF-16. First call queries the required length.
        const int unicode_length =
            MultiByteToWideChar(CP_ACP, 0, &ansi_text[0], ansi_length, NULL, 0);
        if (unicode_length <= 0)
        {
            std::wcout << "ANSI to Unicode conversion failed. Error is " << GetLastError() << std::endl;
            return;
        }
        std::vector<wchar_t> unicode_text(static_cast<size_t>(unicode_length));
        if (MultiByteToWideChar(CP_ACP, 0, &ansi_text[0], ansi_length,
                                &unicode_text[0], unicode_length) != unicode_length)
        {
            std::wcout << "ANSI to Unicode conversion failed. Error is " << GetLastError() << std::endl;
            return;
        }

        // Pass 2: UTF-16 -> UTF-8, written directly after the BOM.
        const int utf8_length =
            WideCharToMultiByte(CP_UTF8, 0, &unicode_text[0], unicode_length, NULL, 0, NULL, NULL);
        if (utf8_length <= 0)
        {
            std::wcout << "Unicode to UTF8 conversion failed. Error is " << GetLastError() << std::endl;
            return;
        }
        utf8_text.resize(3 + static_cast<size_t>(utf8_length));
        if (WideCharToMultiByte(CP_UTF8, 0, &unicode_text[0], unicode_length,
                                &utf8_text[3], utf8_length, NULL, NULL) != utf8_length)
        {
            std::wcout << "Unicode to UTF8 conversion failed. Error is " << GetLastError() << std::endl;
            return;
        }
    }

    // Only now, with the converted text fully in memory, truncate and rewrite.
    std::ofstream out_stream(file_full_name, std::ios::out | std::ios::binary | std::ios::trunc);
    if (!out_stream.is_open())
    {
        std::wcout << "cannot open " << file_full_name << " for writing" << std::endl;
        return;
    }
    out_stream.write(&utf8_text[0], static_cast<std::streamsize>(utf8_text.size()));
    out_stream.close();
    if (!out_stream)
    {
        std::wcout << "failed to write " << file_full_name << std::endl;
    }
}
/**
 * Enumerates the entries matching a search pattern and converts every
 * regular file whose name ends in ".csv" with ansi_to_utf8().
 *
 * @param foldername NUL-terminated FindFirstFile pattern such as "dir\*".
 *                   The same string is handed to connect_string() as the
 *                   prefix for building each file's full path.
 */
void search_all_files(TCHAR foldername[])
{
    wcout<<L"seacrch folder is:"<<foldername<<endl;

    WIN32_FIND_DATA FindFileData;
    HANDLE hFind = FindFirstFile(foldername, &FindFileData);
    if (hFind == INVALID_HANDLE_VALUE)
    {
        // Capture the error code before any stream output can disturb it.
        const DWORD dwOpenError = GetLastError();
        wcout<<"Invalid file handle. Error is"<<dwOpenError;
        return;
    }

    // Examine every entry, including the one FindFirstFile returned.
    // "." and ".." need no special skipping: they are directories and do
    // not end in ".csv", so the filter below rejects them.
    do
    {
        const bool is_directory =
            (FindFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
        if (!is_directory && is_csv_file(FindFileData.cFileName))
        {
            TCHAR full_name[MAX_PATH] = {0};
            connect_string(full_name, foldername, FindFileData.cFileName);
            ansi_to_utf8(full_name);
        }
    } while (FindNextFile(hFind, &FindFileData) != 0);

    // Read the reason FindNextFile stopped before FindClose can overwrite it.
    const DWORD dwError = GetLastError();
    FindClose(hFind);
    if (dwError != ERROR_NO_MORE_FILES)
    {
        wcout<<"FindNextFile error. Error is"<<dwError;
        return;
    }
}
int main()
{
TCHAR path[MAX_PATH];
TCHAR des_path[MAX_PATH];
GetCurrentDirectory(230,path);
connect_string(des_path, path,L"//*");
search_all_files(des_path);
// sub system directory
TCHAR sub_path[MAX_PATH];
connect_string(sub_path, path, L"//system//*");
search_all_files(sub_path);
wcout<<"convert success!"<<endl;
Sleep(1500);
return 0;
}