C++ 检查文件编码的两种方法:第一种是 Qt 的方法,来自 Gitee 上的 Qt 开源项目;第二种是在百度上找到的纯 C++ 方法。推荐使用第一种。
//检查文件编码 0=ANSI 1=UTF-16LE 2=UTF-16BE 3=UTF-8 4=UTF-8BOM
Qt 的方法:
int DataCsv::findCode(const QString &fileName)
{
//假定默认编码utf8
int code = 3;
QFile file(fileName);
if (file.open(QIODevice::ReadOnly)) {
//读取3字节用于判断
QByteArray buffer = file.read(3);
quint8 b1 = buffer.at(0);
quint8 b2 = buffer.at(1);
quint8 b3 = buffer.at(2);
if (b1 == 0xFF && b2 == 0xFE) {
code = 1;
} else if (b1 == 0xFE && b2 == 0xFF) {
code = 2;
} else if (b1 == 0xEF && b2 == 0xBB && b3 == 0xBF) {
code = 4;
} else {
//尝试用utf8转换,如果可用字符数大于0,则表示是ansi编码
QTextCodec::ConverterState state;
QTextCodec *codec = QTextCodec::codecForName("utf-8");
codec->toUnicode(buffer.constData(), buffer.size(), &state);
if (state.invalidChars > 0) {
code = 0;
}
}
file.close();
}
return code;
}
// Pure C++ approach (the Qt function above is simpler and is the recommended one).

/**
 * Checks whether a file contains well-formed UTF-8 with at least one
 * non-ASCII character.
 *
 * The file is read in binary mode and validated byte by byte against the
 * well-formed sequences of RFC 3629: lead bytes C2..F4 only, no overlong
 * encodings, no UTF-16 surrogates, nothing above U+10FFFF. This function does
 * not look for a byte-order mark.
 *
 * @param file_name Path of the file to inspect.
 * @return true when every byte sequence is valid UTF-8 and at least one
 *         multi-byte character is present; false when the file cannot be
 *         opened, contains an invalid or truncated sequence, or is pure
 *         ASCII (pure ASCII cannot be told apart from ANSI).
 */
bool check_utf8_without_bom(const std::string& file_name)
{
    // Binary mode: text mode may translate or truncate the byte stream on
    // some platforms, which would corrupt the validation.
    std::ifstream file_in(file_name, std::ios::in | std::ios::binary);
    if (!file_in.is_open())
    {
        std::cout << "打开文件失败" << std::endl;
        return false;
    }

    int pending = 0;               // continuation bytes still expected
    unsigned char lowest = 0x80;   // allowed range for the next continuation byte
    unsigned char highest = 0xBF;
    bool saw_non_ascii = false;

    char raw = 0;
    while (file_in.get(raw))
    {
        const unsigned char ch = static_cast<unsigned char>(raw);

        if (pending > 0)
        {
            if (ch < lowest || ch > highest)
            {
                return false;
            }
            // Only the first continuation byte has a restricted range.
            lowest = 0x80;
            highest = 0xBF;
            --pending;
            continue;
        }

        if (ch < 0x80)
        {
            continue;  // plain ASCII
        }
        saw_non_ascii = true;

        if (ch >= 0xC2 && ch <= 0xDF)
        {
            pending = 1;
        }
        else if (ch >= 0xE0 && ch <= 0xEF)
        {
            pending = 2;
            if (ch == 0xE0)
            {
                lowest = 0xA0;   // reject overlong 3-byte forms
            }
            else if (ch == 0xED)
            {
                highest = 0x9F;  // reject UTF-16 surrogates
            }
        }
        else if (ch >= 0xF0 && ch <= 0xF4)
        {
            pending = 3;
            if (ch == 0xF0)
            {
                lowest = 0x90;   // reject overlong 4-byte forms
            }
            else if (ch == 0xF4)
            {
                highest = 0x8F;  // reject code points above U+10FFFF
            }
        }
        else
        {
            // Stray continuation byte (80..BF), overlong lead (C0, C1) or a
            // byte that never appears in UTF-8 (F5..FF).
            return false;
        }
    }

    // A sequence cut off at end of file is invalid; pure ASCII is not
    // reported as UTF-8.
    return pending == 0 && saw_non_ascii;
}
// Text encodings reported by the encoding check.
enum TEXT_TYPE
{
    TEXT_ANSI = 0,   // 0: no BOM and not valid UTF-8
    TEXT_UTF8,       // 1: UTF-8 without BOM
    TEXT_UTF8_BOM,   // 2: UTF-8 with BOM
    TEXT_UTF16_LE,   // 3: UTF-16 little endian
    TEXT_UTF16_BE,   // 4: UTF-16 big endian
    TEXT_UNKNOW      // 5: encoding could not be determined
};
/**
 * Determines the text encoding of a file.
 *
 * Recognised signatures at the start of the file:
 *   FF FE     -> UTF-16 little endian
 *   FE FF     -> UTF-16 big endian
 *   EF BB BF  -> UTF-8 with BOM
 * Without a BOM the decision is delegated to check_utf8_without_bom():
 * TEXT_UTF8 when it returns true, TEXT_ANSI otherwise.
 *
 * @param file_name Path of the file to inspect.
 * @return The detected encoding, or TEXT_UNKNOW when the file cannot be opened.
 */
TEXT_TYPE check_text_encode(const std::string& file_name)
{
    std::ifstream file_in(file_name, std::ios::binary);
    if (!file_in.is_open())
    {
        std::cout << "打开文件失败" << std::endl;
        return TEXT_UNKNOW;
    }

    // Read up to three bytes. The file may be shorter, so only the bytes
    // actually read (gcount) take part in the comparisons below.
    unsigned char head[3] = {0, 0, 0};
    file_in.read(reinterpret_cast<char*>(head), sizeof(head));
    const std::streamsize got = file_in.gcount();
    file_in.close();

    if (got >= 2)
    {
        if (head[0] == 0xFF && head[1] == 0xFE)
        {
            return TEXT_UTF16_LE;
        }
        if (head[0] == 0xFE && head[1] == 0xFF)
        {
            return TEXT_UTF16_BE;
        }
    }
    // The UTF-8 BOM is three bytes long; all three must match.
    if (got >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
    {
        return TEXT_UTF8_BOM;
    }

    // No BOM: tell UTF-8 from ANSI by validating the content.
    return check_utf8_without_bom(file_name) ? TEXT_UTF8 : TEXT_ANSI;
}