1.概述
使用 C++ 判断文本文件的编码,支持 UTF-8(无 BOM)、UTF-8(带 BOM)、UTF-16 LE 和 UTF-16 BE。其中带 BOM 的编码都有固定的文件头,判断较为简单,在此不赘述;难点在于无 BOM 的 UTF-8 的判断。
2.代码实现
/// Takes a file name and returns the name of the detected encoding.
string readFile(QString fileName);
/// Reports whether the buffer is UTF-8 (without BOM) encoded; size is the
/// length of the buffer in bytes.
bool IsUTF8WithoutBOM(char* _data,int size);
void readFile(QString fileName)
{
ifstream fin(fileName.toStdString().c_str());
unsigned char s2;
fin.read((char*)&s2,1);
int p = s2<<8;
fin.read((char*)&s2,1);
p|=s2;
fin.close();
string code;
switch(p)
{
case 0xfffe:
code ="UTF-16 LE";
break;
case 0xfeff:
code ="UTF-16 BE";
break;
case 0xefbb:
code ="utf-8 with BOM";
break;
default:
QFile file(fileName);
if(file.open(QIODevice::ReadOnly))
{
int size = file.bytesAvailable();
char* buffer = new char[size];
file.read(buffer,size);
if(IsUTF8WithoutBOM(buffer,size))
{
code = "UTF-8 without BOM";
break;
}
}
code = "GBK";
break;
}
return code;
}
/// Reports whether a byte buffer looks like UTF-8 text that contains at least
/// one multi-byte character.
///
/// The check is structural: every lead byte must announce a 2-, 3- or 4-byte
/// sequence and must be followed by exactly that many continuation bytes
/// (10xxxxxx). Lead bytes that can never occur in UTF-8 (0xC0, 0xC1 and
/// 0xF5-0xFF) are rejected. Second-byte range restrictions (e.g. surrogates)
/// are not checked.
///
/// @param _data buffer to inspect; it is only read, never modified
/// @param size  number of bytes in _data
/// @return true if the buffer is structurally valid UTF-8 and contains at
///         least one non-ASCII character; false for pure ASCII, for an empty
///         or null buffer, and for any malformed sequence
bool IsUTF8WithoutBOM(char* _data,int size)
{
    if (!_data || size <= 0)
    {
        return false;
    }

    // Number of continuation bytes still expected for the current character;
    // zero means the next byte must start a new character.
    int encodingBytesCount = 0;
    bool allTextsAreASCIIChars = true;

    for (int i = 0; i < size; ++i)
    {
        // Work on unsigned values so comparisons do not depend on whether
        // plain char is signed on this platform.
        const unsigned char current = static_cast<unsigned char>(_data[i]);

        if (encodingBytesCount > 0)
        {
            // Inside a multi-byte character: the byte must start with 10.
            if ((current & 0xC0) != 0x80)
            {
                return false;
            }
            --encodingBytesCount;
            continue;
        }

        if (current < 0x80)
        {
            // ASCII chars, from 0x00-0x7F.
            continue;
        }

        allTextsAreASCIIChars = false;

        if (current >= 0xC2 && current <= 0xDF)
        {
            encodingBytesCount = 1;   // 110xxxxx: two-byte character
        }
        else if (current >= 0xE0 && current <= 0xEF)
        {
            encodingBytesCount = 2;   // 1110xxxx: three-byte character
        }
        else if (current >= 0xF0 && current <= 0xF4)
        {
            encodingBytesCount = 3;   // 11110xxx: four-byte character
        }
        else
        {
            // Stray continuation byte or a byte that is never valid in UTF-8.
            return false;
        }
    }

    // A character truncated at the end of the buffer is malformed; pure ASCII
    // is deliberately not reported as UTF-8.
    return encodingBytesCount == 0 && !allTextsAreASCIIChars;
}
3.utf-8的编码规则
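代码中的关键点是通过 encodingBytesCount 来判断当前字节是否为一个字符的起始字节(代码中已给出注释)。UTF-8 的编码规则是:单字节字符的最高位为 0(0xxxxxxx);多字节字符的首字节以 110、1110 或 11110 开头,前导 1 的个数即该字符占用的字节数;其余后续字节均以 10 开头(10xxxxxx)。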