在日常开发过程中,经常需要读取某个文件的内容,相信大家也遇到过读取后出现乱码的问题。如果我们在读取文件之前,先检测文件的编码格式,然后在读取时设置对应的编码,问题是不是就解决了呢?经过笔者认(上)真(网)思(百)索(度)一番,便有了今天这篇“自动识别文件编码”的文章。
示例代码
#include <QtCore/QCoreApplication>
#include <QTextCodec>
#include <QFile>
#include <QDebug>
/*!
 * \brief Text encodings that FileCharacterEncoding() can report.
 *
 * The numeric values are printed by callers, so they are spelled out
 * explicitly and must stay stable.
 */
enum class EncodingFormat : int
{
    ANSI    = 0, //!< Non-UTF-8 single/multi-byte text (the original note says GBK)
    UTF16LE = 1, //!< UTF-16 little endian, identified by its byte-order mark
    UTF16BE = 2, //!< UTF-16 big endian, identified by its byte-order mark
    UTF8    = 3, //!< UTF-8 without a byte-order mark
    UTF8BOM = 4  //!< UTF-8 with a leading byte-order mark
};
/*!
 * \brief Detects the text encoding of a file.
 *
 * Detection order:
 *  1. A byte-order mark at the start of the file selects UTF-16 LE,
 *     UTF-16 BE or UTF-8 with BOM.
 *  2. Otherwise the whole content is decoded as UTF-8; any invalid byte, or a
 *     multi-byte sequence left unfinished at end of file, classifies the file
 *     as ANSI.
 *
 * \param fileName Path of the file to inspect.
 * \return The detected encoding. EncodingFormat::UTF8 is returned when the
 *         file cannot be opened, is empty, or contains no invalid UTF-8.
 *
 * \note Files without a BOM are read until the first invalid byte (or to the
 *       end), so the cost grows with file size.
 */
EncodingFormat FileCharacterEncoding(const QString& fileName)
{
    // Assume UTF-8 until proven otherwise.
    EncodingFormat code = EncodingFormat::UTF8;

    QFile file(fileName);
    if (!file.open(QIODevice::ReadOnly))
    {
        return code;
    }
    // QFile closes the handle in its destructor, so early returns are safe.

    // Up to 3 bytes are enough to recognise every BOM handled here. The file
    // may be shorter than that, so every index is guarded by the actual size.
    const QByteArray header = file.read(3);
    const int headerSize = header.size();
    const quint8 first  = (headerSize > 0) ? static_cast<quint8>(header.at(0)) : 0;
    const quint8 second = (headerSize > 1) ? static_cast<quint8>(header.at(1)) : 0;
    const quint8 third  = (headerSize > 2) ? static_cast<quint8>(header.at(2)) : 0;

    if (headerSize >= 2 && first == 0xFF && second == 0xFE)
    {
        return EncodingFormat::UTF16LE;
    }
    if (headerSize >= 2 && first == 0xFE && second == 0xFF)
    {
        return EncodingFormat::UTF16BE;
    }
    if (headerSize >= 3 && first == 0xEF && second == 0xBB && third == 0xBF)
    {
        return EncodingFormat::UTF8BOM;
    }

    QTextCodec* tc = QTextCodec::codecForName("utf-8");
    if (tc == nullptr)
    {
        // No UTF-8 codec available: nothing can be validated, keep the default.
        return code;
    }

    // Feed the file through one stateful converter so that a multi-byte
    // sequence split across two chunks is still decoded correctly.
    const qint64 chunkSize = 64 * 1024;
    QTextCodec::ConverterState cs;
    QByteArray chunk = header;
    while (!chunk.isEmpty())
    {
        tc->toUnicode(chunk.constData(), chunk.size(), &cs);
        if (cs.invalidChars > 0)
        {
            break; // already known not to be UTF-8, no need to read further
        }
        chunk = file.read(chunkSize);
    }

    // Bytes still pending after the last chunk mean the file ends in the
    // middle of a multi-byte sequence, which is not valid UTF-8 either.
    if (cs.invalidChars > 0 || cs.remainingChars > 0)
    {
        code = EncodingFormat::ANSI;
    }
    return code;
}
如何使用?
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
EncodingFormat code = FileCharacterEncoding(QStringLiteral("兰亭集序-ANSI.txt"));
qDebug() << "code=" << (int)code;
code = FileCharacterEncoding(QStringLiteral("兰亭集序-UTF-16LE.txt"));
qDebug() << "code=" << (int)code;
code = FileCharacterEncoding(QStringLiteral("兰亭集序-UTF-16BE.txt"));
qDebug() << "code=" << (int)code;
code = FileCharacterEncoding(QStringLiteral("兰亭集序-UTF-8.txt"));
qDebug() << "code=" << (int)code;
code = FileCharacterEncoding(QStringLiteral("兰亭集序-BOM-UTF-8.txt"));
qDebug() << "code=" << (int)code;
return a.exec();
}