识别文本文件的编码
enum ENCODETYPE { UNKNOW, ANSI, UTF8, UTF16_big_endian, UTF16_little_endian }; ENCODETYPE TellEncodeType(BYTE* pBuf,int bufLen) { ENCODETYPE filetype=ANSI; if (pBuf[0]==0xFF && pBuf[1]==0xFE) //fffe,小头,windows默认 filetype=UTF16_little_endian; else if(pBuf[0]==0xFE && pBuf[1]==0xFF ) filetype=UTF16_big_endian; else { int utf8Nums=0; //符合UTF8编码的字符个数,非Ansi部分 int count=0; while(count<bufLen-2) { int i=0; while( i<bufLen-2-count) { if (pBuf[count+i]>0xC0) { if (pBuf[count+i+1]<0x80 || pBuf[count+i+1]>0xC0) { filetype=ANSI; break; } else { /* The transformation table for UTF-8 is presented below: UNICODE UTF-8 00000000 - 0000007F 0xxxxxxx 00000080 - 000007FF 110xxxxx 10xxxxxx 00000800 - 0000FFFF 1110xxxx 10xxxxxx 10xxxxxx //0xE0 00010000 - 001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx //0xF0 00200000 - 03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx //0xF8 04000000 - 7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx //0xFC */ BYTE *tC;//target Char to 识别 int utfStrLen=0; //是否为正确的UTF8格式 tC=pBuf+count+i; if (tC[0]<=0xF0)//<4字节 if(tC[0]>0xE0) utfStrLen=3; else utfStrLen=2; else if (tC[0]>=0xFC) utfStrLen=6; else if(tC[0]>=0xF8) utfStrLen=5; else utfStrLen=4; int k=1; while(k<utfStrLen) { if (!(tC[k] & 0x80 && !(tC[k]& 0x40) ))//前二位为10 { filetype=ANSI; break; } k++; } if (k==utfStrLen) utf8Nums++; if (utf8Nums==10) filetype=UTF8; } } i++; }//while( i<bufLen-2-count) ++count; }//while(count<bufLen-2) }//else return filetype; } int main() { ENCODETYPE filetype=UNKNOW; BYTE * pBuf; FILE * pFile; int filesize; pFile = _tfopen( pstrPath, _T("rb") ); if (pFile) { //get the file size fseek(pFile,0,SEEK_END); filesize=ftell(pFile); filesize+=sizeof(TCHAR); pBuf=(BYTE*)malloc(filesize); memset(pBuf+filesize-sizeof(TCHAR),0,sizeof(TCHAR)); fseek(pFile,0,SEEK_SET); fread(pBuf,1,filesize-sizeof(TCHAR),pFile); filetype=TellEncodeType(pBuf,filesize); fclose (pFile); } }