对于Win32或MFC项目,winnt.h头文件根据是否定义了UNICODE宏,声明了一组与字符集无关的类型(TCHAR等)和__TEXT宏:
//
// Neutral ANSI/UNICODE types and macros
//
#ifdef UNICODE // r_winnt
// Branch taken when the project's Character Set property is "Use Unicode Character Set".
#ifndef _TCHAR_DEFINED
typedef WCHAR TCHAR, *PTCHAR; // here TCHAR is WCHAR, i.e. the wide character type wchar_t
typedef WCHAR TBYTE , *PTBYTE ;
#define _TCHAR_DEFINED
#endif /* !_TCHAR_DEFINED */
// The T-flavoured string/pointer aliases below all map to their wide (W) counterparts.
typedef LPWCH LPTCH, PTCH;
typedef LPCWCH LPCTCH, PCTCH;
typedef LPWSTR PTSTR, LPTSTR;
typedef LPCWSTR PCTSTR, LPCTSTR;
typedef LPUWSTR PUTSTR, LPUTSTR;
typedef LPCUWSTR PCUTSTR, LPCUTSTR;
typedef LPWSTR LP;
typedef PZZWSTR PZZTSTR;
typedef PCZZWSTR PCZZTSTR;
typedef PUZZWSTR PUZZTSTR;
typedef PCUZZWSTR PCUZZTSTR;
typedef PNZWCH PNZTCH;
typedef PCNZWCH PCNZTCH;
typedef PUNZWCH PUNZTCH;
typedef PCUNZWCH PCUNZTCH;
// Token-pastes an L prefix onto the argument, turning the literal into a wide literal.
#define __TEXT(quote) L##quote // r_winnt
#else /* UNICODE */ // r_winnt
// Branch taken when the project's Character Set property is "Use Multi-Byte Character Set", i.e. ANSI encoding.
#ifndef _TCHAR_DEFINED
typedef char TCHAR, *PTCHAR; // here TCHAR is the plain (ASCII) char type
typedef unsigned char TBYTE , *PTBYTE ;
#define _TCHAR_DEFINED
#endif /* !_TCHAR_DEFINED */
// The T-flavoured string/pointer aliases below all map to their narrow (char-based) counterparts.
typedef LPCH LPTCH, PTCH;
typedef LPCCH LPCTCH, PCTCH;
typedef LPSTR PTSTR, LPTSTR, PUTSTR, LPUTSTR;
typedef LPCSTR PCTSTR, LPCTSTR, PCUTSTR, LPCUTSTR;
typedef PZZSTR PZZTSTR, PUZZTSTR;
typedef PCZZSTR PCZZTSTR, PCUZZTSTR;
typedef PNZCH PNZTCH, PUNZTCH;
typedef PCNZCH PCNZTCH, PCUNZTCH;
// Leaves the argument untouched, so the literal stays a narrow literal.
#define __TEXT(quote) quote // r_winnt
#endif /* UNICODE */ // r_winnt
其中WCHAR就是wchar_t,其定义如下:
//
// UNICODE (Wide Character) types
//
// WCHAR is the built-in wchar_t unless _MAC is defined, in which case it falls back to unsigned short.
#ifndef _MAC
typedef wchar_t WCHAR; // wc, 16-bit UNICODE character
#else
// some Macintosh compilers don't define wchar_t in a convenient location, or define it as a char
typedef unsigned short WCHAR; // wc, 16-bit UNICODE character
#endif
在ATL/MFC项目中用到的宽字符(2个字节)串CStringW类、多字节(1个或2个字节)字符串CStringA类,以及随项目字符集变化的CString类的定义:
// CStringW: CStringT instantiated over wchar_t (wide-character string).
typedef ATL::CStringT< wchar_t, StrTraitMFC_DLL< wchar_t > > CStringW;
// CStringA: CStringT instantiated over char (narrow / multi-byte string).
typedef ATL::CStringT< char, StrTraitMFC_DLL< char > > CStringA;
// CString: CStringT instantiated over TCHAR, so its element type follows whatever TCHAR is in this build.
typedef ATL::CStringT< TCHAR, StrTraitMFC_DLL< TCHAR > > CString;
有个陈年项目使用了多字节字符集(ANSI),其字符处理函数都是面向char的。CString或CStringA转成char[]后,一个汉字(双字节字符)会被当做两个独立的char字节,VS2010调试器按ASCII码逐字节解释时找不到对应字符,从而显示出'?'。
CStringA、CStringW转char[]、wchar_t[]参看Microsoft Docs——How to: Convert Between Various String Types
当项目属性分别使用"多字节字符集"、"Unicode字符集"时,_T()宏将字符串字面值分别按"ANSI(GB2312)码"、"Unicode码"进行编码;L前缀始终将字符串字面值按Unicode码进行编码;不加任何宏或前缀的字符串"xxxxx"始终按ANSI(GB2312)码进行编码,其中ASCII字符占一个字节,汉字等双字节字符占两个字节;
在调试监视变量时,VS2010对char值按"ASCII码"解释(即光标移到变量上时显示的值),对wchar_t值按"Unicode码"解释。因此,汉字被转成char后(比如ATL/MFC的CStringA转char[])逐字节解释时,会因匹配不到ASCII码而在调试器中显示'?'。赋给char[]或TCHAR[](项目使用多字节字符集时)的汉字字面值按ANSI(GB2312)码存储,即一个汉字的两个字节被当做两个单字节char存储;调试器显示单个char时尝试按ASCII码解释,找不到对应字符就显示'?',而ASCII字符仍按ASCII码存储并能正常显示。赋给wchar_t[]或TCHAR[](项目使用Unicode字符集时)的汉字字面值按Unicode码存储,调试器也按Unicode码解释,自然能正确显示;
示例代码放在一个MFC项目中:
// The same literal spelled three ways; the byte dumps are the author's observations (GB2312 system, VS2010).
char str1[]="中600"; // always stored as ANSI (GB2312) bytes: D6 D0 36 30 30 00
TCHAR str2[]=_T("中600");// multi-byte project: stored as ANSI (GB2312) bytes D6 D0 36 30 30 00
// Unicode project: stored as Unicode, memory 2D 4E 36 00 30 00 30 00 00 00
wchar_t str3[]=L"中600"; // always stored as Unicode, memory 2D 4E 36 00 30 00 30 00 00 00
// Converting an ATL/MFC multi-byte CStringA to char[] leaves each double-byte character as two
// separate char elements, so inspecting one element at a time may show garbage (no ASCII match).
// Convert to a char* string from CStringA string
// and display the result.
// '中' is U+4E2D in Unicode and D6 D0 in ANSI (GB2312).
CStringA cstrA(_T("中600")); // multi-byte project: cstrA in memory is D6 D0 36 30 30 00
size_t lencstrA=cstrA.GetLength(); // 5 (bytes, not characters)
//cout << cstrA << " (CStringA)" << endl;
const size_t newsizea = (cstrA.GetLength() + 1);
// NOTE(review): allocated with new[]; no matching delete[] appears in the visible snippet.
char *nstringa = new char[newsizea];
strcpy_s(nstringa, newsizea, cstrA); // multi-byte project: nstringa in memory is D6 D0 36 30 30 00
// cstrA stores the literal "中" as the ANSI (GB2312) bytes D6 D0; once copied into char elements,
// VS2010 interprets each char on its own as an ASCII code when displaying it.
// The observed values below are negative, i.e. char is signed here: a byte with the high bit set
// (D6, D0) is read as a negative two's-complement value, which is outside the 0-127 ASCII range.
char a1=nstringa[0]; // -42, shown as '?' (no ASCII match). D6 = 1101 0110 is the two's complement of -42 (sign-magnitude 1010 1010, ones' complement 1101 0101)
char a2=nstringa[1]; // -48, shown as '?' (no ASCII match). D0 = 1101 0000 is the two's complement of -48 (sign-magnitude 1011 0000, ones' complement 1100 1111)
char c3=nstringa[2]; // 54 '6'
//cout << nstringa << " (char *)" << endl;
// Converting an ATL/MFC wide-character CStringW to char[] here does not drop the low or high byte
// of each character; the observed result is simply an empty string.
// Convert to a char* string from a wide character
// CStringW string. To be safe, we allocate two bytes for each
// character in the original string, including the terminating
// null.
CStringW cstrW(_T("中600")); // multi-byte project: cstrW in memory is 2D 4E 36 00 30 00 30 00 00 00; little-endian (low byte first)
size_t lenstrw=cstrW.GetLength(); // 4 (wide characters)
const size_t newsizew = (cstrW.GetLength() + 1)*2; // (character count + one null terminator '\0') * 2
// NOTE(review): allocated with new[]; no matching delete[] appears in the visible snippet.
char *nstringw = new char[newsizew];
size_t convertedCharsw = 0;
// Observed: nstringw in memory is just 00, i.e. the conversion yields an empty string rather than a truncated one.
// NOTE(review): wcstombs_s converts according to the current CRT locale and its return value is ignored here;
// presumably the default "C" locale cannot represent U+4E2D, so the call fails and leaves an empty string.
// Confirm by checking the return code or by calling setlocale beforehand.
wcstombs_s(&convertedCharsw, nstringw, newsizew, cstrW, _TRUNCATE );
//cout << nstringw << " (char *)" << endl;
// Converting an ATL/MFC CString (which is CStringA under the ANSI character set) to char[] leaves each
// double-byte character, if any, as two separate char elements, so a single element may show garbage.
//Convert to a char* from CString
CString cstr(_T("中600")); // multi-byte project: cstr in memory is D6 D0 36 30 30 00
size_t lenstr2=cstr.GetLength(); // 5
const size_t newsize = cstr.GetLength()+1;
//char *text = new char[newsize];
char text[100];
// NOTE(review): text is a char array, so this copy presumably only compiles in a multi-byte build,
// where CString is char-based; in a Unicode build cstr would be wide - confirm.
strncpy_s(text,cstr,cstr.GetLength()+1); // multi-byte project: text in memory is D6 D0 36 30 30 00; D6 D0 = 1101 0110, 1101 0000
// Multi-byte project: cstr (then a CStringA) stores the literal "中" as the ANSI (GB2312) bytes D6 D0;
// once copied into char elements, VS2010 interprets each char on its own as an ASCII code.
// Unicode project: cstr (then a CStringW) stores the literal as Unicode: 2D 4E 36 00 30 00 30 00 00 00
// The observed values below are negative, i.e. char is signed here: a byte with the high bit set
// (D6, D0) is read as a negative two's-complement value, which is outside the 0-127 ASCII range.
char b1=text[0]; // -42, shown as '?' (no ASCII match). D6 = 1101 0110 is the two's complement of -42 (sign-magnitude 1010 1010, ones' complement 1101 0101)
char b2=text[1]; // -48, shown as '?' (no ASCII match). D0 = 1101 0000 is the two's complement of -48 (sign-magnitude 1011 0000, ones' complement 1100 1111)
char b3=text[2]; // shows 54 '6', i.e. U+0036
char b4=text[3]; // shows 48 '0', i.e. U+0030
// Converting an ATL/MFC multi-byte CStringA to wchar_t[] here widens each byte of a double-byte
// character separately, as if it were two single-byte characters.
// Convert to a wchar_t* from CStringA
//CStringA cstrA(_T("中600")); // multi-byte project: cstrA in memory is D6 D0 36 30 30 00
//const size_t newsizea = (cstrA.GetLength() + 1);
size_t convertedCharsa = 0;
// NOTE(review): allocated with new[]; no matching delete[] appears in the visible snippet.
wchar_t *wcstring = new wchar_t[newsizea]; // multi-byte project: cstrA in memory is D6 D0 36 30 30 00; D6 D0 = 1101 0110, 1101 0000
// Observed (multi-byte project): wcstring in memory is D6 00 D0 00 36 00 30 00 30 00 00 00, little-endian.
// NOTE(review): mbstowcs_s decodes according to the current CRT locale; presumably the default "C" locale
// is why D6 D0 is not decoded as one GB2312 character (U+4E2D) - confirm by calling setlocale beforehand.
mbstowcs_s(&convertedCharsa, wcstring, newsizea, cstrA, _TRUNCATE);
// cstrA holds the literal "中" as the ANSI (GB2312) bytes D6 D0; after the conversion they sit in memory as
// D6 00 D0 00, and VS2010 displays each wchar_t by interpreting it as a Unicode code point.
wchar_t a5=wcstring[0]; // 214 > 127; VS2010 shows L'Ö' (U+00D6); bits 0000 0000 1101 0110 = 00 D6
wchar_t a6=wcstring[1]; // 208 > 127; VS2010 shows L'Ð' (U+00D0); bits 0000 0000 1101 0000 = 00 D0
wchar_t b7=wcstring[2]; // 54 < 127; VS2010 shows L'6'
wchar_t b8=wcstring[3]; // 48 < 127; VS2010 shows L'0'
//wcout << wcstring << _T(" (wchar_t *)") << endl;
// Converting an ATL/MFC wide-character CStringW to wchar_t[] is wide-to-wide, so the data is unchanged.
// Convert to a wide character wchar_t* string from
// a wide character CStringW string.
//CStringW cstrW(_T("中600")); // multi-byte project: cstrW in memory is 2D 4E 36 00 30 00 30 00 00 00; little-endian (low byte first)
//size_t lenstrw=cstrW.GetLength(); // 4
// newsizew is (length + 1) * 2, so this buffer has more wchar_t elements than the string needs.
// NOTE(review): allocated with new[]; no matching delete[] appears in the visible snippet.
wchar_t *n2stringw = new wchar_t[newsizew];
wcscpy_s( n2stringw, newsizew, cstrW ); // multi-byte project: n2stringw in memory is 2D 4E 36 00 30 00 30 00 00 00, little-endian
// cstrW stores the literal "中" as the Unicode bytes "2D 4E"; VS2010 displays each wchar_t as a Unicode code point.
wchar_t a7=n2stringw[0]; // 20013, shown as L'中' (U+4E2D)
wchar_t a8=n2stringw[1]; // 54, shown as L'6' (U+0036)
wchar_t a9=n2stringw[2]; // 48, shown as L'0' (U+0030)
//wcout << n2stringw << _T(" (wchar_t *)") << endl;