VC2005解决UNICODE字符集下CStdioFile的WriteString无法写入中文

最新推荐文章于 2023-04-25 11:46:34 发布

weixin_34112900

最新推荐文章于 2023-04-25 11:46:34 发布

阅读量167

点赞数

文章标签：操作系统 c/c++

在VC2005环境下，以下代码无法实现使用CStdioFile向文本文件中写入中文（用notepad.exe查看不到写入的中文）

// Usage that, per the surrounding article, fails under VC2005: the Chinese
// characters do not show up in the file.
CStdioFile file;

file.Open(…);

file.WriteString(_T("abc你好"));// only "abc" gets written

在VC2005中的解决办法：

使用setlocale语句设定区域

#include <locale>// header file

// Workaround: switch the LC_CTYPE locale to "chs" around the write, then
// put the previous locale back.
CStdioFile file;

file.Open(…);

// Duplicate the current LC_CTYPE locale name so it can be restored later.
char* old_locale = _strdup( setlocale(LC_CTYPE,NULL) );

setlocale( LC_CTYPE, "chs" );// set the locale

file.WriteString(_T("abc你好"));// written correctly

setlocale( LC_CTYPE, old_locale );// restore the previous locale setting

free( old_locale );// release the copy made by _strdup

简化处理可以仅使用语句setlocale( LC_CTYPE, "chs" )。

setlocale:

函数原型为：char *setlocale( int category, const char *locale );

头文件：<locale.h>

兼容性：ANSI C标准、Win 95、Win NT

对于简体中文可以使用如下设置：setlocale( LC_ALL, "chs" );

为什么一定要调用setlocale呢？

因为C/C++语言标准规定程序运行时的默认字符集环境为"C"，也就是ASCII字符集的一个子集。在这种环境下，mbstowcs会把待转换的多字节源字符串（下文称为cstr）看作ASCII编码的字符，而不认为它是chs编码的字符串，所以它会将每一个中文字符拆成2个ASCII编码分别进行转换。以两个汉字为例，得到的结果就是由4个wchar_t字符组成的串。那么如何才能让mbstowcs正常工作呢？在调用mbstowcs进行转换之前，必须明确地告诉mbstowcs目前cstr串中包含的是chs编码的字符串，这通过调用setlocale( LC_ALL, "chs" )来完成。需要注意的是，这个函数会改变整个应用程序的字符集编码方式，必须通过重新调用setlocale( LC_ALL, "C" )来还原。这样就可以保证mbstowcs在转换时将cstr中的串看作中文串，并且转换成2个wchar_t字符，而不是4个。

本地化设置需要具备三个条件：

a. 语言代码 (Language Code)

b. 国家代码 (Country Code)

c. 编码(Encoding)

本地名字可以用下面这些部分来构造：

语言代码_国家代码.编码，比如 zh_CN.UTF-8、en_US 等

locale的别名表见 /usr/lib/X11/locale/locale.alias(以Debian GNU/Linux为例)

setlocale语言字符串参考

另外还有一种方法就是重新写CStdioFile的派生类CStdioFileEx（网上有）。

//好像C++中没有类能够读写Unicode格式的文本文件，所以我写了下面这个类。用法很简单，大家尝试几下就明白了。

#pragma once

// CStdioFile extension that can read and write text files either as
// multibyte (char) text or as wchar_t text that starts with a byte-order
// mark. Which of the two applies is tracked in m_bIsUnicodeText and is
// decided by PreprocessFlags() when a file is opened.
class CStdioFileEx: public CStdioFile
{
public:
    // Creates an object with no file attached.
    CStdioFileEx();

    // Creates an object for the given path and open flags
    // (CFile flags, optionally combined with modeWriteUnicode).
    CStdioFileEx( LPCTSTR lpszFileName, UINT nOpenFlags );

    // Opens lpszFileName; nOpenFlags may include modeWriteUnicode.
    virtual BOOL Open( LPCTSTR lpszFileName, UINT nOpenFlags, CFileException* pError = NULL );

    // Line readers. ReadString picks the wide or the multibyte reader to
    // match the character type of CString in the current build.
    virtual BOOL ReadString(CString& rString);
    BOOL ReadWideString(CStringW& rString);
    BOOL ReadAnsiString(CStringA& rString);

    // String writers. WriteString picks the wide or the multibyte writer to
    // match the character type of LPCTSTR in the current build.
    virtual void WriteString(LPCTSTR lpsz);
    void WriteWideString(LPCWSTR lpsz);
    void WriteAnsiString(LPCSTR lpsz);

    // True when the file is being treated as Unicode (wchar_t) text.
    // Declared const because it only reads the flag.
    bool IsUnicodeFormat() const {return m_bIsUnicodeText;}

    // Number of characters in the file, computed from its byte length.
    unsigned long GetCharCount();

    // Additional flag to allow Unicode text format writing
    enum {modeWriteUnicode = 0x100000};

    // Checks whether the file at sFilePath starts with the Unicode BOM.
    static bool IsFileUnicode(const CString& sFilePath);

protected:
    // Updates m_bIsUnicodeText and adjusts nOpenFlags before a file is opened.
    UINT PreprocessFlags(const CString& sFilePath, UINT& nOpenFlags);

    // Whether the current file is read/written as Unicode text.
    bool m_bIsUnicodeText;
};

//.cpp文件

#include "stdafx.h"

#include "StdioFileEx.h"

//在UCS 编码中有一个叫做"ZERO WIDTH NO-BREAK SPACE"的字符，它的编码是FEFF。而FFFE在UCS中是不存在的字符，

//所以不应该出现在实际传输中。UCS规范建议我们在传输字节流前，先传输字符"ZERO WIDTH NO-BREAK SPACE"。这样

//如果接收者收到FEFF，就表明这个字节流是Big-Endian的；如果收到FFFE，就表明这个字节流是Little-Endian的。

//因此字符"ZERO WIDTH NO-BREAK SPACE"又被称作BOM。

//UTF-8不需要BOM来表明字节顺序，但可以用BOM来表明编码方式。字符"ZERO WIDTH NO-BREAK SPACE"的UTF-8编码是

//EF BB BF。所以如果接收者收到以EF BB BF开头的字节流，就知道这是UTF-8编码了。

//Windows就是使用BOM来标记文本文件的编码方式的。

//有些老的浏览器和文本编辑器不支持BOM。

#define UNICODE_BOM 0xFEFF//Unicode "byte order mark" which goes at start of file

// Default constructor: no file is attached, and the object starts out in
// multibyte (non-Unicode) mode.
CStdioFileEx::CStdioFileEx()
    : CStdioFile()
    , m_bIsUnicodeText(false)
{
}

// Constructs the object and opens lpszFileName with nOpenFlags.
//
// The previous version called the member function PreprocessFlags() inside
// the base-class initializer, i.e. before the CStdioFile base had been
// constructed, which is undefined behaviour. The base is now
// default-constructed first, and the file is opened through Open(), which
// runs PreprocessFlags() on a fully constructed base.
//
// Throws a CFileException (via AfxThrowFileException) when the open fails.
CStdioFileEx::CStdioFileEx(LPCTSTR lpszFileName,UINT nOpenFlags)
    : CStdioFile()
    , m_bIsUnicodeText(false)
{
    ASSERT(lpszFileName != NULL);

    CFileException openError;
    if (!Open(lpszFileName, nOpenFlags, &openError))
    {
        AfxThrowFileException(openError.m_cause, openError.m_lOsError, openError.m_strFileName);
    }
}

// Opens the file after letting PreprocessFlags() decide whether it is to be
// treated as Unicode text and adjust the open flags accordingly.
// Returns the result of CStdioFile::Open.
BOOL CStdioFileEx::Open(LPCTSTR lpszFileName, UINT nOpenFlags, CFileException* pError /*=NULL*/)
{
    // PreprocessFlags() writes the adjusted flags back into nOpenFlags and
    // also returns that same value.
    const UINT nEffectiveFlags = PreprocessFlags(lpszFileName, nOpenFlags);

    return CStdioFile::Open(lpszFileName, nEffectiveFlags, pError);
}

// Reads the next line into rString by forwarding to the reader whose
// character type matches CString in this build (wide when _UNICODE is
// defined, multibyte otherwise). Returns that reader's result.
BOOL CStdioFileEx::ReadString(CString& rString)
{
#ifndef _UNICODE
    return ReadAnsiString(rString);
#else
    return ReadWideString(rString);
#endif
}

// Reads the next line of the file into rString as wide characters.
//
// For a Unicode file the line is read with fgetws (the byte-order mark is
// skipped when the stream is at position 0). For a multibyte file the line
// is read through ReadAnsiString and converted to wide characters.
//
// The trailing line terminator ("\n" or "\r\n") is removed from rString.
//
// Returns TRUE when anything was read, including an empty line that
// consisted only of its terminator; FALSE when end-of-file was reached
// without reading any data. (Previously a blank line returned FALSE, which
// callers could not tell apart from end-of-file.)
//
// Throws a CFileException when the stream reports a read error.
BOOL CStdioFileEx::ReadWideString(CStringW& rString)
{
    _ASSERTE(m_pStream);

    rString = L""; // empty string without deallocating

    if (!m_bIsUnicodeText)
    {
        // Multibyte file: read the raw line and let CStringW convert it.
        CStringA ansiString;
        const BOOL bRetval = ReadAnsiString(ansiString);
        rString = ansiString;
        return bRetval;
    }

    // If at position 0, discard byte-order mark before reading
    if (GetPosition() == 0)
    {
        wchar_t bom;
        Read(&bom, sizeof(wchar_t));
    }

    // Read in chunks of nMaxSize characters, growing the string until a
    // newline or end-of-file is reached.
    const int nMaxSize = 128;
    LPWSTR lpsz = rString.GetBuffer(nMaxSize);
    LPWSTR lpszResult;
    int nLen = 0;

    for (;;)
    {
        lpszResult = fgetws(lpsz, nMaxSize+1, m_pStream);
        rString.ReleaseBuffer();

        // handle error/eof case
        if (lpszResult == NULL && !feof(m_pStream))
        {
            Afx_clearerr_s(m_pStream);
            AfxThrowFileException(CFileException::genericException, _doserrno,
                m_strFileName);
        }

        // if string is read completely or EOF
        if (lpszResult == NULL ||
            (nLen = (int)lstrlenW(lpsz)) < nMaxSize ||
            lpsz[nLen-1] == '\n')
            break;

        // Chunk was filled without hitting a newline: extend and continue
        // reading right after what has been collected so far.
        nLen = rString.GetLength();
        lpsz = rString.GetBuffer(nMaxSize + nLen) + nLen;
    }

    // Decide the return value before the terminator is stripped so that a
    // blank line still counts as "something was read".
    nLen = rString.GetLength();
    const BOOL bReadSomething = (nLen > 0);

    // Remove the line terminator: "\n", optionally preceded by "\r".
    if (nLen > 0 && rString[nLen-1] == L'\n')
    {
        --nLen;
        if (nLen > 0 && rString[nLen-1] == L'\r')
        {
            --nLen;
        }
        rString.GetBufferSetLength(nLen);
    }

    return bReadSomething;
}

// Reads the next line of the file into rString as multibyte characters.
//
// For a multibyte file the line is read with fgets. For a Unicode file the
// line is read through ReadWideString and converted to multibyte.
//
// The trailing line terminator ("\n" or "\r\n") is removed from rString.
//
// Returns TRUE when anything was read, including an empty line that
// consisted only of its terminator; FALSE when end-of-file was reached
// without reading any data. (Previously a blank "\r\n" line returned FALSE,
// and a lone "\n" terminator was left in the string.)
//
// Throws a CFileException when the stream reports a read error.
BOOL CStdioFileEx::ReadAnsiString(CStringA& rString)
{
    _ASSERTE(m_pStream);

    rString = ""; // empty string without deallocating

    if (m_bIsUnicodeText)
    {
        // Unicode file: read the wide line and let CStringA convert it.
        CStringW wideString;
        const BOOL bRetval = ReadWideString(wideString);
        rString = wideString;
        return bRetval;
    }

    // Read in chunks of nMaxSize characters, growing the string until a
    // newline or end-of-file is reached.
    const int nMaxSize = 128;
    LPSTR lpsz = rString.GetBuffer(nMaxSize);
    LPSTR lpszResult;
    int nLen = 0;

    for (;;)
    {
        lpszResult = fgets(lpsz, nMaxSize+1, m_pStream);
        rString.ReleaseBuffer();

        // handle error/eof case
        if (lpszResult == NULL && !feof(m_pStream))
        {
            Afx_clearerr_s(m_pStream);
            AfxThrowFileException(CFileException::genericException, _doserrno,
                m_strFileName);
        }

        // if string is read completely or EOF
        if (lpszResult == NULL ||
            (nLen = (int)lstrlenA(lpsz)) < nMaxSize ||
            lpsz[nLen-1] == '\n')
            break;

        // Chunk was filled without hitting a newline: extend and continue
        // reading right after what has been collected so far.
        nLen = rString.GetLength();
        lpsz = rString.GetBuffer(nMaxSize + nLen) + nLen;
    }

    // Decide the return value before the terminator is stripped so that a
    // blank line still counts as "something was read".
    nLen = rString.GetLength();
    const BOOL bReadSomething = (nLen > 0);

    // Remove the line terminator: "\n", optionally preceded by "\r".
    if (nLen > 0 && rString[nLen-1] == '\n')
    {
        --nLen;
        if (nLen > 0 && rString[nLen-1] == '\r')
        {
            --nLen;
        }
        rString.GetBufferSetLength(nLen);
    }

    return bReadSomething;
}

// Purpose: Override of the base class function. Forwards the string to the
// writer whose character type matches LPCTSTR in this build (wide when
// _UNICODE is defined, multibyte otherwise). The selected writer checks
// m_bIsUnicodeText to decide whether the text goes to the file as Unicode
// or as multibyte.
void CStdioFileEx::WriteString(LPCTSTR lpsz)
{
#ifndef _UNICODE
    WriteAnsiString(lpsz);
#else
    WriteWideString(lpsz);
#endif
}

// Writes a wide string to the file.
//
// When the file is in Unicode mode the string is written with fputws,
// preceded by the byte-order mark if the file position is 0. Otherwise the
// string is converted to multibyte and handed to WriteAnsiString.
//
// Throws an invalid-argument exception when lpsz is NULL and a
// CFileException (diskFull) when the write fails.
void CStdioFileEx::WriteWideString(LPCWSTR lpsz)
{
    ASSERT(lpsz != NULL);

    if (lpsz == NULL)
    {
        AfxThrowInvalidArgException();
    }

    if (!m_bIsUnicodeText)
    {
        // Multibyte file: convert and delegate. The CW2A temporary lives
        // until the call returns.
        WriteAnsiString(CW2A(lpsz));
        return;
    }

    ASSERT(m_pStream != NULL);

    // If writing Unicode and at the start of the file, need to write byte mark
    if (GetPosition() == 0)
    {
        wchar_t cBOM = (wchar_t)UNICODE_BOM;
        CFile::Write(&cBOM, sizeof(wchar_t));
    }

    // fputws reports failure with WEOF. _TEOF only equals WEOF in _UNICODE
    // builds, so comparing against it missed errors in multibyte builds.
    if (fputws(lpsz, m_pStream) == WEOF)
    {
        AfxThrowFileException(CFileException::diskFull, _doserrno, m_strFileName);
    }
}

// Writes a multibyte string to the file.
//
// When the file is in multibyte mode the string is written with fputs.
// Otherwise it is converted to wide characters and handed to
// WriteWideString.
//
// Throws an invalid-argument exception when lpsz is NULL and a
// CFileException (diskFull) when the write fails.
void CStdioFileEx::WriteAnsiString(LPCSTR lpsz)
{
    ASSERT(lpsz != NULL);

    if (lpsz == NULL)
    {
        AfxThrowInvalidArgException();
    }

    if (m_bIsUnicodeText)
    {
        // Unicode file: convert and delegate. The CA2W temporary lives
        // until the call returns.
        WriteWideString(CA2W(lpsz));
        return;
    }

    ASSERT(m_pStream != NULL);

    // fputs reports failure with EOF. _TEOF is WEOF in _UNICODE builds, so
    // comparing against it missed errors there.
    if (fputs(lpsz, m_pStream) == EOF)
    {
        AfxThrowFileException(CFileException::diskFull, _doserrno, m_strFileName);
    }
}

// Decides whether the file about to be opened is to be handled as Unicode
// text, records the decision in m_bIsUnicodeText, and rewrites nOpenFlags
// to binary mode when it is.
//
//   sFilePath  - path of the file; inspected only when opening an existing
//                file for text-mode reading.
//   nOpenFlags - in/out: requested flags; typeText is replaced by
//                typeBinary for Unicode files.
//
// Returns the (possibly modified) value of nOpenFlags.
UINT CStdioFileEx::PreprocessFlags(const CString& sFilePath, UINT& nOpenFlags)
{
    const bool bUnicodeWriteRequested = (nOpenFlags & CStdioFileEx::modeWriteUnicode) != 0;

    if (bUnicodeWriteRequested)
    {
        // If we have writeUnicode we must have write or writeRead as well
        ASSERT(nOpenFlags & CFile::modeWrite || nOpenFlags & CFile::modeReadWrite);

        m_bIsUnicodeText = true;
    }
    else
    {
        // Reading in text mode without creating or writing: sniff the
        // existing file for a byte-order mark. In every other case the file
        // is treated as multibyte.
        const bool bTextMode  = (nOpenFlags & CFile::typeText) != 0;
        const bool bCreating  = (nOpenFlags & CFile::modeCreate) != 0;
        const bool bWriteOnly = (nOpenFlags & CFile::modeWrite) != 0;

        m_bIsUnicodeText = bTextMode && !bCreating && !bWriteOnly && IsFileUnicode(sFilePath);
    }

    // Unicode text files must be read/written in typeBinary mode, because
    // the mode affects how fputws/fgetws work (see MSDN for details).
    if (m_bIsUnicodeText)
    {
        nOpenFlags = (nOpenFlags & ~(CFile::typeText)) | CFile::typeBinary;
    }

    return nOpenFlags;
}

// Purpose: Determines whether a file is Unicode by reading the first character and detecting

// whether it's the Unicode byte marker.

bool CStdioFileEx::IsFileUnicode(const CString& sFilePath)

{

CFile file;

wchar_t cFirstChar;

CFileException exFile;

bool bIsUnicode = false;

// Open file in binary mode and read first character

if (file.Open(sFilePath, CFile::typeBinary | CFile::modeRead, &exFile))

{

// If byte is Unicode byte-order marker, let's say it's Unicode

if (file.Read(&cFirstChar, sizeof(wchar_t)) > 0 && cFirstChar == (wchar_t)UNICODE_BOM)

{

bIsUnicode = true;

}

file.Close();

}

else

{

// Handle error here if you like

}

return bIsUnicode;

}

// Returns the number of characters in the file, derived from its length in
// bytes: the byte count itself for multibyte files, or the byte count minus
// the byte-order mark divided by sizeof(wchar_t) for Unicode files.
//
// Returns 0 when no stream is open, and also when a Unicode file is shorter
// than the byte-order mark (for example a freshly created, still empty
// file). Previously that case underflowed the unsigned subtraction and
// produced a huge count.
unsigned long CStdioFileEx::GetCharCount()
{
    if (!m_pStream)
    {
        return 0;
    }

    const unsigned long nByteCount = (unsigned long)GetLength();

    if (!m_bIsUnicodeText)
    {
        // One byte per char.
        return nByteCount / sizeof(char);
    }

    // If Unicode, remove byte order mark from count
    const unsigned long nBomSize = (unsigned long)sizeof(wchar_t);
    if (nByteCount < nBomSize)
    {
        return 0;
    }

    return (nByteCount - nBomSize) / (unsigned long)sizeof(wchar_t);
}