读 Unicode 格式的文本文件参考资料

最新推荐文章于 2024-10-14 17:54:02 发布

好名字_storm

最新推荐文章于 2024-10-14 17:54:02 发布

阅读量73

点赞数

分类专栏： # 文件操作文章标签： c++ mfc 经验分享

本文链接：https://blog.csdn.net/m0_63625953/article/details/133898011

版权

文件操作专栏收录该内容

3 篇文章 0 订阅

订阅专栏

// Converts a NUL-terminated multibyte string (system ANSI code page, CP_ACP)
// to a wide-character (Unicode) string.
//
// Parameters:
//   lpcszStr: [in]  NUL-terminated multibyte string to convert.
//   lpwszStr: [out] Buffer that receives the converted string, including its
//                   terminating NUL.
//   dwSize:   [in]  Capacity of lpwszStr, counted in wide characters.
//
// Returns: true on success; false when an argument is null, the buffer is too
//          small, or the conversion itself fails.
//
// Example: MByteToWChar(szA, szW, sizeof(szW) / sizeof(szW[0]));
//
bool CLyjClass::MByteToWChar(LPCSTR lpcszStr, LPWSTR lpwszStr, DWORD dwSize)
{
    if (lpcszStr == NULL || lpwszStr == NULL)
    {
        return false;
    }

    // Sizing pass: number of wide characters required, terminator included.
    // MultiByteToWideChar reports failure as 0, which must not be mistaken
    // for "fits in any buffer".
    const int nRequired = MultiByteToWideChar(CP_ACP, 0, lpcszStr, -1, NULL, 0);
    if (nRequired <= 0 || dwSize < static_cast<DWORD>(nRequired))
    {
        return false;
    }

    // Conversion pass: success only if the API actually wrote the string.
    return MultiByteToWideChar(CP_ACP, 0, lpcszStr, -1, lpwszStr, nRequired) != 0;
}

///
// Converts a NUL-terminated wide-character (Unicode) string to a multibyte
// string in the system ANSI code page (CP_ACP).
//
// Parameters:
//   lpcwszStr: [in]  NUL-terminated wide-character string to convert.
//   lpszStr:   [out] Buffer that receives the converted string, including its
//                    terminating NUL.
//   dwSize:    [in]  Capacity of lpszStr, counted in bytes.
//
// Returns: true on success; false when an argument is null, the buffer is too
//          small, or the conversion itself fails.
//
// Example: WCharToMByte(szW, szA, sizeof(szA) / sizeof(szA[0]));
//
bool CLyjClass::WCharToMByte(LPCWSTR lpcwszStr, LPSTR lpszStr, DWORD dwSize)
{
    if (lpcwszStr == NULL || lpszStr == NULL)
    {
        return false;
    }

    // CP_ACP (not CP_OEMCP) so that this is the exact inverse of MByteToWChar;
    // on systems whose OEM and ANSI code pages differ, mixing the two garbles
    // any text that makes a round trip.
    // Sizing pass: bytes required, terminator included; 0 signals failure.
    const int nRequired = WideCharToMultiByte(CP_ACP, 0, lpcwszStr, -1, NULL, 0, NULL, NULL);
    if (nRequired <= 0 || dwSize < static_cast<DWORD>(nRequired))
    {
        return false;
    }

    // Conversion pass: success only if the API actually wrote the string.
    return WideCharToMultiByte(CP_ACP, 0, lpcwszStr, -1, lpszStr, nRequired, NULL, NULL) != 0;
}

///
// unicode 编码转成CString
// 返回值：转换成功 TRUE，否则FALSE
//
bool CLyjClass::UnicodeToCString(unsigned short unicode, CString &str)
{
TCHAR tch[1];
char ch[3];
ZeroMemory(tch, _tcslen(tch)* sizeof(TCHAR));
ZeroMemory(ch, sizeof(ch));

tch[0] = unicode;
if(WCharToMByte(tch, ch, sizeof(ch)) )
{
str = ch;
return TRUE;
}
else
return FALSE;
}


int CPrj_ReadUnicodeDlg::ReadTxtFile(const CString &strFileName)
{
CString strUnicode;
CStringArray strUnicodeArray;
CLyjClass myClass;
DWORD dwFileLength;
TCHAR *pch= NULL;

// 读入文本
CFile file;
if (!file.Open(strFileName, CFile::modeRead))
{
return FALSE;
}

dwFileLength = file.GetLength();
pch = new TCHAR[dwFileLength];
ZeroMemory(pch, dwFileLength);
file.Read(pch, dwFileLength);
file.Close();

// 是unicode文本吗?
if (pch[0] != 0xFEFF)
{
MsgBox(_T("读入的文本不是Unicode编码, 本系统暂不提供支持"));
return FALSE;
}

// 读出第一行
GetAllLineTxt(strUnicodeArray, pch, dwFileLength/2);
GetAllStrUnicode(strUnicodeArray);
CWordArray unicodeArray;
ConvertAllUnicode(unicodeArray, strUnicodeArray);

// 控件显示每一行
m_ctlList.ResetContent();
for (int j = 0; j < strUnicodeArray.GetSize(); j++)
{
m_ctlList.InsertString(j, strUnicodeArray.GetAt(j));
}

if (pch != NULL)
delete [] pch;

return TRUE;
}

void CPrj_ReadUnicodeDlg::MsgBox(const CString &strMsg)
{
MessageBox(strMsg, _T("提示信息"), MB_OK | MB_ICONINFORMATION);
}

void CPrj_ReadUnicodeDlg::OnBtnConvert()
{
// TODO: Add your control notification handler code here
UpdateData(TRUE);
if (m_strFileName.IsEmpty())
{
MsgBox(_T("文件路径不存在!") + m_strFileName);
return;
}
if ( !ReadTxtFile(m_strFileName))
{
MsgBox( _T("文件打开有误! \n") + m_strFileName );
}
}

//////////////////////////////////////////////////////////////////////
// Splits a character buffer into lines and appends every non-blank line to
// strUnicodeArray. Each line is trimmed on both sides and lower-cased first.
//
// Parameters:
//   strUnicodeArray: [out] Receives the lines (appended to existing content).
//   pch:             [in]  Buffer to split; element 0 is skipped because the
//                          caller stores the byte-order mark (0xFEFF) there.
//   iSize:           [in]  Number of characters in pch.
//
// Returns: always true.
//
bool CPrj_ReadUnicodeDlg::GetAllLineTxt(CStringArray &strUnicodeArray, const TCHAR *pch, int iSize)
{
    int idx = 1; // start behind the byte-order mark

    while (idx < iSize)
    {
        // Gather characters up to the next CR (0x000d) or LF (0x000a).
        CString strCurrent;
        for (; idx < iSize; ++idx)
        {
            const TCHAR ch = pch[idx];
            if (ch == 0x000d || ch == 0x000a)
            {
                break;
            }
            strCurrent += ch;
        }

        strCurrent.TrimLeft();
        strCurrent.TrimRight();
        strCurrent.MakeLower();

        // Blank lines (including the gap between CR and LF) are dropped.
        if (!strCurrent.IsEmpty())
        {
            strUnicodeArray.Add(strCurrent);
        }

        ++idx; // step over the line-break character that ended this line
    }

    return true;
}

//////////////////////////////////////////////////////////////////////
// Reduces every entry of strUnicodeArray to its leading token, i.e. the text
// in front of the first space. Entries without a space are kept unchanged.
//
// Parameters:
//   strUnicodeArray: [in/out] Entries are overwritten in place.
//
// Returns: always true.
//
bool CPrj_ReadUnicodeDlg::GetAllStrUnicode(CStringArray &strUnicodeArray)
{
    for (int i = 0; i < strUnicodeArray.GetSize(); i++)
    {
        const CString strEntry = strUnicodeArray.GetAt(i);
        const int pos = strEntry.Find(_T(' '));
        const CString strToken = (pos != -1) ? strEntry.Left(pos) : strEntry;

        strUnicodeArray.SetAt(i, strToken); // overwrite in place

        // A CString object must not be passed through a variable argument
        // list as-is; the explicit LPCTSTR cast hands TRACE the character
        // pointer that %s expects.
        TRACE(_T("内容: %s"), static_cast<LPCTSTR>(strToken));
    }
    return true;
}

//////////////////////////////////////////////////////////////////////
// Converts a CStringArray into a CWordArray: every string is handed to
// CLyjClass::StringToUnicode and, when that call reports success, the
// resulting WORD is appended to unicodeArray. Strings that fail are skipped.
//
// Parameters:
//   unicodeArray:    [out] Receives the converted values (appended).
//   strUnicodeArray: [in]  Strings to convert.
//
// Returns: always true.
//
bool CPrj_ReadUnicodeDlg::ConvertAllUnicode(CWordArray &unicodeArray, CStringArray &strUnicodeArray)
{
    CLyjClass converter;
    CString strEntry;
    WORD wCode;

    int idx = 0;
    while (idx < strUnicodeArray.GetSize())
    {
        strEntry = strUnicodeArray.GetAt(idx);

        const bool bConverted = converter.StringToUnicode(strEntry, wCode) ? true : false;
        if (bConverted)
        {
            unicodeArray.Add(wCode);
        }

        ++idx;
    }

    return true;
}

// Illustrative fragment: reads a text file line by line with CStdioFile.
// It opens m_File_Path for reading and leaves the enclosing function when the
// file cannot be opened; the loop body runs once for every line that
// ReadString delivers into strLine.
CStdioFile file;
    if (!file.Open(m_File_Path, CFile::modeRead)) return;
    CString strLine;
    while (file.ReadString(strLine))    
    { 
       // process strLine here
    }
问题：
    CStdioFile在_MBCS环境下读取任何ANSI文本数据都没问题，在UNICODE环境下读取ANSI文本中的中文时就会显示乱码。
原因：
CStdioFile读取ANSI文本数据时按char类型读取，在_MBCS下可以直接填充到CString，在UNICODE环境下要先将char转换成宽字符WCHAR，然后再填充到CString，即一个汉字的两个char将变成两个UNICODE字符WCHAR。
解决办法：
    在UNICODE环境下file.ReadString(strLine)取得的数据实际上是char类型，但是存储在UNICODE字符串中。为了取得真实数据，必须对strLine进行处理。
// Repairs a line that CStdioFile::ReadString delivered in a UNICODE build:
// according to the accompanying explanation every element of str then holds
// one raw ANSI byte. The bytes are collected into a char buffer and converted
// back into real Unicode text with CharToUnicode, which overwrites str.
//
// Parameters:
//   str: [in/out] The line to repair.
//
void function(CString &str)
{
    const int nLen = str.GetLength();

    // +1 for the terminator: CharToUnicode converts up to the first NUL, so
    // the buffer has to be NUL-terminated explicitly.
    char *szBuf = new char[nLen + 1];
    for (int i = 0; i < nLen; i++)
    {
        // Narrowing is intentional (the compiler reports C4244 here); only
        // the low byte of each element is wanted.
        szBuf[i] = str.GetAt(i);
    }
    szBuf[nLen] = '\0';

    CharToUnicode(szBuf, &str);
    delete [] szBuf;
}
    注：此函数在编译的时候会提示
                warning C4244: '=' : conversion from 'unsigned short' to 'char', possible loss of data
            不用管它，丢失的数据是我们不需要的。
===================================================================================
//////////////////////////////////////////////////////////////////////

// Converts a NUL-terminated multibyte string (system ANSI code page, CP_ACP)
// to Unicode and stores the result in *pstrOut.
//
// Parameters:
//   pchIn:   [in]  NUL-terminated multibyte string to convert.
//   pstrOut: [out] Receives the converted text; left untouched on failure.
//
// Returns: the number of wide characters produced, terminating NUL included,
//          or 0 when an argument is null or the conversion fails.
//
int CharToUnicode(char *pchIn, CString *pstrOut)
{
    if (pchIn == NULL || pstrOut == NULL)
    {
        return 0;
    }

    // Sizing pass: wide characters required, terminator included. A result
    // of 0 means failure and must not lead to a zero-length allocation.
    const int nLen = MultiByteToWideChar(CP_ACP, 0, pchIn, -1, NULL, 0);
    if (nLen <= 0)
    {
        return 0;
    }

    WCHAR *ptch = new WCHAR[nLen];
    const int nConverted = MultiByteToWideChar(CP_ACP, 0, pchIn, -1, ptch, nLen);
    if (nConverted > 0)
    {
        // Plain assignment copies the text directly; no format string has to
        // be parsed and the wide string is never misread as a narrow "%s".
        *pstrOut = ptch;
    }
    delete [] ptch;

    return nConverted;
}

//////////////////////////////////////////////////////////////////////

// Converts a Unicode CString to a NUL-terminated multibyte string in the
// system ANSI code page (CP_ACP). Intended for UNICODE builds, where CString
// stores wide characters.
//
// Parameters:
//   strIn:    [in]  Text to convert; it is not modified.
//   pchOut:   [out] Buffer that receives the converted string.
//   nCharLen: [in]  Capacity of pchOut in bytes.
//
// Returns: the number of bytes written, terminating NUL included. Returns 0
//          when pchOut is null, nCharLen is not positive, the conversion
//          fails or the result does not fit; in the latter two cases pchOut
//          is set to an empty string.
//
int UnicodeToChar(CString &strIn, char *pchOut, int nCharLen)
{
    if (pchOut == NULL || nCharLen <= 0)
    {
        return 0;
    }

    // Read the characters through the const conversion operator. GetBuffer()
    // would enlarge the string and demands a matching ReleaseBuffer().
    LPCWSTR pwszIn = strIn;

    // Sizing pass: bytes required, terminator included; 0 signals failure.
    const int nRequired = WideCharToMultiByte(CP_ACP, 0, pwszIn, -1, NULL, 0, NULL, NULL);
    if (nRequired <= 0 || nRequired > nCharLen)
    {
        // Handing the API a buffer that is too small makes it fail and leaves
        // the output unterminated, so report the problem cleanly instead.
        pchOut[0] = 0;
        return 0;
    }

    const int nWritten = WideCharToMultiByte(CP_ACP, 0, pwszIn, -1, pchOut, nRequired, NULL, NULL);
    if (nWritten <= 0)
    {
        pchOut[0] = 0;
        return 0;
    }
    return nWritten;
}