UTF8/ANSI/UNICODE文件读取

//判断文件类型
DWORD CHXScriptReal::CheckFileType( HANDLE hFile )
{
    HXScriptFileCheck sfc;
    DWORD dwcbSize;


    if( ReadFile( hFile, &sfc, sizeof( sfc ), &dwcbSize, NULL ) && ( dwcbSize > sizeof( sfc )))
    {
        if( IsBinFile( &sfc ))
            return HXSF_CODETYPE_BIN;
    }


    if( dwcbSize >= 3 && sfc.byBom[ 0 ] == 0xEF && sfc.byBom[ 1 ] == 0xBB && sfc.byBom[ 2 ] == 0xBF )
    {
        SetFilePointer( hFile, 3, NULL, FILE_BEGIN );
        return HXSF_CODETYPE_UTF8;
    }
    else if( dwcbSize >= 2 && sfc.byBom[ 0 ] == 0xFF && sfc.byBom[ 1 ] == 0xFE )
    {
        SetFilePointer( hFile, 2, NULL, FILE_BEGIN );
        return HXSF_CODETYPE_UNICODE;
    }
    else if( dwcbSize >= 2 && sfc.byBom[ 0 ] == 0xFE && sfc.byBom[ 1 ] == 0xFF )
    {
        SetFilePointer( hFile, 2, NULL, FILE_BEGIN );
        return HXSF_CODETYPE_UNICODE_BIGENDIAN;
    }
    else
    {
        SetFilePointer( hFile, 0, NULL, FILE_BEGIN );
        return HXSF_CODETYPE_ANSI;
    }
}
// Read the next character from m_hFile and return it as a single WCHAR.
//
// How the bytes are decoded depends on m_dwCodeType:
//   HXSF_CODETYPE_UNICODE           - two bytes are read and used as-is.
//   HXSF_CODETYPE_UNICODE_BIGENDIAN - two bytes are read and byte-swapped.
//   HXSF_CODETYPE_UTF8              - one lead byte is read, then as many
//                                     following bytes as the lead byte
//                                     announces; the sequence is converted
//                                     with MultiByteToWideChar( CP_UTF8 ).
//   HXSF_CODETYPE_ANSI              - one byte is read, plus a second one when
//                                     the first is a DBCS lead byte of the
//                                     system ANSI code page; the result is
//                                     converted with MultiByteToWideChar( CP_ACP ).
//
// Returns the decoded character, or 0 when nothing could be read, when the
// conversion fails, or when m_dwCodeType is none of the types above.
//
// Limitation: only the first UTF-16 code unit of a conversion result is
// returned, so a UTF-8 character that needs two WCHARs loses its second unit.
WCHAR CHXLexer::ReadNextCharFromFile()
{
    char  btChar = 0;
    WCHAR ch = 0;           // stays 0 for every failure path and unknown code type
    DWORD dwReaded = 0;

    assert( m_hFile != NULL && m_hFile != INVALID_HANDLE_VALUE );

    switch( m_dwCodeType )
    {
    case HXSF_CODETYPE_UNICODE:
        // A short read may have filled only part of ch, so discard it.
        if(( ! ReadFile( m_hFile, &ch, sizeof( WCHAR ), &dwReaded, NULL )) || ( dwReaded != sizeof( WCHAR )))
            ch = 0;
        break;

    case HXSF_CODETYPE_UNICODE_BIGENDIAN:
        if( ReadFile( m_hFile, &ch, sizeof( WCHAR ), &dwReaded, NULL ) && ( dwReaded == sizeof( WCHAR )))
        {
            // Swap the two bytes to turn the big-endian unit into native order.
            WCHAR chTmp = ch;
            ch = chTmp << 8;
            ch |= ( chTmp >> 8 );
        }
        else
            ch = 0;
        break;

    case HXSF_CODETYPE_UTF8:
        if( ReadFile( m_hFile, &btChar, 1, &dwReaded, NULL ) && ( dwReaded == 1 ))
        {
            char szch[ 8 ];
            // Large enough for one output unit per input byte; a malformed
            // sequence may convert to more than two WCHARs, and a too-small
            // buffer would make the conversion fail and return 0.
            WCHAR szwch[ 8 ];
            int n, i;

            // Sequence length announced by the lead byte; n = 0 marks a byte
            // that is not a valid lead byte and is then converted on its own.
            if(( btChar & 0x80 ) == 0x00 )
                n = 1;
            else if(( btChar & 0xE0 ) == 0xC0 )
                n = 2;
            else if(( btChar & 0xF0 ) == 0xE0 )
                n = 3;
            else if(( btChar & 0xF8 ) == 0xF0 )
                n = 4;
            else if(( btChar & 0xFC ) == 0xF8 )
                n = 5;
            else if(( btChar & 0xFE ) == 0xFC )
                n = 6;
            else
                n = 0;

            szch[ 0 ] = btChar;
            for( i = 1; i < n; ++ i )
            {
                // Stop early when the file ends in the middle of a sequence.
                if( ReadFile( m_hFile, &btChar, 1, &dwReaded, NULL ) && ( dwReaded == 1 ))
                    szch[ i ] = btChar;
                else
                    break;
            }
            szch[ i ] = 0;

            // i is the number of bytes actually collected (at least 1).
            if( ::MultiByteToWideChar( CP_UTF8, 0, szch, i, szwch,
                                       static_cast< int >( sizeof( szwch ) / sizeof( szwch[ 0 ] ))) != 0 )
                ch = szwch[ 0 ];
            else
                ch = 0;
        }
        else
            ch = 0;
        break;

    case HXSF_CODETYPE_ANSI:
        if( ReadFile( m_hFile, &btChar, 1, &dwReaded, NULL ) && ( dwReaded == 1 ))
        {
            char szch[ 4 ];
            WCHAR szwch[ 2 ];
            int n = 1;

            szch[ 0 ] = btChar;
            szch[ 1 ] = 0;

            // Only a real DBCS lead byte of the system ANSI code page is
            // followed by a trail byte. Testing the high bit alone would
            // swallow the next byte on single-byte code pages and make the
            // conversion below fail.
            if( ::IsDBCSLeadByte( static_cast< BYTE >( btChar )))
            {
                if( ReadFile( m_hFile, &btChar, 1, &dwReaded, NULL ) && ( dwReaded == 1 ))
                {
                    szch[ 1 ] = btChar;
                    szch[ 2 ] = 0;
                    n = 2;
                }
            }

            // Exactly one WCHAR is expected for one ANSI character.
            if( ::MultiByteToWideChar( CP_ACP, 0, szch, n, szwch, 2 ) == 1 )
                ch = szwch[ 0 ];
            else
                ch = 0;
        }
        else
            ch = 0;
        break;

    default:
        // Unknown code type: nothing is read, report failure.
        ch = 0;
        break;
    }
    return ch;
}
//判断一个缓冲区是否为UTF8编码
BOOL CHXScriptReal::IsTextUTF8( BYTE * pszBuffer, int ncb )
{
    int i = 0;
    while( i < ncb )
    {
        int step = 0;
        if(( pszBuffer[ i ] & 0x80) == 0x00 )
        {
            step = 1;
        }
        else if(( pszBuffer[ i ] & 0xe0 ) == 0xc0 )
        {
            if( i + 1 >= ncb )
                return FALSE;
            if(( pszBuffer[ i + 1 ] & 0xc0 ) != 0x80 )
                return FALSE;
            step = 2;
        }
        else if(( pszBuffer[ i ] & 0xf0 ) == 0xe0 )
        {
            if( i + 2 >= ncb )
                return FALSE;
            if(( pszBuffer[ i + 1 ] & 0xc0 ) != 0x80 )
                return FALSE;
            if(( pszBuffer[ i + 2 ] & 0xc0 ) != 0x80 )
                return FALSE;
            step = 3;
        }
        else
            return FALSE;
        i += step;
    }
    if( i == ncb )
        return TRUE;


    return FALSE;
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

示申○言舌

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值