C语言 检测一个文本文件的编码是否为utf-8

/*
    filename: isutf8.c
    Time:     2016-12-9 20:27
    Author:   Albert Wang
    email:    albertofwb@gmail.com
    Function: detect whether a text file is encoded as UTF-8
*/

#include <stdio.h>
#include <stdlib.h>  // exit()
#include <io.h>  // _access() detect a file's existence

// Integer truth values used together with the Bool type below.
#define True  1
#define False 0

// Bool is a plain char that holds True or False.
typedef char Bool;
// Uchar is shorthand for unsigned char (used when inspecting raw bytes).
typedef unsigned char Uchar;

/*
    Read the first FileSize bytes of a file into a caller-supplied buffer.

    FileName: path of the file; it is opened in binary mode.
    buf:      destination buffer, must have room for at least FileSize bytes.
    FileSize: number of bytes to read.

    Returns 0 when exactly FileSize bytes were read.
    Returns -1 when an argument is NULL, the file cannot be opened, or fewer
    bytes than requested could be read (the buffer content is then incomplete).
*/
int DumpFromFile(const char *FileName, char *buf, size_t FileSize)
{
    FILE   *fp;
    size_t  bytesRead = 0;

    if (FileName == NULL || (buf == NULL && FileSize > 0))
    {
        return -1;
    }

    if ((fp = fopen(FileName, "rb")) == NULL)
    {
        return -1;
    }

    // Nothing to transfer for a zero-length request; skip fread entirely.
    if (FileSize > 0)
    {
        bytesRead = fread(buf, 1, FileSize, fp);
    }
    fclose(fp);

    // A short read leaves part of buf unwritten, so report it as a failure.
    return (bytesRead == FileSize) ? 0 : -1;
}


/*
    Determine the size of a file in bytes by seeking to its end.

    FileName: path of the file; it is opened in binary mode.
    FileSize: output; receives the size on success and is left untouched
              on failure.

    Returns 0 on success.
    Returns -1 when an argument is NULL, the file cannot be opened, or the
    end position cannot be determined.
*/
int GetFileSize(const char *FileName, size_t *FileSize)
{
    FILE *fp;
    long  endPos = -1L;

    if (FileName == NULL || FileSize == NULL)
    {
        return -1;
    }

    if ((fp = fopen(FileName, "rb")) == NULL)
    {
        return -1;
    }

    if (fseek(fp, 0, SEEK_END) == 0)
    {
        endPos = ftell(fp);
    }
    fclose(fp);

    // ftell reports failure as -1L; storing that in a size_t would yield a
    // huge bogus size, so reject it here.
    if (endPos < 0)
    {
        return -1;
    }

    *FileSize = (size_t)endPos;
    return 0;
}

Bool IsUtf8(const char* FileName)
{
    FILE *fp = NULL;
    size_t FileSize = 0;
    char *fileBuf = NULL;


    GetFileSize(FileName, &FileSize);
    fileBuf = (char *)malloc(FileSize);
    DumpFromFile(FileName, fileBuf, FileSize);

    size_t i = 0;
    Bool ret = True;

    for ( ; ret && (i < FileSize); i++)
    {
        Uchar hexchar = fileBuf[i];
        // ignore ascii code
        if (!(hexchar & 0x80))
        {
            continue;
        }

        // calculate how many serial "1"
        int   BitOneCount = 0;
        Uchar num = hexchar;
        while (num & 0x80)
        {
            if (num & 0x80)
            {
                BitOneCount += 1;
            }
            num <<= 1;
        }

        BitOneCount -= 1;
        while (BitOneCount > 0)
        {
            i += 1;
            num = fileBuf[i];   // num suppose to be 10xx xxxx
            num >>= 6;            // num = 0000 0010
            if (2 != num)
            {
                ret = False;
                //printf("i = %d num = %d hexchar = 0x%x BitOneCount= %d\n", i, num, hexchar, BitOneCount);
                break;
            }
            BitOneCount -= 1;
        }

    //end for
    }


    free(fileBuf);
    return ret;
}

/*
    Program entry point.

    Expects exactly one command-line argument: the file to inspect.
    Prints "[<FileName>] True" or "[<FileName>] False" depending on the
    result of IsUtf8().

    Exit status: 1 when the argument count is wrong or the file does not
    exist, 0 otherwise.
*/
int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        printf("Usage: %s <FileName>\n", argv[0]);
        return 1;
    }

    const char *path = argv[1];

    // mode 0 asks _access only whether the file exists
    if (_access(path, 0) == -1)
    {
        printf("%s not exists!\n", path);
        return 1;
    }

    const char *verdict = IsUtf8(path) ? "True" : "False";
    printf("[%s] %s\n", path, verdict);

    return 0;
}

/*
    Reference: http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html
*/

运行结果

 

使用 winhex 以utf8 的编码查看样本文件:

文件

 

转载于:https://www.cnblogs.com/albertofwb/p/6151484.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值