判断utf8编码字符是否为日文或其它语言

最新推荐文章于 2021-08-27 16:10:29 发布

coder马冬冬

最新推荐文章于 2021-08-27 16:10:29 发布

阅读量6.4k

点赞数

分类专栏： C/C++ 文章标签：语言 byte string iostream character html

C/C++ 专栏收录该内容

13 篇文章 1 订阅

订阅专栏

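// First decode the character to its Unicode code point; the numeric range of the code point then tells which language (script) it belongs to. For the code-point table, see the author's other blog post.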

#include <stdio.h>
#include <string>
#include <fstream>
#include <iostream>
using namespace std;

/*
 * Returns true when c is a UTF-8 trail (continuation) byte, i.e. 10xxxxxx.
 */
static bool utf82uIsTrail(unsigned char c)
{
    return (c & 0xC0) == 0x80;
}

/*
 * Tries to parse an HTML 4.0 numeric character reference at the start of
 * str: decimal form, e.g. "&#197;", or hexadecimal form, e.g. "&#x6C34;".
 *
 * The terminating ';' must appear within the first 8 bytes, which limits a
 * reference to 5 decimal or 4 hexadecimal digits.
 *
 * On success stores the value in *chPtr and returns the number of bytes
 * consumed (including the ';'). Returns 0, leaving *chPtr untouched, when
 * str does not start with a well-formed reference.
 */
static int utf82uParseEntity(const char *str, int *chPtr)
{
    const int kWindow = 8;
    const unsigned char *s = reinterpret_cast<const unsigned char *>(str);

    /* Short-circuit keeps us from reading past the terminating NUL. */
    if (s[0] != '&' || s[1] != '#')
        return 0;

    const bool isHex = (s[2] == 'x' || s[2] == 'X');
    const int base = isHex ? 16 : 10;
    const int firstDigit = isHex ? 3 : 2;

    int value = 0;
    int i = firstDigit;
    for (; i < kWindow; ++i)
    {
        const int c = s[i];
        int digit;

        if (c >= '0' && c <= '9')
            digit = c - '0';
        else if (isHex && c >= 'A' && c <= 'F')
            digit = c - 'A' + 10;
        else if (isHex && c >= 'a' && c <= 'f')
            digit = c - 'a' + 10;
        else
            break;          /* also stops at the terminating NUL */

        value = (value * base) + digit;
    }

    /*
     * Reject: no digits at all ("&#;"), digits running to the end of the
     * window, or a run of digits not closed by ';'.
     */
    if (i == firstDigit || i == kWindow || s[i] != ';')
        return 0;

    *chPtr = value;
    return i + 1;
}

/*
 * Decodes one character from the start of a NUL-terminated UTF-8 string.
 *
 * str   - pointer to the next character to decode (not modified)
 * chPtr - receives the decoded Unicode code point
 *
 * Returns the number of bytes consumed, always at least 1, so the caller
 * can advance with `str += utf82u(str, &ch)`.
 *
 * Decoding rules:
 *   - An HTML numeric character reference ("&#197;" or "&#x6C34;") is
 *     decoded as a single character. Any other '&' represents itself.
 *   - Well-formed 2-, 3- and 4-byte UTF-8 sequences are decoded to their
 *     code point (4-byte results above U+10FFFF are not accepted).
 *   - Bytes 0x00-0x7F, naked trail bytes 0x80-0xBF, and lead bytes that are
 *     not followed by the required trail bytes represent themselves and
 *     consume one byte.
 */
int utf82u(char *str, int * chPtr)
{
    const int entityLength = utf82uParseEntity(str, chPtr);
    if (entityLength > 0)
        return entityLength;

    const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
    const int lead = s[0];

    /*
     * The trail-byte checks are chained with && so that a truncated
     * sequence stops at the terminating NUL instead of reading beyond it.
     */
    if (lead >= 0xC0 && lead < 0xE0)
    {
        /* Two-byte sequence: 110xxxxx 10xxxxxx */
        if (utf82uIsTrail(s[1]))
        {
            *chPtr = ((lead & 0x1F) << 6) | (s[1] & 0x3F);
            return 2;
        }
    }
    else if (lead >= 0xE0 && lead < 0xF0)
    {
        /* Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
        if (utf82uIsTrail(s[1]) && utf82uIsTrail(s[2]))
        {
            *chPtr = ((lead & 0x0F) << 12)
                    | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
            return 3;
        }
    }
    else if (lead >= 0xF0 && lead < 0xF8)
    {
        /* Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        if (utf82uIsTrail(s[1]) && utf82uIsTrail(s[2]) && utf82uIsTrail(s[3]))
        {
            const int codePoint = ((lead & 0x07) << 18)
                    | ((s[1] & 0x3F) << 12)
                    | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
            if (codePoint <= 0x10FFFF)
            {
                *chPtr = codePoint;
                return 4;
            }
        }
    }

    /* ASCII, NUL, naked trail byte, or malformed sequence: byte as-is. */
    *chPtr = lead;
    return 1;
}

// Prints a one-line usage hint naming the executable and the expected
// data-file argument to standard output, then terminates the process with
// exit status -1. This function does not return.
void Usage(std::string app)
{
    const std::string hint = "using " + app + " datafile";
    std::cout << hint << std::endl;
    exit(-1);
}

// Decodes the first character of `name` with utf82u() and reports whether
// its code point lies in one of the Japanese kana ranges:
//   U+3040..U+309F, U+30A0..U+30FF, or U+31F0..U+31FF.
//
// name    - UTF-8 text; only its first character is examined
// unicode - receives the decoded code point of that first character
//
// Returns true when the code point is inside one of the ranges above and
// false otherwise (any code point outside them, including the 0 produced
// for an empty string, yields false).
bool isJpan(const std::string& name, int& unicode)
{
    // utf82u() takes a non-const pointer, hence the cast.
    char* text = const_cast<char*>(name.c_str());
    utf82u(text, &unicode);

    const bool inFirstRange  = (0x3040 <= unicode) && (unicode <= 0x309F);
    const bool inSecondRange = (0x30A0 <= unicode) && (unicode <= 0x30FF);
    const bool inThirdRange  = (0x31F0 <= unicode) && (unicode <= 0x31FF);

    return inFirstRange || inSecondRange || inThirdRange;
}

// Reads the file named by argv[1] line by line and, for every line, prints
//   <line> TAB <code point of first character> TAB isjpan|nojpan
// to standard output, depending on whether isJpan() accepts the line.
//
// Returns 0 on success and 1 when the data file cannot be opened. When no
// data file is given, Usage() prints a hint and exits the process.
int main(int argc, char* argv[])
{
#if 0
    /* Disabled self-test: decodes a fixed two-character UTF-8 string. */
    {
        char str[] = "\xe8\x87\xba\xe7\x81\xa3";
        char *next = str;
        int uni;
        int len;

        while (*next)
        {
            len = utf82u(next, &uni);
            next += len;
            printf("%d --0x%x\n", len, uni);
        }
    }
#endif
    if (argc < 2)
    {
        // argv[0] may be null when argc is 0; never build a string from it.
        Usage((argc > 0 && argv[0] != 0) ? argv[0] : "<program>");
    }

    std::ifstream inFile(argv[1]);
    if (!inFile)
    {
        // Diagnostics go to stderr so stdout carries only the result table,
        // and a non-zero status lets callers detect the failure.
        std::cerr << "open file error! " << std::endl;
        return 1;
    }

    // Testing the result of getline() itself avoids processing a phantom
    // empty line after the last real line has been read.
    std::string name;
    while (std::getline(inFile, name))
    {
        int uni = 0;
        const char *verdict = isJpan(name, uni) ? "\tisjpan" : "\tnojpan";
        std::cout << name << '\t' << uni << verdict << '\n';
    }

    return 0;
}