判断 UTF-8 编码的字符是否为日文或其它语言

//首先转成 Unicode 编码，再根据编码值所在的范围判断属于哪种语言；编码表请参考另一篇博客

 

#include <stdio.h>

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;

/*
 * Returns true when b has the 10xxxxxx bit pattern of a UTF-8 trail byte.
 */
static bool utf82uIsTrail(unsigned char b)
{
    return (b & 0xC0) == 0x80;
}

/*
 * Tries to read an HTML numeric character reference starting at s, where
 * s[0] is already known to be '&'.
 *
 * Accepted forms:
 *     decimal      &#197;      (1 to 7 digits)
 *     hexadecimal  &#x6C34;    (1 to 6 digits, 'x' or 'X')
 *
 * On success the value is stored in *chPtr and the number of bytes
 * consumed (including the terminating ';') is returned.  On any failure
 * (no '#', no digits, missing ';', value above 0x10FFFF) 0 is returned and
 * *chPtr is left untouched.
 *
 * The scan stops at the first non-digit, so it never reads past the NUL
 * terminator of the string.
 */
static int utf82uParseEntity(const unsigned char *s, int *chPtr)
{
    if (s[1] != '#')
        return 0;

    const bool isHex = (s[2] == 'x' || s[2] == 'X');
    const int base = isHex ? 16 : 10;
    /* 6 hex / 7 decimal digits are enough for U+10FFFF and cannot overflow int. */
    const int maxDigits = isHex ? 6 : 7;

    int pos = isHex ? 3 : 2;
    int digits = 0;
    int value = 0;

    for (; digits < maxDigits; ++digits, ++pos)
    {
        const int c = s[pos];
        int d;

        if (c >= '0' && c <= '9')
            d = c - '0';
        else if (isHex && c >= 'a' && c <= 'f')
            d = c - 'a' + 10;
        else if (isHex && c >= 'A' && c <= 'F')
            d = c - 'A' + 10;
        else
            break;

        value = value * base + d;
    }

    /* s[pos] is the first byte after the digits; it must close the reference. */
    if (digits == 0 || s[pos] != ';' || value > 0x10FFFF)
        return 0;

    *chPtr = value;
    return pos + 1;
}

/*
 * Decodes the character at the start of a NUL-terminated UTF-8 string.
 *
 * str    points at the next character to decode.
 * chPtr  receives the decoded code point.
 *
 * Returns the number of bytes consumed (always >= 1), so a caller can
 * advance with `str += utf82u(str, &ch)`.
 *
 * Besides plain UTF-8, an HTML 4.0 numeric character reference in decimal
 * (&#197;) or hexadecimal (&#x6C34;) form is decoded as a single character.
 * An '&' that does not start a well-formed numeric reference represents
 * itself.
 *
 * Malformed input is handled leniently: a lead byte that is not followed
 * by the required trail bytes, a naked trail byte (0x80-0xBF) and the
 * bytes 0xF8-0xFF each represent themselves and consume one byte.  No
 * check is made for overlong encodings or surrogate code points.
 */
int utf82u(char *str, int * chPtr)
{
    const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
    const int lead = s[0];

    if (lead == '&')
    {
        const int consumed = utf82uParseEntity(s, chPtr);
        if (consumed > 0)
            return consumed;
        /* Not a numeric reference: fall through, '&' is ordinary ASCII. */
    }

    if (lead < 0xC0)
    {
        /*
         * ASCII 0x01-0x7F.  Also treats \0 and naked trail bytes
         * 0x80-0xBF as characters representing themselves.
         */
        *chPtr = lead;
        return 1;
    }

    /*
     * The && chains below short-circuit, so a trail position is only read
     * after the previous byte was verified to be a (non-NUL) trail byte.
     */
    if (lead < 0xE0)
    {
        /* Two-byte sequence: 110xxxxx 10xxxxxx */
        if (utf82uIsTrail(s[1]))
        {
            *chPtr = ((lead & 0x1F) << 6) | (s[1] & 0x3F);
            return 2;
        }
    }
    else if (lead < 0xF0)
    {
        /* Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
        if (utf82uIsTrail(s[1]) && utf82uIsTrail(s[2]))
        {
            *chPtr = ((lead & 0x0F) << 12)
                    | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
            return 3;
        }
    }
    else if (lead < 0xF8)
    {
        /* Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        if (utf82uIsTrail(s[1]) && utf82uIsTrail(s[2]) && utf82uIsTrail(s[3]))
        {
            *chPtr = ((lead & 0x07) << 18) | ((s[1] & 0x3F) << 12)
                    | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
            return 4;
        }
    }

    /* Lead byte without its trail bytes (or 0xF8-0xFF): represents itself. */
    *chPtr = lead;
    return 1;
}

 

/*
 * Prints a one-line hint showing how to invoke the program named by app
 * (the program name followed by the data file argument) on standard
 * output, then terminates the process with exit status -1.
 * This function does not return.
 */
void Usage(string app)
{
    const string hint = "using " + app + " datafile";
    cout << hint << endl;
    exit(-1);
}

 

bool isJpan(const string& name, int& unicode)
{
    utf82u(const_cast<char*>(name.c_str()), &unicode);
    if (unicode >= 0x3040 && unicode <= 0x309F)
        return true;
    else if (unicode >= 0x30A0 && unicode <= 0x30FF)
        return true;
    else if (unicode >= 0x31F0 && unicode <= 0x31FF)
        return true;
    else
        return false;
}

int main(int argc, char* argv[])
{
#if 0
    char str[] = {0xe8, 0x87, 0xba, 0xe7, 0x81, 0xa3, 0x00};
    char *next = str;
    int uni;
    int len;
    int i;
   
    for (i=0; *next; i++)
    {
        len = utf82u(next, &uni);
        next += len;
        printf("%d --0x%x/n", len, uni);

    }
#endif
    string app = argv[0];
    if (argc < 2)
    {
        Usage(app);
    }
   
    ifstream inFile(argv[1]);
    if (!inFile.good())
    {
        cout << "open file error! " << endl;
        return 0;
    }
   
    int uni;
    int len;
    while (inFile.good())
    {
        std::string name;
        getline(inFile, name);
//        len = utf82u(const_cast<char*>(name.c_str()), &uni);
        if (isJpan(name, uni))
            cout << name << '/t' << uni << "/tisjpan" << endl;
        else
            cout << name << '/t' << uni << "/tnojpan" << endl;
    }      
  
    return 0;
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值