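// First decode the text to a Unicode code point; the range the code point falls in tells which language (script) it belongs to. For the code-range table, see the author's other blog post.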
#include <stdio.h>
#include <string>
#include <fstream>
#include <iostream>
using namespace std;
/*
 * Helper for utf82u: value of a hexadecimal digit character,
 * or -1 if c is not one of 0-9, a-f, A-F.
 */
static int utf82uDigitValue(int c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    return -1;
}

/*
 * Helper for utf82u: parse an HTML numeric character reference.
 *
 * s must point at "&#" inside a NUL-terminated string. Accepts the decimal
 * form "&#NNN;" (1-7 digits) and the hexadecimal form "&#xHHH;" / "&#XHHH;"
 * (1-6 digits), which is enough to express every code point up to U+10FFFF.
 *
 * On success stores the value in *chPtr and returns the number of bytes
 * consumed (including the ';'). Returns 0, leaving *chPtr untouched, when
 * the reference is malformed: no digits, missing ';', or value > 0x10FFFF.
 */
static int utf82uParseEntity(const unsigned char *s, int *chPtr)
{
    const bool isHex = (s[2] == 'x' || s[2] == 'X');
    const int base = isHex ? 16 : 10;
    const int firstDigit = isHex ? 3 : 2;
    const int maxDigits = isHex ? 6 : 7;
    int value = 0;
    int i = firstDigit;

    /*
     * Every byte consumed here is a digit (hence non-NUL), so s[i] never
     * reads past the string's terminating NUL.
     */
    for (; i < firstDigit + maxDigits; ++i)
    {
        const int digit = utf82uDigitValue(s[i]);
        if (digit < 0 || digit >= base)
            break;
        value = value * base + digit;
    }

    if (i == firstDigit || s[i] != ';' || value > 0x10FFFF)
        return 0;

    *chPtr = value;
    return i + 1;
}

/*
 * Decode the character at the start of a NUL-terminated byte string.
 *
 * Recognised forms:
 *   - HTML 4.0 numeric character references, decimal (e.g. "&#197;")
 *     or hexadecimal (e.g. "&#x6C34;");
 *   - UTF-8 sequences of one to four bytes.
 *
 * str    points at the first byte of the next character. The string must
 *        be NUL-terminated because following bytes are inspected.
 * chPtr  receives the decoded code point.
 *
 * Returns the number of bytes consumed, always at least 1, so a caller can
 * advance through a string with `str += utf82u(str, &ch)`.
 *
 * Decoding is deliberately lenient: NUL, naked trail bytes (0x80-0xBF),
 * lead bytes that are not followed by the required trail bytes, and a '&'
 * that does not start a well-formed numeric reference all "represent
 * themselves" (the byte value is stored and 1 is returned). Overlong
 * encodings are not rejected.
 */
int utf82u(char *str, int * chPtr)
{
    const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
    const int lead = s[0];

    if (lead == '&' && s[1] == '#')
    {
        const int consumed = utf82uParseEntity(s, chPtr);
        if (consumed > 0)
            return consumed;
        /* Malformed reference: fall through and treat '&' as a literal. */
    }

    if (lead < 0xC0)
    {
        /* ASCII (including '&' and NUL) or a naked trail byte. */
        *chPtr = lead;
        return 1;
    }

    /*
     * The && chains below short-circuit on the first non-trail byte, so a
     * truncated sequence never reads beyond the terminating NUL.
     */
    if (lead < 0xE0)
    {
        if ((s[1] & 0xC0) == 0x80)
        {
            *chPtr = ((lead & 0x1F) << 6) | (s[1] & 0x3F);
            return 2;
        }
    }
    else if (lead < 0xF0)
    {
        if ((s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80)
        {
            *chPtr = ((lead & 0x0F) << 12)
                | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
            return 3;
        }
    }
    else if (lead < 0xF8)
    {
        if ((s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80
            && (s[3] & 0xC0) == 0x80)
        {
            const int codePoint = ((lead & 0x07) << 18)
                | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6)
                | (s[3] & 0x3F);
            /* Lead bytes 0xF5-0xF7 would encode values beyond Unicode. */
            if (codePoint <= 0x10FFFF)
            {
                *chPtr = codePoint;
                return 4;
            }
        }
    }

    /* Lead byte without valid trail bytes, or 0xF8-0xFF: represents itself. */
    *chPtr = lead;
    return 1;
}
/*
 * Print a one-line usage hint to standard output and terminate the process.
 *
 * app  the program name to show in the hint.
 *
 * Never returns: the process ends via exit(-1).
 */
void Usage(string app)
{
    const string hint = "using " + app + " datafile";
    cout << hint << endl;
    exit(-1);
}
/*
 * Decode the first character of name and report whether it is a kana.
 *
 * name     text to inspect; only its first character is decoded.
 * unicode  out-parameter that receives the decoded code point of that
 *          first character (written regardless of the result).
 *
 * Returns true when the code point lies in one of these Unicode blocks:
 *   0x3040-0x309F  Hiragana
 *   0x30A0-0x30FF  Katakana
 *   0x31F0-0x31FF  Katakana Phonetic Extensions
 */
bool isJpan(const string& name, int& unicode)
{
    /* utf82u takes a non-const pointer, hence the const_cast. */
    utf82u(const_cast<char*>(name.c_str()), &unicode);

    const bool hiragana = (0x3040 <= unicode) && (unicode <= 0x309F);
    const bool katakana = (0x30A0 <= unicode) && (unicode <= 0x30FF);
    const bool katakanaExt = (0x31F0 <= unicode) && (unicode <= 0x31FF);

    return hiragana || katakana || katakanaExt;
}
/*
 * Entry point.
 *
 * Reads the file named by argv[1] line by line. For every line it prints
 * the line itself, the code point of its first character, and the verdict
 * "isjpan" or "nojpan", separated by tab characters.
 *
 * Returns 0 on success and 1 when the input file cannot be opened. When no
 * file argument is given, Usage() prints a hint and exits the process.
 */
int main(int argc, char* argv[])
{
#if 0
    /* Disabled self-test: decode a short UTF-8 string character by character. */
    {
        char str[] = "\xe8\x87\xba\xe7\x81\xa3";
        char *next = str;
        while (*next)
        {
            int uni;
            const int len = utf82u(next, &uni);
            next += len;
            printf("%d --0x%x\n", len, uni);
        }
    }
#endif
    if (argc < 2)
    {
        /* argv[0] may be absent when argc is 0, so guard before using it. */
        const string app = (argc > 0 && argv[0]) ? argv[0] : "";
        Usage(app);
    }

    ifstream inFile(argv[1]);
    if (!inFile)
    {
        cerr << "open file error! " << endl;
        return 1;
    }

    /*
     * Test the result of getline itself: checking good() before reading
     * would process one phantom empty line after the last real one.
     */
    string name;
    while (getline(inFile, name))
    {
        int uni = 0;
        const bool japanese = isJpan(name, uni);
        cout << name << '\t' << uni << '\t'
             << (japanese ? "isjpan" : "nojpan") << '\n';
    }
    return 0;
}