{
assert(str != NULL);
int len = strlen (str);
int counter = 0;
char head = 0x80;
char firstChar, secondChar;
int i = 0;
for (i = 0; i < len - 1; ++i)
{
firstChar = str[i];
if (!(firstChar & head))
continue;
secondChar = str[i];
if (firstChar >= 0xA1 && firstChar <= 0XF7 && secondChar>=0xA1 && secondChar <= 0XFE)
{
counter+= 2;
++i;
}
}
return counter;
}
/*************************************************************/
// 1 byte:  0xxxxxxx
// 2 bytes: 110xxxxx 10xxxxxx
// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// 5 bytes: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// 6 bytes: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
/*************************************************************/
/*
 * Decide whether a byte buffer looks like UTF-8 text that contains at least
 * one multi-byte sequence.
 *
 * The check is structural only: every lead byte must announce a sequence of
 * 2..6 bytes (the original 1-6 byte UTF-8 layout) and be followed by exactly
 * that many 10xxxxxx continuation bytes.  Overlong forms and code point
 * ranges are not examined.
 *
 * str    - bytes to inspect (need not be NUL-terminated)
 * length - number of bytes available at str
 *
 * Returns 1 when the buffer is structurally valid and holds at least one
 * non-ASCII byte.  Returns 0 for malformed input, for a sequence cut off at
 * the end of the buffer, for pure 7-bit ASCII, and for NULL or empty input.
 */
int str_is_utf8(const char *str,int length)
{
    int i;
    int pending = 0;     /* continuation bytes still owed by the current lead */
    int all_ascii = 1;   /* cleared as soon as a byte with the high bit is seen */

    if (str == NULL)
        return 0;

    for (i = 0; i < length; i++) {
        unsigned char chr = (unsigned char)str[i];

        if ((chr & 0x80) != 0)
            all_ascii = 0;

        if (pending > 0) {
            /* Inside a sequence: only 10xxxxxx is acceptable. */
            if ((chr & 0xC0) != 0x80)
                return 0;
            pending--;
            continue;
        }

        if (chr < 0x80)
            continue;           /* plain ASCII byte */

        /* Start of a multi-byte sequence: derive its length from the lead. */
        if (chr >= 0xFE)
            return 0;           /* 0xFE and 0xFF never occur in UTF-8 */
        else if (chr >= 0xFC)
            pending = 5;        /* 1111110x */
        else if (chr >= 0xF8)
            pending = 4;        /* 111110xx */
        else if (chr >= 0xF0)
            pending = 3;        /* 11110xxx */
        else if (chr >= 0xE0)
            pending = 2;        /* 1110xxxx */
        else if (chr >= 0xC0)
            pending = 1;        /* 110xxxxx */
        else
            return 0;           /* stray continuation byte used as a lead */
    }

    /* The buffer ended in the middle of a sequence. */
    if (pending > 0)
        return 0;

    /* Pure ASCII is deliberately not reported as UTF-8. */
    return all_ascii ? 0 : 1;
}
/* Alias for unsigned char.
 * NOTE(review): this spelling differs from <stdint.h>'s uint8_t, and some
 * system headers already declare u_int8_t - confirm the typedef is needed. */
typedef unsigned char u_int8_t;
/*
 * Measure the multi-byte UTF-8 sequence that starts at s[0].
 *
 * s  - pointer to the candidate lead byte
 * ns - number of bytes readable at s
 *
 * Returns the sequence length in bytes (2..6, following the original 1-6
 * byte UTF-8 layout) when the lead byte announces a multi-byte sequence, the
 * whole sequence fits inside ns, and every trailing byte has the form
 * 10xxxxxx.  Returns 0 in every other case: ASCII lead byte, stray
 * continuation byte, 0xFE/0xFF, truncated or malformed sequence, NULL
 * pointer, or ns == 0.
 */
int isutf82(char *s, size_t ns)
{
    const unsigned char *p = (const unsigned char *)s;
    size_t len;
    size_t k;

    if (p == NULL || ns == 0)
        return 0;

    /* Match the exact lead-byte prefix, including the terminating 0 bit. */
    if ((p[0] & 0xE0) == 0xC0)
        len = 2;
    else if ((p[0] & 0xF0) == 0xE0)
        len = 3;
    else if ((p[0] & 0xF8) == 0xF0)
        len = 4;
    else if ((p[0] & 0xFC) == 0xF8)
        len = 5;
    else if ((p[0] & 0xFE) == 0xFC)
        len = 6;
    else
        return 0;

    /* A sequence that does not fit is invalid; never report a shorter one. */
    if (len > ns)
        return 0;

    for (k = 1; k < len; k++) {
        if ((p[k] & 0xC0) != 0x80)
            return 0;
    }
    return (int)len;
}
/*
 * Report whether the NUL-terminated string buf reads as UTF-8 text.
 *
 * Returns 1 only when at least one complete multi-byte sequence was seen
 * and nothing objectionable preceded the end of the string; returns 0 for
 * pure 7-bit input, for ASCII control characters that are not marked as
 * text in the table below, and for malformed lead or continuation bytes.
 * If the string ends in the middle of a sequence, the answer reflects only
 * the sequences completed before that point.
 */
int isUtf81(const char *buf)
{
#define F 0 /* character never appears in text */
#define T 1 /* character appears in plain ASCII text */
#define I 2 /* character appears in ISO-8859 text */
#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
    static const unsigned char text_chars[256] = {
        /*                  BEL BS HT LF    FF CR    */
        F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
        /*                              ESC          */
        F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
        /*            NEL                            */
        X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
        X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I  /* 0xfX */
    };
    const unsigned char *cursor = (const unsigned char *)buf;
    int seen_multibyte = 0;

    while (*cursor != 0) {
        unsigned char lead = *cursor++;
        int trailing;

        if (lead < 0x80) {
            /* 7-bit byte: accept only characters the table marks as text. */
            if (text_chars[lead] != T)
                return 0;
            continue;
        }

        if (lead < 0xC0)        /* 10xxxxxx cannot open a sequence */
            return 0;

        if (lead < 0xE0)        /* 110xxxxx */
            trailing = 1;
        else if (lead < 0xF0)   /* 1110xxxx */
            trailing = 2;
        else if (lead < 0xF8)   /* 11110xxx */
            trailing = 3;
        else if (lead < 0xFC)   /* 111110xx */
            trailing = 4;
        else if (lead < 0xFE)   /* 1111110x */
            trailing = 5;
        else                    /* 0xFE / 0xFF */
            return 0;

        while (trailing-- > 0) {
            unsigned char cont = *cursor++;

            /* String ended inside a sequence: report what was seen so far. */
            if (cont == 0)
                return seen_multibyte;
            if ((cont & 0xC0) != 0x80)
                return 0;
        }
        seen_multibyte = 1;
    }

    /* Never claim UTF-8 for input that was entirely 7-bit. */
    return seen_multibyte;
}
/*
 * Count the bytes of str that belong to well-formed multi-byte UTF-8
 * sequences.
 *
 * str - NUL-terminated string; must not be NULL (checked with assert).
 *
 * A sequence is counted when its lead byte announces 2..6 bytes (the
 * original 1-6 byte UTF-8 layout), all of them lie inside the string, and
 * every trailing byte has the form 10xxxxxx.  ASCII bytes, stray
 * continuation bytes, 0xFE/0xFF and broken or truncated sequences add
 * nothing; scanning resumes at the byte after the rejected lead.
 *
 * Returns the total number of bytes covered by the accepted sequences.
 */
int count_UTF8(const char * str)
{
    assert(str != NULL);

    /* Bytes are handled as unsigned so that masks and shifts behave the same
     * whether plain char is signed or unsigned. */
    const unsigned char *bytes = (const unsigned char *)str;
    int len = (int)strlen(str);
    int counter = 0;
    int i;

    for (i = 0; i < len; ++i) {
        unsigned char lead = bytes[i];
        unsigned char mask = 0x80;
        int seq_len = 0;
        int k;

        if (!(lead & 0x80))
            continue;               /* ASCII */

        /* The number of leading 1 bits is the announced sequence length. */
        while (lead & mask) {
            ++seq_len;
            mask >>= 1;             /* reaches 0 after 8 shifts, ending the loop */
        }

        if (seq_len < 2 || seq_len > 6)
            continue;               /* continuation byte or 0xFE/0xFF as lead */
        if (i + seq_len > len)
            continue;               /* sequence would run past the end */

        for (k = 1; k < seq_len; ++k) {
            if ((bytes[i + k] & 0xC0) != 0x80)
                break;
        }
        if (k == seq_len) {
            counter += seq_len;
            i += seq_len - 1;       /* the loop increment skips the lead byte */
        }
    }
    return counter;
}
#ifndef ICONV_CONST
#define ICONV_CONST const
#endif
/*
 * Convert a byte buffer from one character encoding to another with iconv.
 *
 * from    - name of the source encoding
 * to      - name of the target encoding
 * save    - output buffer
 * savelen - capacity of save in bytes
 * src     - input bytes
 * srclen  - number of input bytes
 *
 * On success returns the number of bytes written to save.  A terminating
 * NUL is appended when there is room for it and is not included in the
 * returned count.  If the input ends in the middle of a multi-byte
 * sequence (EINVAL), the bytes converted up to that point are kept and
 * their count is returned.
 *
 * Error returns:
 *   -1  NULL argument or srclen <= 0
 *   -2  iconv_open() rejected the encoding pair
 *   -3  invalid byte sequence in the input (EILSEQ)
 *   -5  output buffer too small (E2BIG, or savelen <= 0)
 *   -6  any other iconv() failure
 *
 * Diagnostic lines for EILSEQ/EINVAL/E2BIG are printed to stdout.
 */
int
convert(const char *from, const char *to, char* save, int savelen, char *src, int srclen)
{
    iconv_t cd;
    ICONV_CONST char *in_cursor = src;
    char *out_cursor = save;
    size_t in_left;
    size_t out_left;
    int status;

    if (from == NULL || to == NULL || save == NULL || src == NULL || srclen <= 0)
        return -1;
    if (savelen <= 0)
        return -5;

    cd = iconv_open(to, from);
    if (cd == (iconv_t)-1)
        return -2;

    in_left = (size_t)srclen;
    out_left = (size_t)savelen;

    while (in_left > 0) {
        /* iconv() declares its input argument as `char **` on some systems
         * and `const char **` on others; a void * converts to either. */
        size_t res = iconv(cd, (void *)&in_cursor, &in_left, &out_cursor, &out_left);

        if (res != (size_t)(-1))
            continue;

        if (errno == EILSEQ) {
            printf("errno:EILSEQ(InBuf Multi byte order is invalid)\r\n");
            /* Retrying would fail on the same bytes forever, so stop here. */
            status = -3;
            goto done;
        } else if (errno == EINVAL) {
            printf("errno:EINVAL(No residual byte conversion)\r\n");
            break;      /* incomplete trailing sequence: keep what was converted */
        } else if (errno == E2BIG) {
            printf("errno:E2BGI(The OutBuf space is not enough)\r\n");
            status = -5;
            goto done;
        } else {
            status = -6;
            goto done;
        }
    }

    /* Emit any pending shift sequence required by stateful target encodings. */
    if (iconv(cd, NULL, NULL, &out_cursor, &out_left) == (size_t)(-1)) {
        status = (errno == E2BIG) ? -5 : -6;
        goto done;
    }

    /* iconv() never terminates its output; do so when space remains. */
    if (out_left > 0)
        *out_cursor = '\0';
    status = (int)(out_cursor - save);

done:
    iconv_close(cd);
    return status;
}