// Decode a single UTF-8 sequence starting at p.
//
// p    - first byte of the sequence. This byte is read without consulting
//        `end`, so the caller must guarantee it is readable.
// end  - one past the last byte that may be examined; continuation bytes
//        at or beyond `end` are never read.
// len  - if non-NULL, receives the number of bytes consumed (1..4).
//
// Returns the decoded code point. On any malformed or truncated sequence
// the function stores 1 in *len and returns 0xfffd (REPLACEMENT CHARACTER),
// so a caller can always advance by *len and make progress.
//
// Rejected input: lead bytes 0x80..0xc1 and above 0xf4, missing or invalid
// continuation bytes, overlong forms (E0 followed by < A0, F0 followed by
// < 90) and values past 0x10ffff (F4 followed by > 8F). When
// STRICT_RFC3629 is non-zero, surrogates (ED A0..BF xx) and code points
// ending in fffe/ffff are rejected as well.
unsigned utf8decode(const char* p, const char* end, int* len)
{
  const unsigned char* s = (const unsigned char*)p;
  const unsigned char c = s[0];
  int ignored_len;

  // Allow callers that only want the code point to pass NULL.
  if (!len) len = &ignored_len;

  if (c < 0x80) {
    *len = 1;
    return c;
  }

  // 0x80..0xbf are stray continuation bytes; 0xc0/0xc1 can only start
  // overlong encodings of ASCII.
  if (c < 0xc2) goto FAIL;

  // Every remaining form needs at least one continuation byte.
  if (p + 1 >= end || (s[1] & 0xc0) != 0x80) goto FAIL;

  if (c < 0xe0) {
    *len = 2;
    return ((unsigned)(c & 0x1f) << 6) |
           (unsigned)(s[1] & 0x3f);
  }

  if (c < 0xf0) {
    // Three-byte form.
    if (c == 0xe0 && s[1] < 0xa0) goto FAIL; // overlong
#if STRICT_RFC3629
    // RFC 3629 says surrogate chars are illegal.
    if (c == 0xed && s[1] >= 0xa0) goto FAIL;
#endif
    if (p + 2 >= end || (s[2] & 0xc0) != 0x80) goto FAIL;
#if STRICT_RFC3629
    // 0xfffe and 0xffff are also illegal characters. This test must come
    // after the bounds check above so s[2] is known to be readable.
    if (c == 0xef && s[1] == 0xbf && s[2] >= 0xbe) goto FAIL;
#endif
    *len = 3;
    return ((unsigned)(c & 0x0f) << 12) |
           ((unsigned)(s[1] & 0x3f) << 6) |
           (unsigned)(s[2] & 0x3f);
  }

  if (c <= 0xf4) {
    // Four-byte form.
    if (c == 0xf0 && s[1] < 0x90) goto FAIL; // overlong
    if (c == 0xf4 && s[1] > 0x8f) goto FAIL; // after 0x10ffff
    if (p + 3 >= end || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80)
      goto FAIL;
#if STRICT_RFC3629
    // RFC 3629 says all codes ending in fffe or ffff are illegal:
    if ((s[1] & 0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe) goto FAIL;
#endif
    *len = 4;
    return ((unsigned)(c & 0x07) << 18) |
           ((unsigned)(s[1] & 0x3f) << 12) |
           ((unsigned)(s[2] & 0x3f) << 6) |
           (unsigned)(s[3] & 0x3f);
  }

FAIL:
  *len = 1;
  return 0xfffd; // Unicode REPLACEMENT CHARACTER
}
// Check that every non-ASCII byte in the NUL-terminated string `src`
// starts a sequence that utf8decode() accepts as a multi-byte character.
//
// Returns false if `src` is NULL or if utf8decode() reports a length
// below 2 for any byte with the high bit set (i.e. the sequence is
// malformed or truncated). Returns true otherwise, which includes the
// empty string and strings made only of 7-bit bytes.
//
// NOTE(review): the name suggests "contains wide characters", but a pure
// ASCII string has always produced true here; confirm with callers before
// tightening the result.
bool Is_wchar(const char* src )
{
  if (!src) return false;

  const char* cur = src;
  // Keep the length as a pointer offset so long strings are not truncated
  // by a narrowing conversion.
  const char* stop = src + strlen(src);

  while (cur < stop) {
    if (*cur & 0x80) {
      int len = 0;
      utf8decode(cur, stop, &len);
      // utf8decode() signals a bad sequence by consuming a single byte.
      if (len < 2) return false;
      cur += len;
    } else {
      cur++;
    }
  }
  return true;
}