注意：下面各函数中的 char * 参数必须为 unsigned char * 类型。
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; each entry is
 * the number of bytes that follow that first byte in the sequence.
 * Entries of 4 and 5 correspond to the obsolete 5- and 6-byte forms.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20-0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0-0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0-0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0-0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
/*
 * Check whether the `length` bytes starting at `source` form exactly one
 * legal UTF-8 encoded code point.
 *
 * source: first byte of the candidate sequence.
 * length: number of bytes in the sequence; only 1..4 can be legal.
 *
 * Returns 1 if the sequence is legal, 0 otherwise. Rejected inputs include
 * lengths outside 1..4, bad continuation bytes, overlong forms, encoded
 * UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF.
 *
 * The bytes are examined from the last one back to the first; the outer
 * switch deliberately falls through so a sequence of N bytes runs the
 * checks for bytes N, N-1, ..., 1.
 */
static int isLegalUTF8( const unsigned char *source, int length ) {
    unsigned char a;
    const unsigned char *srcptr = source + length;

    switch ( length ) {
    default:
        return 0;
    case 4:
        /* Fourth byte must be a continuation byte (0x80..0xBF). */
        if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0;
        /* fall through */
    case 3:
        /* Third byte must be a continuation byte (0x80..0xBF). */
        if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0;
        /* fall through */
    case 2:
        /* Second byte: the permitted range depends on the lead byte. */
        if ( ( a = ( *--srcptr ) ) > 0xBF ) return 0;
        switch ( *source ) {
        /* no fall-through in this inner switch */
        case 0xE0:
            /* Below 0xA0 would be an overlong 3-byte form. */
            if ( a < 0xA0 ) return 0;
            break;
        case 0xED:
            /* Above 0x9F would encode a surrogate (U+D800..U+DFFF). */
            if ( a > 0x9F ) return 0;
            break;
        case 0xF0:
            /* Below 0x90 would be an overlong 4-byte form. */
            if ( a < 0x90 ) return 0;
            break;
        case 0xF4:
            /* Above 0x8F would exceed U+10FFFF. */
            if ( a > 0x8F ) return 0;
            break;
        default:
            if ( a < 0x80 ) return 0;
            break;
        }
        /* fall through */
    case 1:
        /* 0x80..0xBF are continuation bytes; 0xC0/0xC1 are overlong leads. */
        if ( *source >= 0x80 && *source < 0xC2 ) return 0;
        /* Lead bytes above 0xF4 cannot start a legal sequence. */
        if ( *source > 0xF4 ) return 0;
    }
    return 1;
}
/*
 * Validate that a byte buffer consists entirely of legal UTF-8 sequences.
 *
 * string: bytes to check; must be unsigned so that bytes >= 0x80 index the
 *         trailingBytesForUTF8 table correctly.
 * length: number of bytes in `string`.
 *
 * Returns OK when every sequence in the buffer is complete and legal, and
 * ERROR at the first truncated or illegal sequence. A buffer with
 * length <= 0 is reported as OK because the loop never runs.
 */
static int bson_validate_string( const unsigned char *string,
                                 const int length ) {
    int position = 0;

    while ( position < length ) {
        /* The lead byte determines how many bytes this sequence occupies. */
        const int sequence_length = trailingBytesForUTF8[string[position]] + 1;

        /* Reject a sequence that would run past the end of the buffer. */
        if ( sequence_length > length - position ) {
            return ERROR;
        }
        if ( !isLegalUTF8( string + position, sequence_length ) ) {
            return ERROR;
        }
        position += sequence_length;
    }

    /* Success is reported only after the whole buffer has been scanned. */
    return OK;
}
UTF-8编码识别
最新推荐文章于 2024-03-19 17:13:46 发布