Android 源码在线查看
https://cs.android.com/
现象特征
- 偶现且频率极低。
- 危害性较大,应用直接崩溃。
问题定位
- 以博文题目作为关键字搜索,有大把的分析(如:https://blog.csdn.net/liuzehn/article/details/89852076),不再赘述。
- 我看到的几乎所有文章都指出,该问题的根源在于:错误是由 Java 虚拟机内部 dalvik/vm/CheckJni.c 中的 checkUtfString 函数抛出的。但我在源码中没有找到 dalvik/vm/ 这个路径,最后在 art/runtime/jni/check_jni.cc 中找到了对应的检查方法。
- 借鉴 art/runtime/jni/check_jni.cc 中 CheckUtfBytes 方法的实现,在自己的 JNI 层调用 NewStringUTF 之前,先检查字符串是否为合法的 Modified UTF-8。
- 检测方法如下:
// Reports whether the NUL-terminated string |bytes| is structurally valid
// modified UTF-8, so that it can be handed to JNI NewStringUTF safely.
//
// Only the shape of each sequence is validated: the lead byte must announce a
// 1-, 2-, 3- or 4-byte sequence, and every following byte of that sequence
// must match the continuation pattern 10xxxxxx. Four-byte sequences are
// accepted in place of encoded surrogate pairs.
//
// @param bytes  NUL-terminated byte string to validate; may be nullptr.
// @return true when every sequence is well formed (the empty string is
//         valid); false on an illegal lead byte, a missing or malformed
//         continuation byte, or when |bytes| is nullptr. A null pointer is
//         rejected rather than dereferenced; callers that want to forward a
//         null to JNI must test for it themselves.
bool CheckUtfBytes(const char * bytes)
{
    if (bytes == nullptr) {
        return false;
    }

    const uint8_t* cursor = reinterpret_cast<const uint8_t*>(bytes);
    while (*cursor != 0) {
        const uint8_t lead = *cursor++;

        // Number of continuation bytes announced by the lead byte.
        int trailing = 0;
        if ((lead & 0x80) == 0x00) {
            // Bit pattern 0xxx xxxx: single byte, nothing more to consume.
            trailing = 0;
        } else if ((lead & 0xe0) == 0xc0) {
            // Bit pattern 110x xxxx: one additional byte.
            trailing = 1;
        } else if ((lead & 0xf0) == 0xe0) {
            // Bit pattern 1110 xxxx: two additional bytes.
            trailing = 2;
        } else if ((lead & 0xf8) == 0xf0) {
            // Bit pattern 1111 0xxx: start of a 4 byte sequence.
            trailing = 3;
        } else {
            // Bit patterns 10xx xxxx and 1111 1xxx are illegal start bytes.
            return false;
        }

        for (; trailing > 0; --trailing) {
            // A terminating NUL fails this test as well, so a truncated
            // sequence is rejected before the scan can run past the end of
            // the string.
            if ((*cursor++ & 0xc0) != 0x80) {
                return false;
            }
        }
    }
    return true;
}
- Android 源码 CheckUtfBytes 如下:
// Checks whether |bytes| is valid modified UTF-8. We also accept 4 byte UTF
// sequences in place of encoded surrogate pairs.
//
// |bytes| is scanned up to its terminating NUL. Only the shape of each
// sequence is checked: the high bits of the lead byte select how many
// continuation bytes follow, and each of those must match 10xxxxxx.
//
// Returns nullptr when every sequence is well formed. Otherwise returns a
// pointer to the offending byte and sets |*errorKind| to "start" (illegal
// lead byte) or "continuation" (a byte that should have been 10xxxxxx but is
// not). |*errorKind| is not written on success.
static const uint8_t* CheckUtfBytes(const char* bytes, const char** errorKind) {
while (*bytes != '\0') {
// Read the lead byte and step past it.
const uint8_t* utf8 = reinterpret_cast<const uint8_t*>(bytes++);
// Switch on the high four bits.
switch (*utf8 >> 4) {
case 0x00:
case 0x01:
case 0x02:
case 0x03:
case 0x04:
case 0x05:
case 0x06:
case 0x07:
// Bit pattern 0xxx. No need for any extra bytes.
break;
case 0x08:
case 0x09:
case 0x0a:
case 0x0b:
// Bit patterns 10xx, which are illegal start bytes.
*errorKind = "start";
return utf8;
case 0x0f:
// Bit pattern 1111, which might be the start of a 4 byte sequence.
if ((*utf8 & 0x08) == 0) {
// Bit pattern 1111 0xxx, which is the start of a 4 byte sequence.
// We consume one continuation byte here, and fall through to consume two more.
utf8 = reinterpret_cast<const uint8_t*>(bytes++);
// A terminating NUL also fails this test, so a truncated sequence is
// reported here instead of being read past.
if ((*utf8 & 0xc0) != 0x80) {
*errorKind = "continuation";
return utf8;
}
} else {
// Bit pattern 1111 1xxx is not a legal lead byte; utf8 still points at it.
*errorKind = "start";
return utf8;
}
// Fall through to the cases below to consume two more continuation bytes.
// NOTE(review): FALLTHROUGH_INTENDED is not defined in this excerpt;
// presumably a project macro that marks deliberate fall-through - confirm.
FALLTHROUGH_INTENDED;
case 0x0e:
// Bit pattern 1110, so there are two additional bytes.
utf8 = reinterpret_cast<const uint8_t*>(bytes++);
if ((*utf8 & 0xc0) != 0x80) {
*errorKind = "continuation";
return utf8;
}
// Fall through to consume one more continuation byte.
FALLTHROUGH_INTENDED;
case 0x0c:
case 0x0d:
// Bit pattern 110x, so there is one additional byte.
utf8 = reinterpret_cast<const uint8_t*>(bytes++);
if ((*utf8 & 0xc0) != 0x80) {
*errorKind = "continuation";
return utf8;
}
break;
}
}
// Reached the terminating NUL without finding a malformed sequence.
return nullptr;
}