这几天帮同事排查一个聊天敏感词过滤的问题。需求是在 NDK 环境下，把用户输入的句子中不健康的词语替换成 *。处理前需要先将用户输入转成 wchar_t*，最初用的是 mbstowcs，在本地 Linux 环境下测试没有问题，但比较坑的是在 NDK 安卓真机环境下不行。后来干脆自己写了一组 UTF-8 与 Unicode 相互转换的函数：utf8sToUnicodes 将 UTF-8 字符串转成 Unicode 码点数组，unicodesToUtf8s 将 Unicode 码点数组转回 UTF-8 字符串。
// 根据 UTF-8 字符的首字节，返回该字符编码所占的字节数
/**
 * Return the length in bytes of the UTF-8 sequence that starts at utfs[0].
 *
 * Only the first byte is inspected.
 *
 * @param utfs  Pointer to the first byte of a UTF-8 sequence (must be readable).
 * @return 1 for 0x00-0xBF (ASCII; stray continuation bytes 0x80-0xBF are also
 *         reported as single bytes), 2 for 0xC0-0xDF, 3 for 0xE0-0xEF,
 *         4 for 0xF0-0xF7, and 0 for 0xF8-0xFF, which cannot start a sequence.
 */
int getUtf8Len(const char* utfs) {
    /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
    const unsigned int lead = (unsigned char)utfs[0];

    if (lead < 0xC0) {
        return 1;
    }
    if (lead < 0xE0) {
        return 2;
    }
    if (lead < 0xF0) {
        return 3;
    }
    if (lead < 0xF8) {
        /* Supplementary planes (e.g. emoji) use four bytes. */
        return 4;
    }
    return 0;
}
// 将组成一个字符的多个 UTF-8 字节转换成 Unicode 码点
/**
 * Decode a single UTF-8 encoded character into its Unicode code point.
 *
 * Continuation bytes are masked but not validated, so malformed input yields
 * an unspecified (but in-range) value rather than an error.
 *
 * @param utf  Pointer to the first byte of the sequence; at least `len` bytes
 *             must be readable.
 * @param len  Length of the sequence in bytes (1 to 4).
 * @return The decoded code point, or 0 when `len` is outside 1..4.
 */
int utf8ToUnicode(const char* utf, int len) {
    /* Work on unsigned bytes: plain char may be signed, which would
     * sign-extend any byte >= 0x80. */
    const unsigned char *bytes = (const unsigned char *)utf;
    unsigned int rst;

    switch (len) {
    case 1:
        rst = bytes[0];
        break;
    case 2:
        /* 110xxxxx 10xxxxxx */
        rst = ((unsigned int)(bytes[0] & 0x1F) << 6)
            | (unsigned int)(bytes[1] & 0x3F);
        break;
    case 3:
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        rst = ((unsigned int)(bytes[0] & 0x0F) << 12)
            | ((unsigned int)(bytes[1] & 0x3F) << 6)
            | (unsigned int)(bytes[2] & 0x3F);
        break;
    case 4:
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        rst = ((unsigned int)(bytes[0] & 0x07) << 18)
            | ((unsigned int)(bytes[1] & 0x3F) << 12)
            | ((unsigned int)(bytes[2] & 0x3F) << 6)
            | (unsigned int)(bytes[3] & 0x3F);
        break;
    default:
        rst = 0;
        break;
    }
    return (int)rst;
}
// 将一个 Unicode 码点转换成 UTF-8 多字节字符
/**
 * Encode one Unicode code point below 0x10000 as a NUL-terminated UTF-8 string.
 *
 * @param unicode  Code point to encode.
 * @param utf      Output buffer of at least 4 bytes (up to 3 encoded bytes
 *                 plus the terminating NUL).
 *
 * Code points >= 0x10000 would need a fourth encoded byte, which does not fit
 * the 4-byte buffer; for those an empty string is written so the buffer is
 * always a valid, terminated string.
 */
void unicodeToUtf(const unsigned int unicode, char* utf) {
    if (unicode < 0x80) {
        /* 0xxxxxxx */
        utf[0] = (char)(unicode & 0x7F);
        utf[1] = '\0';
    } else if (unicode < 0x0800) {
        /* 110xxxxx 10xxxxxx */
        utf[0] = (char)(((unicode >> 6) & 0x1F) | 0xC0);
        utf[1] = (char)((unicode & 0x3F) | 0x80);
        utf[2] = '\0';
    } else if (unicode < 0x010000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        utf[0] = (char)(((unicode >> 12) & 0x0F) | 0xE0);
        utf[1] = (char)(((unicode >> 6) & 0x3F) | 0x80);
        utf[2] = (char)((unicode & 0x3F) | 0x80);
        utf[3] = '\0';
    } else {
        /* Never leave the caller's buffer uninitialised. */
        utf[0] = '\0';
    }
}
// 将 UTF-8 多字节字符串转换成 Unicode 码点数组
/**
 * Decode a NUL-terminated UTF-8 string into a 0-terminated array of Unicode
 * code points.
 *
 * Every loop iteration consumes at least one input byte and writes exactly
 * one element, so the output never exceeds strlen(utfs) elements plus the
 * terminator.
 *
 * @param utfs      NUL-terminated UTF-8 input.
 * @param unicodes  Output array with room for at least strlen(utfs) + 1
 *                  elements.
 * @return Number of code points written, not counting the terminating 0.
 */
int utf8sToUnicodes(const char* utfs, unsigned int * unicodes) {
    /* U+FFFD REPLACEMENT CHARACTER stands in for anything undecodable. */
    const unsigned int replacement = 0xFFFD;
    const int len = (int)strlen(utfs);
    int mindex = 0;
    int windex = 0;

    while (mindex < len) {
        int bytes = getUtf8Len(utfs + mindex);
        unsigned int code;

        if (bytes <= 0 || bytes > len - mindex) {
            /* Invalid lead byte, or a sequence cut off by the end of the
             * string: consume one byte so the loop always makes progress
             * and never reads past the terminating NUL. */
            code = replacement;
            bytes = 1;
        } else {
            code = (unsigned int)utf8ToUnicode(utfs + mindex, bytes);
            if (code == 0) {
                /* No input byte here is NUL, so a decoded 0 means an overlong
                 * or unsupported sequence; storing it would terminate the
                 * output array early. */
                code = replacement;
            }
        }

        unicodes[windex] = code;
        windex++;
        mindex += bytes;
    }
    unicodes[windex] = 0;
    return windex;
}
// 将 Unicode 码点数组转换成 UTF-8 多字节字符串
/**
 * Encode a 0-terminated array of Unicode code points as a NUL-terminated
 * UTF-8 string.
 *
 * @param unicodes  Input array, terminated by an element equal to 0.
 * @param utfs      Output buffer; the caller must provide enough room for the
 *                  encoded text (at most 4 bytes per code point) plus the
 *                  terminating NUL.
 *
 * Values above 0x10FFFF are not valid code points and are skipped.
 */
void unicodesToUtf8s(const unsigned int* unicodes, char* utfs) {
    /* Append through a cursor instead of re-formatting the whole buffer:
     * passing the destination as a sprintf source is undefined behaviour and
     * made the conversion quadratic. */
    char *out = utfs;

    for (int index = 0; unicodes[index] != 0; index++) {
        const unsigned int code = unicodes[index];
        char word[5];

        if (code < 0x10000) {
            unicodeToUtf(code, word);
        } else if (code <= 0x10FFFF) {
            /* unicodeToUtf only covers code points below 0x10000, so the
             * four-byte form is produced here:
             * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            word[0] = (char)(0xF0 | (code >> 18));
            word[1] = (char)(0x80 | ((code >> 12) & 0x3F));
            word[2] = (char)(0x80 | ((code >> 6) & 0x3F));
            word[3] = (char)(0x80 | (code & 0x3F));
            word[4] = '\0';
        } else {
            word[0] = '\0';
        }

        for (const char *src = word; *src != '\0'; src++) {
            *out = *src;
            out++;
        }
    }
    *out = '\0';
}