#include <stdio.h>
#include <stdlib.h>
// Checks whether text[0 .. size-1] is well-formed UTF-8.
//
// Besides the lead/continuation bit patterns, this rejects the sequences that
// the Unicode standard declares ill-formed:
//   - overlong encodings (lead bytes C0/C1, E0 followed by 80..9F,
//     F0 followed by 80..8F),
//   - UTF-16 surrogates U+D800..U+DFFF (ED followed by A0..BF),
//   - code points above U+10FFFF (F4 followed by 90..BF, lead bytes F5..FF),
//   - sequences truncated by the end of the buffer.
//
// Parameters:
//   text - bytes to check; may be NULL only when size <= 0.
//   size - number of bytes in text; size <= 0 is treated as empty input.
// Returns 1 if the input is well-formed (empty input is valid), 0 otherwise.
int is_valid_utf8(unsigned char* text, int size) {
    if (text == NULL) {
        return size <= 0;
    }

    int i = 0;
    while (i < size) {
        unsigned char lead = text[i];
        int length;
        // Permitted range of the first continuation byte; narrowed below for
        // the lead bytes that would otherwise admit ill-formed values.
        unsigned char first_min = 0x80;
        unsigned char first_max = 0xBF;

        if (lead < 0x80) {
            // Single-byte (ASCII) character.
            i += 1;
            continue;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            // 2-byte sequence; C0 and C1 could only encode overlong ASCII.
            length = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // 3-byte sequence.
            length = 3;
            if (lead == 0xE0) {
                first_min = 0xA0;  // below this the value fits in 2 bytes
            } else if (lead == 0xED) {
                first_max = 0x9F;  // above this lies the surrogate range
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // 4-byte sequence.
            length = 4;
            if (lead == 0xF0) {
                first_min = 0x90;  // below this the value fits in 3 bytes
            } else if (lead == 0xF4) {
                first_max = 0x8F;  // above this the value exceeds U+10FFFF
            }
        } else {
            // Stray continuation byte, C0/C1, or F5..FF.
            return 0;
        }

        // Compare against the remaining byte count instead of computing
        // i + length, which could overflow for sizes close to INT_MAX.
        if (length > size - i) {
            return 0;
        }
        if (text[i + 1] < first_min || text[i + 1] > first_max) {
            return 0;
        }
        for (int k = 2; k < length; k++) {
            if ((text[i + k] & 0xC0) != 0x80) {
                return 0;
            }
        }
        i += length;
    }
    return 1;
}
// Decodes the UTF-8 sequence that starts at text[pos].
// Returns the number of bytes consumed (1..4) and stores the code point in
// *code_point, or returns 0 when the sequence is truncated, has a bad lead or
// continuation byte, is overlong, encodes a surrogate, or exceeds U+10FFFF.
static int utf8_decode_one(const unsigned char *text, int size, int pos,
                           unsigned long *code_point)
{
    unsigned char lead = text[pos];
    int length;
    unsigned long value;
    unsigned long smallest;  // smallest code point that needs `length` bytes

    if (lead < 0x80) {
        *code_point = lead;
        return 1;
    } else if ((lead & 0xE0) == 0xC0) {
        length = 2;
        value = lead & 0x1F;
        smallest = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        length = 3;
        value = lead & 0x0F;
        smallest = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        length = 4;
        value = lead & 0x07;
        smallest = 0x10000;
    } else {
        return 0;
    }

    if (length > size - pos) {
        return 0;  // sequence runs past the end of the buffer
    }
    for (int k = 1; k < length; k++) {
        unsigned char cont = text[pos + k];
        if ((cont & 0xC0) != 0x80) {
            return 0;
        }
        value = (value << 6) | (unsigned long)(cont & 0x3F);
    }

    if (value < smallest) {
        return 0;  // overlong encoding
    }
    if (value >= 0xD800 && value <= 0xDFFF) {
        return 0;  // surrogates are not valid scalar values
    }
    if (value > 0x10FFFF) {
        return 0;  // not representable in UTF-16
    }

    *code_point = value;
    return length;
}

// Walks the whole input once and returns the number of UTF-16 code units it
// produces, or -1 if the input is not well-formed UTF-8. When out is non-NULL
// the code units are also written to it.
static int utf8_to_utf16_pass(const unsigned char *text, int size,
                              unsigned short *out)
{
    int units = 0;
    int i = 0;

    while (i < size) {
        unsigned long cp = 0;
        int consumed = utf8_decode_one(text, size, i, &cp);
        if (consumed == 0) {
            return -1;
        }
        if (cp > 0xFFFF) {
            // Supplementary plane: encode as a high/low surrogate pair.
            if (out != NULL) {
                unsigned long offset = cp - 0x10000;
                out[units] = (unsigned short)(0xD800 | (offset >> 10));
                out[units + 1] = (unsigned short)(0xDC00 | (offset & 0x3FF));
            }
            units += 2;
        } else {
            if (out != NULL) {
                out[units] = (unsigned short)cp;
            }
            units += 1;
        }
        i += consumed;
    }
    return units;
}

// Converts a UTF-8 byte sequence to UTF-16 code units.
//
// Parameters:
//   text       - input bytes; may be NULL only when size <= 0.
//   size       - number of input bytes; size <= 0 is treated as empty input.
//   utf16_size - out: number of code units in the returned buffer. It is set
//                to 0 whenever NULL is returned.
//
// Returns a buffer obtained from malloc that the caller must release with
// free(), or NULL on failure. The buffer is not NUL-terminated. Code points
// above U+FFFF (4-byte UTF-8 sequences) are emitted as surrogate pairs, i.e.
// two code units each. Empty input yields a non-NULL buffer and
// *utf16_size == 0.
//
// On failure a message is printed to stdout: invalid arguments, ill-formed
// UTF-8 (see utf8_decode_one for what is rejected), or allocation failure.
unsigned short* utf8_to_utf16(unsigned char* text, int size, int* utf16_size) {
    if (utf16_size == NULL) {
        printf("Error: Invalid argument.\n");
        return NULL;
    }
    *utf16_size = 0;
    if (text == NULL && size > 0) {
        printf("Error: Invalid argument.\n");
        return NULL;
    }

    // First pass: validate and count the code units needed.
    int units = utf8_to_utf16_pass(text, size, NULL);
    if (units < 0) {
        printf("Error: Invalid UTF-8 encoding.\n");
        return NULL;
    }

    // Always request at least one element: malloc(0) may legitimately return
    // NULL, which would be indistinguishable from an allocation failure.
    size_t capacity = units > 0 ? (size_t)units : 1;
    unsigned short *utf16_text = malloc(capacity * sizeof *utf16_text);
    if (utf16_text == NULL) {
        printf("Error: Memory allocation failed.\n");
        return NULL;
    }

    // Second pass: the input was validated above, so this cannot fail and
    // writes exactly `units` code units.
    (void)utf8_to_utf16_pass(text, size, utf16_text);

    *utf16_size = units;
    return utf16_text;
}
// Prints `size` UTF-16 code units to stdout as 4-digit uppercase hexadecimal
// values separated by single spaces, followed by a newline.
// With size <= 0 only the newline is printed.
void print_utf16_text(unsigned short* utf16_text, int size) {
    int idx = 0;
    while (idx < size) {
        // The separator goes in front of every unit except the first, which
        // leaves no trailing space after the last one.
        if (idx > 0) {
            printf(" ");
        }
        printf("%04X", utf16_text[idx]);
        idx++;
    }
    printf("\n");
}
// Demo driver: converts a fixed UTF-8 sample to UTF-16 and prints both forms.
// Returns 0 on success and EXIT_FAILURE if the conversion fails.
int main(void) {
    // Sample input: six 3-byte UTF-8 sequences followed by ASCII '!' (0x21).
    // The array carries no terminating NUL byte, so its length must be passed
    // explicitly wherever it is used.
    unsigned char utf8_text[] = { 0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xE6, 0xB1, 0x89, 0xE5, 0x86, 0x99, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, 0x21 };
    int utf8_size = (int)sizeof(utf8_text);
    int utf16_size = 0;

    unsigned short* utf16_text = utf8_to_utf16(utf8_text, utf8_size, &utf16_size);
    if (utf16_text == NULL) {
        // utf8_to_utf16 has already printed the reason.
        return EXIT_FAILURE;
    }

    // "%.*s" bounds the read to utf8_size bytes; a plain "%s" would run past
    // the end of the array looking for a NUL terminator.
    printf("UTF-8 text: %.*s\n", utf8_size, (const char*)utf8_text);
    printf("UTF-16 text: ");
    print_utf16_text(utf16_text, utf16_size);

    free(utf16_text);
    return 0;
}
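// UTF-8 to UTF-16 conversion in C: the output buffer is allocated with malloc inside the conversion function, the input is checked for validity, and errors are reported.
// First published 2023-07-08 18:05:58.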