UTF-8与Unicode互相转换
根据如下表格
Unicode | UTF-8 |
---|---|
U+0000 - U+007F | 0xxxxxxx (1个字节) |
U+0080 - U+07FF | 110xxxxx 10xxxxxx (2个字节) |
U+0800 - U+FFFF | 1110xxxx 10xxxxxx 10xxxxxx (3个字节) |
U+10000 - U+10FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (4个字节) |
编写如下C语言代码
// One decoded character value. Note this is an `unsigned int`, not a
// byte-sized char, despite the name.
typedef unsigned uchar;
// Growable array of uchar values, as produced by getFromBytes.
typedef struct {
// Heap storage for the elements; getFromBytes sets it to NULL after freeing
// it on a decoding error.
uchar* buffer;
// Number of elements currently stored in buffer.
unsigned long long length;
// Number of uchar elements buffer was allocated for.
unsigned long long size;
} ustring;
// utf-8转unicode
ustring getFromBytes(char* bytes, void errorHandler(char* errorMsg)) {
ustring string;
string.buffer = (uchar*)malloc(sizeof(uchar) * INIT_STRING_SIZE);
string.size = INIT_STRING_SIZE;
string.length = 0;
size_t bytesSize = strlen(bytes);
for (size_t ptr = 0; ptr < bytesSize; ptr++) {
uchar ch = 0;
unsigned char byte = bytes[ptr];
if ((byte & 0xf0) == 0xf0) {
if (ptr + 3 >= bytesSize) {
free(string.buffer);
string.buffer = NULL;
errorHandler(UTF8_ERROR_MSG);
}
ch = ((byte & 7) << 18) + ((bytes[ptr + 1] & 0x3f) << 12) + ((bytes[ptr + 2] & 0x3f) << 6) + (bytes[ptr + 3] & 0x3f);
ptr += 3;
} else if ((byte & 0xe0) == 0xe0) {
if (ptr + 2 >= bytesSize) {
free(string.buffer);
string.buffer = NULL;
errorHandler(UTF8_ERROR_MSG);
}
ch = ((byte & 0xf) << 12) + ((bytes[ptr + 1] & 0x3f) << 6) + (bytes[ptr + 2] & 0x3f);
ptr += 2;
} else if ((byte & 0xc0) == 0xc0) {
if (ptr + 1 >= bytesSize) {
free(string.buffer);
string.buffer = NULL;
errorHandler(UTF8_ERROR_MSG);
}
ch = ((byte & 0x1f) << 6) + (bytes[ptr + 1] & 0x3f);
ptr++;
} else if (byte < 0x80) {
ch = byte;
} else {
free(string.buffer);
string.buffer = NULL;
errorHandler(UTF8_ERROR_MSG);
}
if (string.length >= string.size) {
string.size << 1;
uchar* buffer = (uchar*)malloc(sizeof(uchar) * string.size);
memcpy(buffer, string.buffer, string.length);
free(string.buffer);
string.buffer = buffer;
}
string.buffer[string.length++] = ch;
}
return string;
}
// Unicode -> UTF-8.
//
// Writes the UTF-8 encoding of the code point `ch` to stdout: one byte below
// 0x80, two below 0x800, three below 0x10000, four below 0x110000. Values of
// 0x110000 and above produce no output.
void printuchar(uchar ch) {
    char encoded[5];
    int tail; // number of continuation bytes after the lead byte

    if (ch < 0x80) {
        putchar(ch);
        return;
    }

    // Pick the lead-byte marker and store the topmost payload bits in it.
    if (ch < 0x800) {
        encoded[0] = 0xc0 | (ch >> 6);
        tail = 1;
    } else if (ch < 0x10000) {
        encoded[0] = 0xe0 | (ch >> 12);
        tail = 2;
    } else if (ch < 0x110000) {
        encoded[0] = 0xf0 | (ch >> 18);
        tail = 3;
    } else {
        return;
    }

    // Each continuation byte is 10xxxxxx carrying the next six payload bits,
    // most significant group first.
    for (int i = 1; i <= tail; i++) {
        encoded[i] = 0x80 | ((ch >> (6 * (tail - i))) & 0x3f);
    }
    encoded[tail + 1] = 0;
    printf("%s", encoded);
}