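/*
 * UTF-8 转 UNICODE，C 代码，自用，转发请注明出处。
 * (UTF-8 to Unicode escape conversion in C, for personal use; please credit
 * the source when reposting.)
 * 2020/12/5 修改 4 字节转化代码。
 * (2020-12-05: revised the 4-byte conversion code.)
 */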
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "stdint.h"
#include "stdlib.h"
#include "string.h"
#include "stdio.h"
#pragma warning(disable:4996)
/*
 * UTF-8 lead-byte classification: a byte b introduces an N-byte sequence when
 * (b & UNICODE_N_BYTE_MASK) == UNICODE_N_BYTE_MASK_VALUE.
 */
#define UNICODE_1_BYTE_MASK 0x80        /* 0xxxxxxx */
#define UNICODE_1_BYTE_MASK_VALUE 0x00
#define UNICODE_2_BYTE_MASK 0xe0        /* 110xxxxx */
#define UNICODE_2_BYTE_MASK_VALUE 0xc0
#define UNICODE_3_BYTE_MASK 0xf0        /* 1110xxxx */
#define UNICODE_3_BYTE_MASK_VALUE 0xe0
/* 11110xxx: all five top bits must be tested, otherwise 0xF8..0xFF slip through. */
#define UNICODE_4_BYTE_MASK 0xf8
#define UNICODE_4_BYTE_MASK_VALUE 0xf0
/* Continuation bytes have the form 10xxxxxx. */
#define UNICODE_COMMON_MASK 0xc0
#define UNICODE_COMMON_MASK_VALUE 0x80

/* Returns the sequence length (1..4) announced by a lead byte, or 0 if the
 * byte cannot start a UTF-8 sequence. */
static size_t utf8_sequence_length(uint8_t lead) {
    if ((lead & UNICODE_1_BYTE_MASK) == UNICODE_1_BYTE_MASK_VALUE) {
        return 1;
    }
    if ((lead & UNICODE_2_BYTE_MASK) == UNICODE_2_BYTE_MASK_VALUE) {
        return 2;
    }
    if ((lead & UNICODE_3_BYTE_MASK) == UNICODE_3_BYTE_MASK_VALUE) {
        return 3;
    }
    if ((lead & UNICODE_4_BYTE_MASK) == UNICODE_4_BYTE_MASK_VALUE) {
        return 4;
    }
    return 0;
}

/* Prints `count` bytes as a hex trace line on stdout. `need` is the announced
 * sequence length; it only selects the line ending, because the historical
 * trace format has a trailing space for 1..3-byte sequences but not for
 * 4-byte ones. */
static void utf8_trace_bytes(const uint8_t *bytes, size_t count, size_t need) {
    for (size_t i = 0; i < count; i++) {
        printf(" %02x", bytes[i]);
    }
    fputs(need < 4 ? " \n" : "\n", stdout);
}

/*
 * Converts the NUL-terminated UTF-8 string `a` into a string of "\uXXXX"
 * escapes (lowercase hex, at least four digits) and stores it in `r`.
 *
 * a: input string.
 * r: output buffer; the caller must provide room for up to
 *    6 * strlen(a) + 1 bytes.
 *
 * Behaviour:
 *  - Every decoded sequence is also dumped as a hex trace line on stdout.
 *  - On a malformed sequence, "error N" is printed to stdout (N = announced
 *    sequence length, or 5 for an invalid lead byte), conversion stops, and
 *    `r` receives the escapes produced so far (possibly the empty string).
 *  - Code points above U+FFFF are written as "\u" followed by 5 or 6 hex
 *    digits; they are NOT split into UTF-16 surrogate pairs.
 *  - Overlong encodings and encoded surrogates are not rejected.
 *  - If either argument is NULL, or the scratch buffer cannot be allocated,
 *    the function returns without writing to `r`.
 */
void toUnicode(char* a, char* r) {
    /* Payload bits carried by the lead byte, indexed by sequence length. */
    static const uint8_t lead_payload_mask[5] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };

    if (a == NULL || r == NULL) {
        return;
    }

    const uint8_t *src = (const uint8_t *)a;
    size_t src_len = strlen(a);

    /* Worst case is 6 output characters per input byte, plus the terminator. */
    if (src_len > (SIZE_MAX - 1) / 6) {
        return;
    }
    size_t cap = src_len * 6 + 1;

    /* A scratch buffer is used so that `r` is only written once, at the end. */
    char *temp = malloc(cap);
    if (temp == NULL) {
        return;
    }
    temp[0] = '\0'; /* keeps the result well-defined for empty/invalid input */

    size_t in_pos = 0;
    size_t out_pos = 0;
    while (src[in_pos] != 0) {
        size_t need = utf8_sequence_length(src[in_pos]);
        if (need == 0) {
            printf("error 5");
            break;
        }

        /* Count the lead byte plus the valid continuation bytes. The scan
         * stops at the first non-continuation byte (including the NUL), so it
         * never reads past the end of the string. */
        size_t valid = 1;
        while (valid < need &&
               (src[in_pos + valid] & UNICODE_COMMON_MASK) == UNICODE_COMMON_MASK_VALUE) {
            valid++;
        }

        /* Trace only bytes that were actually inspected above. */
        utf8_trace_bytes(src + in_pos, valid < need ? valid + 1 : need, need);

        if (valid < need) {
            printf("error %u", (unsigned)need);
            break;
        }

        uint32_t unicode = (uint32_t)(src[in_pos] & lead_payload_mask[need]);
        for (size_t i = 1; i < need; i++) {
            unicode = (unicode << 6) | (uint32_t)(src[in_pos + i] & 0x3f);
        }

        int len = snprintf(temp + out_pos, cap - out_pos, "\\u%04x", (unsigned)unicode);
        if (len < 0 || (size_t)len >= cap - out_pos) {
            /* Cannot happen with the capacity computed above; stay safe anyway. */
            temp[out_pos] = '\0';
            break;
        }

        out_pos += (size_t)len;
        in_pos += need;
    }

    memcpy(r, temp, out_pos + 1);
    free(temp);
}