Test Code
#include <iostream>
#include <tchar.h> /* _tprintf */
#define TEST 3
#if TEST == 1
#pragma execution_character_set("GBK")
/*
 * TEST 1: walk a string literal byte by byte using the UTF-8 lead-byte
 * rule and cut it after at most 10 "characters".
 *
 * The literal is presumably stored as GBK here (see the pragma above this
 * function -- confirm against the compiler's actual execution charset).
 * The UTF-8 rule is applied on purpose: it shows that the rule does not
 * fit a double-byte encoding, so the cut can land in the middle of a
 * character.
 */
int main() {
    unsigned char buff[] = "壹贰叁肆伍陆柒捌玖十百千万";
    /* Index of the terminating NUL; the scan must never move past it. */
    const int last = (int)sizeof(buff) - 1;
    int i = 0;
    int count = 0;

    printf("%s\n", (const char *)buff);

    while ((i < last) && (buff[i] != '\0') && (count < 10))
    {
        /* Lead-byte classes are inclusive: 0xC0, 0xE0, 0xF0, ... start a class. */
        if (buff[i] >= 0xFC) { i += 6; }
        else if (buff[i] >= 0xF8) { i += 5; printf("0xF8 got\n"); }
        else if (buff[i] >= 0xF0) { i += 4; printf("0xF0 got\n"); }
        else if (buff[i] >= 0xE0) { i += 3; printf("0xE0 got\n"); }
        else if (buff[i] >= 0xC0) { i += 2; printf("0xC0 got\n"); }
        else { i += 1; }
        count++;
    }

    /* A multi-byte step may overshoot the end of the data; clamp so the
     * truncating write below always stays inside the array. */
    if (i > last) { i = last; }
    buff[i] = '\0';
    printf("%d %d %s\n", i, count, (const char *)buff);
    return 0;
}
#elif TEST == 2
#pragma execution_character_set("utf-8")
/*
 * TEST 2: print a UTF-8 string literal, then truncate it to at most 10
 * characters by reading each character's length from its lead byte.
 *
 * The console is switched to code page 65001 first so that the UTF-8
 * bytes are displayed correctly.
 */
int main() {
    system("chcp 65001");

    unsigned char buff[] = "壹贰叁肆伍陆柒捌玖十百千万";
    /* Index of the terminating NUL; the scan must never move past it. */
    const int last = (int)sizeof(buff) - 1;
    int i = 0;
    int count = 0;

    printf("%s\n", (const char *)buff);

    while ((i < last) && (buff[i] != '\0') && (count < 10))
    {
        /* Lead-byte classes are inclusive: 0xE0 already starts a 3-byte
         * sequence and 0xF0 a 4-byte one, so compare with >=, not >. */
        if (buff[i] >= 0xFC) { i += 6; }
        else if (buff[i] >= 0xF8) { i += 5; printf("0xF8 got\n"); }
        else if (buff[i] >= 0xF0) { i += 4; printf("0xF0 got\n"); }
        else if (buff[i] >= 0xE0) { i += 3; printf("0xE0 got\n"); }
        else if (buff[i] >= 0xC0) { i += 2; printf("0xC0 got\n"); }
        else { i += 1; }
        count++;
    }

    /* A truncated trailing sequence could push the index past the end of
     * the data; clamp so the truncating write stays inside the array. */
    if (i > last) { i = last; }
    buff[i] = '\0';
    printf("%d %d %s\n", i, count, (const char *)buff);
    return 0;
}
#elif TEST == 3
#pragma execution_character_set("GBK")
/*
 * TEST 3: print a narrow (char) string literal through the TCHAR-generic
 * _tprintf.
 */
int main()
{
    char buff[] = "壹贰叁肆伍陆柒捌玖十百千万";
    /* "%S" flips meaning with the build mode (wide string under printf,
     * narrow string under wprintf), so it only matches a char buffer in a
     * Unicode build. "%hs" denotes a narrow string in both modes. */
    _tprintf(_T("%hs"), buff);
    return 0;
}
#endif
Test Result
TEST = 1:
壹贰叁肆伍陆柒捌玖十百千万
0xC0 got
0xC0 got
0xC0 got
0xC0 got
0xC0 got
0xC0 got
0xC0 got
17 10 壹贰叁肆伍陆柒捌?
TEST = 2:
Active code page: 65001
壹贰叁肆伍陆柒捌玖十百千万
0xE0 got
0xE0 got
0xE0 got
0xE0 got
0xE0 got
0xE0 got
0xE0 got
0xE0 got
0xE0 got
0xE0 got
30 10 壹贰叁肆伍陆柒捌玖十
TEST = 3:
壹贰叁肆伍陆柒捌玖十百千万
Summary
Point1 GBK 用双字节编码中文(ASCII 字符仍为单字节),因此代码中按 UTF-8 首字节判断长度的算法不适用;每个中文字符固定占 2 字节,按照这个理解可以简单地获取指定个数的字符
Point2 UTF-8 编码的字符字节数不固定,需要根据首字节的高位来判定该字符占用多少字节,因此有代码中的这个简单算法
Point3 注意代码中关于编码格式的设定
Thanks For Watching, Have a nice day!