// Only handles UTF-8 input in which each character is encoded in at most 4 bytes.
#include <iostream>
#include <string>
#include <assert.h>
#include <vector>
/**
 * @brief Splits a NUL-terminated UTF-8 byte string into one std::string per
 *        encoded character and traces each recognised character to stdout.
 *
 * The length of every character (1 to 4 bytes) is taken from the bit pattern
 * of its lead byte. Only bytes that really are continuation bytes (10xxxxxx)
 * are attached to a lead byte, so a malformed or truncated sequence never
 * swallows the well-formed characters that follow it; the bytes consumed so
 * far are emitted as their own element instead.
 *
 * @param chars NUL-terminated byte string; a null pointer yields an empty
 *              result. The buffer is only read, never modified.
 * @return The characters in input order. Bytes that are not a valid lead byte
 *         (e.g. a stray continuation byte) are returned as single-byte
 *         elements and are not traced.
 *
 * @note In builds without NDEBUG, a byte in the range 0xF8..0xFF at a
 *       character boundary trips the assertion below.
 */
std::vector <std::string> read_utf8_onebyone(char *chars) {
    std::vector<std::string> words;
    // Constructing std::string from a null pointer is undefined behaviour.
    if (chars == nullptr) {
        return words;
    }

    const std::string input(chars);
    const std::string::size_type len = input.length();
    std::string::size_type i = 0;
    while (i < len) {
        // Work on the unsigned value so the masks do not depend on whether
        // plain char is signed on this platform.
        const unsigned char lead = static_cast<unsigned char>(input[i]);
        assert((lead & 0xF8) <= 0xF0);

        // Length announced by the lead byte, plus the matching trace label.
        // A null label means "not a lead byte": emitted as one byte, untraced.
        std::string::size_type expected = 1;
        const char *label = nullptr;
        if ((lead & 0x80) == 0x00) {
            label = "one character: ";
        } else if ((lead & 0xE0) == 0xC0) {
            expected = 2;
            label = "two character: ";
        } else if ((lead & 0xF0) == 0xE0) {
            expected = 3;
            label = "three character: ";
        } else if ((lead & 0xF8) == 0xF0) {
            expected = 4;
            label = "four character: ";
        }

        // Attach only genuine continuation bytes; stop early at the end of the
        // input or at the first byte that starts a new character.
        std::string::size_type next = 1;
        while (next < expected && i + next < len &&
               (static_cast<unsigned char>(input[i + next]) & 0xC0) == 0x80) {
            ++next;
        }

        words.push_back(input.substr(i, next));
        if (label != nullptr) {
            std::cout << label << words.back() << std::endl;
        }
        i += next;
    }
    return words;
}