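/*
UTF-8 (Unicode Transformation Format, 8-bit) is a variable-length character encoding that can represent every character in the Unicode character set. The basic rules of the UTF-8 encoding are:
Code points are divided into four ranges:
0x0000 0000 to 0x0000 007F: 0xxxxxxx
0x0000 0080 to 0x0000 07FF: 110xxxxx 10xxxxxx
0x0000 0800 to 0x0000 FFFF: 1110xxxx 10xxxxxx 10xxxxxx
0x0001 0000 to 0x0010 FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
For a multi-byte character, the number of leading 1 bits in the first byte is the number of bytes used to encode the character (a single-byte character starts with 0).
Example: in 110xxxxx 10xxxxxx the leading bits 110 mean the character takes 2 bytes.
Example: in 1110xxxx 10xxxxxx 10xxxxxx the leading bits 1110 mean the character takes 3 bytes.
The first byte is the marker (lead) byte; every following byte starts with the two bits 10.
*/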
#include <algorithm>
#include <cctype>
#include <climits>
#include <codecvt>
#include <cstddef>
#include <cwctype>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include "boost/locale.hpp"
#include <windows.h>
using namespace std;
// File-local helpers converting between UTF-8, GBK and wide strings.
//
// Every helper returns true on success. On failure it returns false and
// leaves the output argument untouched, instead of letting the exception
// raised by the standard conversion machinery escape to the caller.
namespace
{
// Facet type used for the locale-named (GBK) <-> wide conversion.
using CODECVR_BYNAME_TYPE = std::codecvt_byname<wchar_t, char, std::mbstate_t>;

// Locale name passed to the facet. ".936" is the Microsoft CRT spelling for
// code page 936 (GBK); other platforms may not recognise this name, in which
// case the GBK helpers report failure.
const char* const GBK_LOCALE_NAME = ".936";

// std::codecvt_byname declares its destructor protected, but
// std::wstring_convert owns and destroys the facet it is given. Deriving a
// type with a public destructor keeps the code valid on every standard
// library, not only on those that never delete the facet directly.
struct GbkCodecvt : CODECVR_BYNAME_TYPE
{
    GbkCodecvt() : CODECVR_BYNAME_TYPE(GBK_LOCALE_NAME) {}
    ~GbkCodecvt() override = default;
};

// Runs `convert` and maps the exceptions used by the conversion facilities
// (std::range_error for malformed input, std::runtime_error for an unknown
// locale name) to a false return value.
template <class Convert>
bool TryConvert(Convert&& convert)
{
    try
    {
        convert();
        return true;
    }
    catch (const std::runtime_error&)
    {
        return false;
    }
}

// Decodes UTF-8 bytes in `utf8` into `wstr`.
bool UTF8ToWide(std::wstring& wstr, const std::string& utf8)
{
    return TryConvert([&] {
        std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
        wstr = converter.from_bytes(utf8);
    });
}

// Decodes GBK bytes in `gbk` into `wstr`.
bool GBKToWide(std::wstring& wstr, const std::string& gbk)
{
    return TryConvert([&] {
        std::wstring_convert<GbkCodecvt> converter(new GbkCodecvt());
        wstr = converter.from_bytes(gbk);
    });
}

// Encodes `wstr` as UTF-8 into `utf8`.
bool WideToUTF8(std::string& utf8, const std::wstring& wstr)
{
    return TryConvert([&] {
        std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
        utf8 = converter.to_bytes(wstr);
    });
}

// Encodes `wstr` as GBK into `gbk`.
bool WideToGBK(std::string& gbk, const std::wstring& wstr)
{
    return TryConvert([&] {
        std::wstring_convert<GbkCodecvt> converter(new GbkCodecvt());
        gbk = converter.to_bytes(wstr);
    });
}

// Re-encodes GBK text as UTF-8 by going through a wide string.
// Fails if either step fails.
bool GBKToUTF8(std::string& utf8, const std::string& gbk)
{
    std::wstring wstr;
    return GBKToWide(wstr, gbk) && WideToUTF8(utf8, wstr);
}

// Re-encodes UTF-8 text as GBK by going through a wide string.
// Fails if either step fails.
bool UTF8ToGBK(std::string& gbk, const std::string& utf8)
{
    std::wstring wstr;
    return UTF8ToWide(wstr, utf8) && WideToGBK(gbk, wstr);
}
}
// Reports whether `str` is well-formed UTF-8.
//
// Only the four encoding forms listed in the file header (1 to 4 bytes,
// U+0000..U+10FFFF) are accepted. The following are rejected:
//   - lead bytes 0xC0/0xC1 and 0xF5..0xFF (including the obsolete 5- and
//     6-byte forms),
//   - overlong encodings,
//   - UTF-16 surrogates (U+D800..U+DFFF),
//   - sequences cut short by the end of the string.
// An empty string is considered valid.
bool IsStringUTF8(const std::string& str)
{
    const std::size_t length = str.size();
    std::size_t pos = 0;
    while (pos < length)
    {
        const unsigned char lead = static_cast<unsigned char>(str[pos]);
        if (lead < 0x80)
        {
            ++pos; // single-byte (ASCII) character
            continue;
        }

        // Number of continuation bytes, plus the range allowed for the first
        // of them. That range is narrower than 0x80..0xBF for a few lead
        // bytes, which is what excludes overlong forms, surrogates and values
        // above U+10FFFF.
        std::size_t trailing = 0;
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;
        if (lead >= 0xC2 && lead <= 0xDF)
        {
            trailing = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2;
            if (lead == 0xE0)
                second_min = 0xA0; // below this the encoding is overlong
            else if (lead == 0xED)
                second_max = 0x9F; // above this lies the surrogate range
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3;
            if (lead == 0xF0)
                second_min = 0x90; // below this the encoding is overlong
            else if (lead == 0xF4)
                second_max = 0x8F; // above this exceeds U+10FFFF
        }
        else
        {
            // Stray continuation byte or a byte that can never start a
            // sequence.
            return false;
        }

        // The whole sequence must fit in the remaining input.
        if (length - pos <= trailing)
            return false;

        const unsigned char second = static_cast<unsigned char>(str[pos + 1]);
        if (second < second_min || second > second_max)
            return false;

        // Remaining continuation bytes must look like 10xxxxxx.
        for (std::size_t k = 2; k <= trailing; ++k)
        {
            const unsigned char next = static_cast<unsigned char>(str[pos + k]);
            if ((next & 0xC0) != 0x80)
                return false;
        }
        pos += trailing + 1;
    }
    return true;
}
// Returns the length of the UTF-8 string `str` once converted to wide
// characters, as reported by MultiByteToWideChar in "query size" mode (no
// output buffer). On Windows that is a count of UTF-16 code units, so a
// character outside the Basic Multilingual Plane counts as two.
//
// Returns 0 for an empty string, for input longer than INT_MAX bytes (the
// API takes an int length), or when the API itself reports failure.
//
// An alternative that was left disabled in the original code is to convert
// with UTF8ToWide and return the size of the resulting wide string.
int UTF8StringSize(const std::string& str)
{
    if (str.empty() || str.size() > static_cast<std::size_t>(INT_MAX))
    {
        return 0;
    }
    // dwFlags and cchWideChar are integers, so pass 0 rather than NULL; a
    // zero cchWideChar asks only for the required length.
    return MultiByteToWideChar(CP_UTF8, 0, str.data(), static_cast<int>(str.size()), nullptr, 0);
}
// Returns the length in bytes of the multi-byte UTF-8 sequence introduced by
// the lead byte `chr`: 2, 3 or 4.
//
// Returns 0 when `chr` does not start a multi-byte sequence: an ASCII byte
// (0x00..0x7F), a continuation byte (0x80..0xBF), or a byte that is never a
// lead byte in modern UTF-8 (0xF8..0xFF).
//
// The byte is classified by its bit prefix only; the rest of the sequence is
// not validated.
int GetUTF8Bytes(const char chr)
{
    // `char` may be signed, in which case bytes >= 0x80 are negative and
    // would never compare greater than constants such as 0xC0. Work on the
    // unsigned value instead.
    const unsigned char lead = static_cast<unsigned char>(chr);
    if ((lead & 0xE0) == 0xC0) // 110xxxxx
        return 2;
    if ((lead & 0xF0) == 0xE0) // 1110xxxx
        return 3;
    if ((lead & 0xF8) == 0xF0) // 11110xxx
        return 4;
    return 0;
}
// Returns the prefix of the UTF-8 string `name` holding at most `need_size`
// characters, never cutting a multi-byte character in half.
// A "character" is counted at every byte that is not a continuation byte
// (10xxxxxx).
std::string UTF8StringSafeTruncate(const std::string& name, const size_t need_size)
{
    const auto is_continuation = [](char ch) {
        return (static_cast<unsigned char>(ch) & 0xc0) == 0x80;
    };

    const size_t total = name.length();
    size_t pos = 0;

    // Walk forward until `need_size` character starts have been consumed.
    for (size_t counted = 0; counted < need_size && pos < total; ++pos) {
        if (!is_continuation(name[pos])) {
            ++counted;
        }
    }

    // Take the continuation bytes that belong to the last character kept.
    while (pos < total && is_continuation(name[pos])) {
        ++pos;
    }

    return name.substr(0, pos);
}
// Returns a copy of `str` with every byte passed through std::tolower.
// The conversion is byte-wise, so only single-byte characters known to the
// current C locale are affected.
std::string ToLower_transform1(const std::string& str)
{
    std::string lowered = str;
    // std::tolower requires a value representable as unsigned char (or EOF);
    // taking the byte as unsigned char avoids handing it a negative value
    // for bytes >= 0x80 on platforms where char is signed.
    std::transform(lowered.begin(), lowered.end(), lowered.begin(),
                   [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
    return lowered;
}
// Returns a copy of `str` with every byte passed through std::toupper.
// The conversion is byte-wise, so only single-byte characters known to the
// current C locale are affected.
std::string ToUpper_transform1(const std::string& str)
{
    std::string uppered = str;
    // std::toupper requires a value representable as unsigned char (or EOF);
    // taking the byte as unsigned char avoids handing it a negative value
    // for bytes >= 0x80 on platforms where char is signed.
    std::transform(uppered.begin(), uppered.end(), uppered.begin(),
                   [](unsigned char ch) { return static_cast<char>(std::toupper(ch)); });
    return uppered;
}
// Lower-cases the UTF-8 string `str` by converting it to a wide string,
// applying std::towlower to each wide character and converting back.
// Which characters are affected depends on the current C locale.
// If either conversion reports failure, `str` is returned unchanged.
std::string ToLower_transform2(const std::string& str)
{
    std::wstring wide;
    if (!UTF8ToWide(wide, str))
    {
        return str;
    }

    // The narrow ::tolower is only defined for unsigned char values; wide
    // characters need the wide-character function.
    std::transform(wide.begin(), wide.end(), wide.begin(), [](wchar_t ch) {
        return static_cast<wchar_t>(std::towlower(static_cast<std::wint_t>(ch)));
    });

    std::string lowered;
    if (!WideToUTF8(lowered, wide))
    {
        return str;
    }
    return lowered;
}
// Upper-cases the UTF-8 string `str` by converting it to a wide string,
// applying std::towupper to each wide character and converting back.
// Which characters are affected depends on the current C locale.
// If either conversion reports failure, `str` is returned unchanged.
std::string ToUpper_transform2(const std::string& str)
{
    std::wstring wide;
    if (!UTF8ToWide(wide, str))
    {
        return str;
    }

    // The narrow ::toupper is only defined for unsigned char values; wide
    // characters need the wide-character function.
    std::transform(wide.begin(), wide.end(), wide.begin(), [](wchar_t ch) {
        return static_cast<wchar_t>(std::towupper(static_cast<std::wint_t>(ch)));
    });

    std::string uppered;
    if (!WideToUTF8(uppered, wide))
    {
        return str;
    }
    return uppered;
}
std::string ToLower_boost(const std::string& str)
{
static boost::locale::generator generator;
static std::locale locale = generator("en_US.UTF-8");
std::wstring wstr = boost::locale::conv::to_utf<wchar_t>(str, "utf-8");
wstr = boost::locale::to_lower(wstr, locale);
return boost::locale::conv::from_utf(wstr, "utf-8");
}
std::string ToUpper_boost(const std::string& str)
{
static boost::locale::generator generator;
static std::locale locale = generator("en_US.UTF-8");
std::wstring wstr = boost::locale::conv::to_utf<wchar_t>(str, "utf-8");
wstr = boost::locale::to_upper(wstr, locale);
return boost::locale::conv::from_utf(wstr, "utf-8");
}
// Demo driver: exercises the encoding helpers and the case-conversion
// variants defined in this file. Results are computed but not printed.
// Returns 1 if one of the GBK/UTF-8 conversions reports failure, 0 otherwise.
int main()
{
    // The two characters "中国" spelled as explicit GBK bytes. A raw
    // non-ASCII literal would take whatever encoding the source file and
    // compiler happen to use, so the sample might not be GBK at all.
    const std::string str_gb2312 = "\xD6\xD0\xB9\xFA";

    std::string str_utf8;
    if (!GBKToUTF8(str_utf8, str_gb2312))
    {
        return 1;
    }

    // Validation: once on the GBK bytes, once on the converted UTF-8 text.
    const bool utf8 = IsStringUTF8(str_gb2312);
    const bool utf8_2 = IsStringUTF8(str_utf8);
    const int num = UTF8StringSize(str_utf8);

    // Keep the first character only, then convert that prefix back to GBK.
    const std::string str_utf8_truncate = UTF8StringSafeTruncate(str_utf8, 1);
    std::string str_gb2312_truncate;
    if (!UTF8ToGBK(str_gb2312_truncate, str_utf8_truncate))
    {
        return 1;
    }

    // Case conversion through each of the three implementations.
    const std::string str_lower = "ABC";
    const std::string str_upper = "def";
    const std::string str_lower1 = ToLower_transform1(str_lower);
    const std::string str_upper1 = ToUpper_transform1(str_upper);
    const std::string str_lower2 = ToLower_transform2(str_lower);
    const std::string str_upper2 = ToUpper_transform2(str_upper);
    const std::string str_lower3 = ToLower_boost(str_lower);
    const std::string str_upper3 = ToUpper_boost(str_upper);

    // The scalar results are only of interest in a debugger; silence
    // unused-variable warnings.
    static_cast<void>(utf8);
    static_cast<void>(utf8_2);
    static_cast<void>(num);
    return 0;
}
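// NOTE: the lines below are web-page residue captured together with the
// listing (dates, view counters, icon links); they are not part of the program.
// 04-28
// 1693
// ![](https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png)
// 08-21
// 7634
// ![](https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png)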