/*
* Encoding.hpp
*
* Created on: 2023年10月12日
* Author: Ray
*/
#ifndef SRC_ENCODING_HPP_
#define SRC_ENCODING_HPP_
#include <string>
#include <iconv.h>
#include<cstring>
#include <iostream>
#include <sstream>
#include <iomanip>
namespace Ecode
{
/// Convert a UTF-8 encoded string to GBK using iconv.
///
/// @param utf8_str  Input bytes, interpreted as UTF-8. Embedded NUL bytes are
///                  converted like any other character.
/// @return The GBK encoded bytes. An empty string is returned when the input
///         is empty, when the iconv descriptor cannot be opened, or when
///         iconv reports a conversion error (e.g. malformed UTF-8).
inline std::string utf8_to_gbk(const std::string& utf8_str) {
    if (utf8_str.empty()) {
        return "";
    }

    // Allocate the output buffer before opening the descriptor so that a
    // failed allocation cannot leak it. Two output bytes per input byte is
    // the same upper bound the original code reserved.
    std::string gbk_buf(utf8_str.size() * 2, '\0');

    iconv_t cd = iconv_open("GBK", "UTF-8");
    if (cd == (iconv_t)-1) {
        return ""; // iconv could not be initialised
    }

    // iconv takes a non-const input pointer but does not modify the input.
    char* in_ptr = const_cast<char*>(utf8_str.data());
    size_t in_left = utf8_str.size();
    char* out_ptr = &gbk_buf[0];
    size_t out_left = gbk_buf.size();

    const size_t rc = iconv(cd, &in_ptr, &in_left, &out_ptr, &out_left);
    iconv_close(cd);
    if (rc == static_cast<size_t>(-1)) {
        return ""; // conversion failed
    }

    // iconv decrements out_left by the number of bytes it produced, so the
    // result length is the capacity minus what is left. Using the length
    // (rather than a C-string scan) keeps embedded NUL bytes intact.
    gbk_buf.resize(gbk_buf.size() - out_left);
    return gbk_buf;
}
/// Convert a GBK encoded string to UTF-8 using iconv.
///
/// @param gbk_str  Input bytes, interpreted as GBK. Embedded NUL bytes are
///                 converted like any other character.
/// @return The UTF-8 encoded bytes. An empty string is returned when the
///         input is empty, when the iconv descriptor cannot be opened, or
///         when iconv reports a conversion error (e.g. a truncated
///         double-byte character).
inline std::string gbk_to_utf8(const std::string& gbk_str) {
    if (gbk_str.empty()) {
        return "";
    }

    // Allocate the output buffer before opening the descriptor so that a
    // failed allocation cannot leak it. Three output bytes per input byte is
    // the same upper bound the original code reserved.
    std::string utf8_buf(gbk_str.size() * 3, '\0');

    iconv_t cd = iconv_open("UTF-8", "GBK");
    if (cd == (iconv_t)-1) {
        return ""; // iconv could not be initialised
    }

    // iconv takes a non-const input pointer but does not modify the input.
    char* in_ptr = const_cast<char*>(gbk_str.data());
    size_t in_left = gbk_str.size();
    char* out_ptr = &utf8_buf[0];
    size_t out_left = utf8_buf.size();

    const size_t rc = iconv(cd, &in_ptr, &in_left, &out_ptr, &out_left);
    iconv_close(cd);
    if (rc == static_cast<size_t>(-1)) {
        return ""; // conversion failed
    }

    // iconv decrements out_left by the number of bytes it produced, so the
    // result length is the capacity minus what is left. Using the length
    // (rather than strlen) keeps embedded NUL bytes intact.
    utf8_buf.resize(utf8_buf.size() - out_left);
    return utf8_buf;
}
/// Encode a wide string as UTF-8, one code point per wchar_t element.
///
/// Values 0..0xFFFF are encoded as 1, 2 or 3 bytes exactly as before
/// (surrogate values are not combined; each is encoded on its own).
/// Where wchar_t is wider than 16 bits, values 0x10000..0x10FFFF are encoded
/// with the 4-byte form, and anything outside the Unicode range (including
/// negative values on platforms with a signed wchar_t) is replaced by
/// U+FFFD so the output never contains invalid lead bytes.
///
/// @param ucs2_str  Wide string to encode.
/// @return The UTF-8 encoded bytes.
inline std::string ucs2_to_utf8(const std::wstring& ucs2_str) {
    std::string utf8_str;
    utf8_str.reserve(ucs2_str.size());
    for (const wchar_t unit : ucs2_str) {
        // unsigned long is at least 32 bits; a negative wchar_t converts to
        // a value above 0x10FFFF and lands in the replacement branch.
        const unsigned long cp = static_cast<unsigned long>(unit);
        if (cp <= 0x7FUL) {
            // single byte
            utf8_str += static_cast<char>(cp);
        } else if (cp <= 0x7FFUL) {
            // two bytes
            utf8_str += static_cast<char>(0xC0 | (cp >> 6));
            utf8_str += static_cast<char>(0x80 | (cp & 0x3F));
        } else if (cp <= 0xFFFFUL) {
            // three bytes
            utf8_str += static_cast<char>(0xE0 | (cp >> 12));
            utf8_str += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            utf8_str += static_cast<char>(0x80 | (cp & 0x3F));
        } else if (cp <= 0x10FFFFUL) {
            // four bytes (only reachable when wchar_t is wider than 16 bits)
            utf8_str += static_cast<char>(0xF0 | (cp >> 18));
            utf8_str += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
            utf8_str += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            utf8_str += static_cast<char>(0x80 | (cp & 0x3F));
        } else {
            // not a Unicode code point: emit U+FFFD REPLACEMENT CHARACTER
            utf8_str += "\xEF\xBF\xBD";
        }
    }
    return utf8_str;
}
/// Decode a UTF-8 string into a wide string, one wchar_t per code point.
///
/// Only 1-, 2- and 3-byte sequences (code points up to U+FFFF) are
/// supported. An empty wide string is returned for the whole input when it
/// contains a 4-byte sequence, a stray continuation byte, a sequence that is
/// cut off by the end of the string, or a lead byte that is not followed by
/// the required 10xxxxxx continuation bytes.
///
/// @param utf8_str  UTF-8 encoded input.
/// @return The decoded wide string, or L"" on unsupported/malformed input.
inline std::wstring utf8_to_ucs2(const std::string& utf8_str) {
    const size_t length = utf8_str.size();

    // True when the byte at `pos` exists and has the 10xxxxxx form. The
    // bounds check is what prevents reading past the end of the string on a
    // truncated multi-byte sequence.
    const auto is_continuation = [&utf8_str, length](size_t pos) {
        return pos < length &&
               (static_cast<unsigned char>(utf8_str[pos]) & 0xC0) == 0x80;
    };
    // Low six payload bits of the continuation byte at `pos`.
    const auto payload = [&utf8_str](size_t pos) {
        return static_cast<unsigned int>(
            static_cast<unsigned char>(utf8_str[pos]) & 0x3F);
    };

    std::wstring ucs2_str;
    ucs2_str.reserve(length);
    size_t i = 0;
    while (i < length) {
        const unsigned int lead = static_cast<unsigned char>(utf8_str[i]);
        if (lead <= 0x7F) {
            // single byte
            ucs2_str += static_cast<wchar_t>(lead);
            i += 1;
        } else if ((lead & 0xE0) == 0xC0) {
            // two bytes
            if (!is_continuation(i + 1)) {
                return L"";
            }
            ucs2_str += static_cast<wchar_t>(((lead & 0x1F) << 6) | payload(i + 1));
            i += 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // three bytes
            if (!is_continuation(i + 1) || !is_continuation(i + 2)) {
                return L"";
            }
            ucs2_str += static_cast<wchar_t>(((lead & 0x0F) << 12) |
                                             (payload(i + 1) << 6) |
                                             payload(i + 2));
            i += 3;
        } else {
            // Unsupported lead byte (4-byte sequence or stray continuation
            // byte): the whole conversion yields an empty string.
            return L"";
        }
    }
    return ucs2_str;
}
/// Demonstration routine: prints, as hex bytes on std::cout, a sample string
/// in its compiler-provided UTF-8 and wide forms and the results of running
/// it through utf8_to_ucs2, ucs2_to_utf8, utf8_to_gbk and gbk_to_utf8.
///
/// Declared inline because it is defined in a header; without it, including
/// this header from more than one translation unit is a multiple-definition
/// error. The stream's format flags and fill character are restored before
/// returning so later output on std::cout is not left in hex mode.
inline void exampleEncode(){
    const std::ios_base::fmtflags saved_flags = std::cout.flags();
    const char saved_fill = std::cout.fill();

    // Print one byte as "0xNN ".
    const auto print_hex_byte = [](unsigned char byte) {
        std::cout << "0x" << std::hex << std::setw(2) << std::setfill('0')
                  << static_cast<int>(byte) << " ";
    };
    // Print every byte of a narrow string, then end the line.
    const auto dump_bytes = [&print_hex_byte](const std::string& bytes) {
        for (const char ch : bytes) {
            print_hex_byte(static_cast<unsigned char>(ch));
        }
        std::cout << std::endl;
    };
    // Print bits 8..15 and then bits 0..7 of every wide character, then end
    // the line.
    const auto dump_wide = [&print_hex_byte](const std::wstring& units) {
        for (const wchar_t unit : units) {
            print_hex_byte(static_cast<unsigned char>(unit >> 8));
            print_hex_byte(static_cast<unsigned char>(unit & 0xFF));
        }
        std::cout << std::endl;
    };

    // Sample text as a UTF-8 literal produced by the compiler.
    std::string utf8_str = u8"<你好>";
    std::cout << " 编译器UTF-8 编码字符串: ";
    dump_bytes(utf8_str);

    // The same text as a wide literal produced by the compiler.
    std::wstring ucs2_string = L"<你好>";
    std::cout << " 编译器UCS-2 编码字符串: ";
    dump_wide(ucs2_string);

    // UTF-8 -> UCS-2
    std::wstring ucs2_str = Ecode::utf8_to_ucs2(utf8_str);
    std::cout << " UTF-8转UCS-2 编码的字符串: ";
    dump_wide(ucs2_str);

    // UCS-2 -> UTF-8
    std::string utf8_string = Ecode::ucs2_to_utf8(ucs2_string);
    std::cout << " UCS-2转UTF-8 编码字符串: ";
    dump_bytes(utf8_string);

    // UTF-8 -> GBK
    std::string gbk_string = Ecode::utf8_to_gbk(utf8_string);
    std::cout << " UTF-8转GBK 编码字符串: ";
    dump_bytes(gbk_string);

    // GBK -> UTF-8
    std::string utf8_roundtrip = Ecode::gbk_to_utf8(gbk_string);
    std::cout << " GBK转UTF-8 编码字符串: ";
    dump_bytes(utf8_roundtrip);

    std::cout.flags(saved_flags);
    std::cout.fill(saved_fill);
}
}
#endif /* SRC_ENCODING_HPP_ */
// Common C++ encoding conversions; the UTF-8 <-> GBK conversions depend on the iconv library.
// First published 2023-10-12 17:01:30.