chrome字符串编码转换
字符串编码转换涉及宽字节表示法与UTF-8表示法之间的转换、宽字节表示法与UTF-16表示法之间的转换、UTF-8表示法与UTF-16表示法之间的转换、UTF-16表示法与ASCII表示法之间的转换、ASCII表示法与宽字节表示法之间的转换。
相关文件
- base/strings/utf_string_conversions.h // 字符串编码转换定义
- base/strings/utf_string_conversions.cc // 字符串编码转换实现
- base/strings/utf_string_conversions_fuzzer.cc // 字符串编码转换的模糊测试（fuzzer）
方法定义
// base/strings/utf_string_conversions.h
namespace base {
// These convert between UTF-8, -16, and -32 strings. They are potentially slow,
// so avoid unnecessary conversions. The low-level versions return a boolean
// indicating whether the conversion was 100% valid. Even when it was not, they
// still do the best they can and put the result in the output buffer. The
// versions that return strings ignore this error and just return the best
// conversion possible.
//
// Every conversion below is declared as a pair of overloads:
//   - (src, src_len, output): the low-level form. Returns the validity flag
//     described above and stores the converted text through |output|.
//     NOTE(review): |src_len| is presumably a count of code units of |src|
//     (not bytes) -- confirm against the implementation.
//   - (string piece): returns the converted string by value. These are tagged
//     with WARN_UNUSED_RESULT (macro defined elsewhere; presumably makes
//     discarding the result a compiler warning -- confirm).

// Wide -> UTF-8.
BASE_EXPORT bool WideToUTF8(const wchar_t* src, size_t src_len,
std::string* output);
BASE_EXPORT std::string WideToUTF8(WStringPiece wide) WARN_UNUSED_RESULT;
// UTF-8 -> wide.
BASE_EXPORT bool UTF8ToWide(const char* src, size_t src_len,
std::wstring* output);
BASE_EXPORT std::wstring UTF8ToWide(StringPiece utf8) WARN_UNUSED_RESULT;
// Wide -> UTF-16.
BASE_EXPORT bool WideToUTF16(const wchar_t* src,
size_t src_len,
std::u16string* output);
BASE_EXPORT std::u16string WideToUTF16(WStringPiece wide) WARN_UNUSED_RESULT;
// UTF-16 -> wide.
BASE_EXPORT bool UTF16ToWide(const char16_t* src,
size_t src_len,
std::wstring* output);
BASE_EXPORT std::wstring UTF16ToWide(StringPiece16 utf16) WARN_UNUSED_RESULT;
// UTF-8 -> UTF-16.
BASE_EXPORT bool UTF8ToUTF16(const char* src,
size_t src_len,
std::u16string* output);
BASE_EXPORT std::u16string UTF8ToUTF16(StringPiece utf8) WARN_UNUSED_RESULT;
// UTF-16 -> UTF-8.
BASE_EXPORT bool UTF16ToUTF8(const char16_t* src,
size_t src_len,
std::string* output);
BASE_EXPORT std::string UTF16ToUTF8(StringPiece16 utf16) WARN_UNUSED_RESULT;
// This converts an ASCII string, typically a hardcoded constant, to a UTF16
// string. Takes the input as a StringPiece and returns the result by value.
// NOTE(review): behavior for non-ASCII input is not visible in this header --
// check the implementation before relying on it.
BASE_EXPORT std::u16string ASCIIToUTF16(StringPiece ascii) WARN_UNUSED_RESULT;
// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII
// beforehand, i.e. the caller is responsible for only passing UTF-16 text
// whose characters are all ASCII.
BASE_EXPORT std::string UTF16ToASCII(StringPiece16 utf16) WARN_UNUSED_RESULT;
// The wide <-> ASCII helpers below are only declared when WCHAR_T_IS_UTF16 is
// defined; on other configurations these two functions do not exist.
#if defined(WCHAR_T_IS_UTF16)
// This converts an ASCII string, typically a hardcoded constant, to a wide
// string. Takes the input as a StringPiece and returns the result by value.
BASE_EXPORT std::wstring ASCIIToWide(StringPiece ascii) WARN_UNUSED_RESULT;
// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII
// beforehand, i.e. the caller is responsible for only passing wide text whose
// characters are all ASCII.
BASE_EXPORT std::string WideToASCII(WStringPiece wide) WARN_UNUSED_RESULT;
#endif // defined(WCHAR_T_IS_UTF16)
// String literals must not be routed through the conversion functions in this
// file; spell them with the matching prefix instead (e.g. u"" for UTF16 or
// L"" for Wide). The array-reference overloads exist purely to turn such calls
// into compile-time errors.
//
// Guard for WideToUTF16(L"..."): a call with a const wchar_t array selects
// this template, and instantiating its body trips the static_assert, because
// an array bound N is never 0.
template <size_t N>
std::u16string WideToUTF16(const wchar_t (&str)[N]) {
  static_assert(N == 0, "Error: Use the u\"...\" prefix instead.");
  // Unreachable in any build that compiles; only satisfies the return type.
  return {};
}
// TODO(crbug.com/1189439): Also disallow passing string constants in tests.
#if !defined(UNIT_TEST)
// Guard for ASCIIToUTF16("..."): a call with a const char array (such as a
// string literal) selects this template, and instantiating its body trips the
// static_assert, because an array bound N is never 0. Callers should write
// the literal with the u"" prefix instead.
template <size_t N>
std::u16string ASCIIToUTF16(const char (&str)[N]) {
  static_assert(N == 0, "Error: Use the u\"...\" prefix instead.");
  // Unreachable in any build that compiles; only satisfies the return type.
  return {};
}
// Mutable character arrays are usually only populated during runtime. Continue
// to allow this conversion.
template <size_t N>
std::u16string ASCIIToUTF16(char (&str)[N]) {
return ASCIIToUTF16(StringPiece(str));
}
#endif
} // namespace base
方法实现
// base/strings/utf_string_conversions.cc
namespace base {
namespace {
// Code point U+FFFD, the Unicode REPLACEMENT CHARACTER.
// NOTE(review): presumably written to the output in place of input that cannot
// be decoded; its uses lie outside this excerpt -- confirm.
constexpr int32_t kErrorCodePoint{0xFFFD};
// Size coefficient ----------------------------------------------------------
// Upper bound on the number of code units in the destination encoding that a
// single code unit of the source encoding can turn into.

// Primary template: only valid when the source code unit type is strictly
// narrower than the destination one (enforced by the static_assert). Other
// directions need an explicit specialization.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  // ASCII symbols are encoded by one code unit in all encodings.
  static constexpr int value{1};

  static_assert(sizeof(DestChar) > sizeof(SrcChar),
                "Default case: from a smaller encoding to the bigger one");
};

// UTF-16 -> UTF-8.
template <>
struct SizeCoefficient<char16_t, char> {
  // A single UTF-16 code unit expands to at most 3 UTF-8 code units.
  static constexpr int value{3};
};
#if defined(WCHAR_T_IS_UTF32)
template <