Postgres2015全国用户大会将于11月20至21日在北京丽亭华苑酒店召开。本次大会嘉宾阵容强大,国内顶级PostgreSQL数据库专家将悉数到场,并特邀欧洲、俄罗斯、日本、美国等国家和地区的数据库方面专家助阵:
- Postgres-XC项目的发起人铃木市一(SUZUKI Koichi)
- Postgres-XL的项目发起人Mason Sharp
- pgpool的作者石井达夫(Tatsuo Ishii)
- PG-Strom的作者海外浩平(Kaigai Kohei)
- Greenplum研发总监姚延栋
- 周正中(德哥), PostgreSQL中国用户会创始人之一
- 汪洋,平安科技数据库技术部经理
- ……
|
|
前面转了一篇关于字符编码的文章,有兴趣的朋友可以阅读一下。
本文介绍一下PostgreSQL是如何检查多字节字符的合法性的,以及在字符集之间是如何转换的?
我们以UTF8为例,先看看它的编码规则。
UTF-8是unicode编码的实现方式之一,相比UTF-16的好处是解决了兼容ASCII码时有一个字节为0x00的问题,UTF-8它是变长编码,所以必须要有方法能够知道一个字符的字节数。
规则见下图,
当首位为0时,兼容ASCII码,即1个字节。
当首位是110时,即2个1一个0,表示这个字符占用2个字节,并且第二个字节一定是10开头。
当首位是1110时,即3个1一个0,表示这个字符占用3个字节,并且第2,3个字节一定是10开头。
当首位是11110时,即3个1一个0,表示这个字符占用4个字节,并且第2,3,4个字节一定是10开头。
Char. number range | UTF-8 octet sequence (hexadecimal) | (binary) --------------------+--------------------------------------------- 0000 0000-0000 007F | 0xxxxxxx 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
实际上UTF-8最多可以占用6个字节,不过目前应该实际在用的是4个字节。并且PG也是这么来检测的,后面会看到。
UTF-8的编码规则(占用1,2,3,4个字节时的取值范围)就决定了它的合法性检测方法。
首先是当前字符编码长度的判断,方法如下:
src/backend/utils/mb/wchar.c
/** Return the byte length of a UTF8 character pointed to by s** Note: in the current implementation we do not support UTF8 sequences* of more than 4 bytes; hence do NOT return a value larger than 4.* We return "1" for any leading byte that is either flat-out illegal or* indicates a length larger than we support.** pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps* other places would need to be fixed to change this.*/intpg_utf_mblen(const unsigned char *s){int len;
if ((*s & 0x80) == 0) // 即10000000 & ? == 0len = 1;else if ((*s & 0xe0) == 0xc0) // 即11100000 & ? == 11000000len = 2;else if ((*s & 0xf0) == 0xe0) // 即11110000 & ? == 11100000len = 3;else if ((*s & 0xf8) == 0xf0) // 即11111000 & ? == 11110000len = 4;#ifdef NOT_USED // 6个字节不可用,在PostgreSQL中没有定义NOT_USED宏,所以以下代码忽略。else if ((*s & 0xfc) == 0xf8)len = 5;else if ((*s & 0xfe) == 0xfc)len = 6;#endifelse // 其他则返回长度1,让下面的校验程序去完成校验。len = 1;return len;}
下面是校验合法性的程序。
static intpg_utf8_verifier(const unsigned char *s, int len){int l = pg_utf_mblen(s);
if (len < l)return -1;
if (!pg_utf8_islegal(s, l))return -1;
return l;}
合法性的检查需要结合字符长度。
/** Check for validity of a single UTF-8 encoded character** This directly implements the rules in RFC3629. The bizarre-looking* restrictions on the second byte are meant to ensure that there isn't* more than one encoding of a given Unicode character point; that is,* you may not use a longer-than-necessary byte sequence with high order* zero bits to represent a character that would fit in fewer bytes.* To do otherwise is to create security hazards (eg, create an apparent* non-ASCII character that decodes to plain ASCII).** length is assumed to have been obtained by pg_utf_mblen(), and the* caller must have checked that that many bytes are present in the buffer.*/boolpg_utf8_islegal(const unsigned char *source, int length){unsigned char a;
switch (length){default:/* reject lengths 5 and 6 for now */return false;case 4: // 当前字符的编码长度是4时,最后一个字节的取值范围0x80 ~ 0xBF (即10xxxxxx的取值区间)a = source[3];if (a < 0x80 || a > 0xBF)return false;/* FALL THRU */case 3: // 当前字符的编码长度是3时,最后一个字节的取值范围0x80 ~ 0xBF (即10xxxxxx的取值区间)a = source[2];if (a < 0x80 || a > 0xBF)return false;/* FALL THRU */case 2: // 当前字符的编码长度是2时,需根据最后一个字节的值来判断第一个字节的取值范围。a = source[1];switch (*source){case 0xE0:if (a < 0xA0 || a > 0xBF)return false;break;case 0xED:if (a < 0x80 || a > 0x9F)return false;break;case 0xF0:if (a < 0x90 || a > 0xBF)return false;break;case 0xF4:if (a < 0x80 || a > 0x8F)return false;break;default:if (a < 0x80 || a > 0xBF)return false;break;}/* FALL THRU */case 1:a = *source;if (a >= 0x80 && a < 0xC2)return false;if (a > 0xF4)return false;break;}return true;}
每种字符集的编码规则不同,所以校验方法也不同。这里不一一列举。
PostgreSQL字符集的转换方法。
postgres=# \df convert*List of functionsSchema | Name | Result data type | Argument data types | Type------------+--------------+------------------+---------------------+--------pg_catalog | convert | bytea | bytea, name, name | normalpg_catalog | convert_from | text | bytea, name | normalpg_catalog | convert_to | bytea | text, name | normal(3 rows)
我们使用bytea和转换函数来验证以上检测程序。
postgres=# \set VERBOSITY verbosepostgres=# select convert_from(bytea '\x80', 'UTF8');ERROR: 22021: invalid byte sequence for encoding "UTF8": 0x80LOCATION: report_invalid_encoding, wchar.c:2011postgres=# select convert_from(bytea '\xE28080', 'UTF8');convert_from--------------?(1 row)postgres=# select convert_from(bytea '\xE08080', 'UTF8');ERROR: 22021: invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80LOCATION: report_invalid_encoding, wchar.c:2011
最终调用的是以下函数:
pg_catalog | ascii_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | ascii_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | big5_to_euc_tw | void | integer, integer, cstring, internal, integer | normalpg_catalog | big5_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | big5_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_cn_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_cn_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_jis_2004_to_shift_jis_2004 | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_jis_2004_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_jp_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_jp_to_sjis | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_jp_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_kr_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_kr_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_tw_to_big5 | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_tw_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | euc_tw_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | gb18030_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | gbk_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | iso8859_1_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | iso8859_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | iso_to_koi8r | void | integer, integer, cstring, internal, integer | normalpg_catalog | iso_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | iso_to_win1251 | void | integer, integer, cstring, internal, integer | normalpg_catalog | iso_to_win866 | void | integer, integer, cstring, internal, integer | normalpg_catalog | johab_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | koi8r_to_iso | void | integer, integer, cstring, internal, integer | normalpg_catalog | koi8r_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | koi8r_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | koi8r_to_win1251 | void | integer, integer, cstring, internal, integer | normalpg_catalog | koi8r_to_win866 | void | integer, integer, cstring, internal, integer | normalpg_catalog | koi8u_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | latin1_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | latin2_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | latin2_to_win1250 | void | integer, integer, cstring, internal, integer | normalpg_catalog | latin3_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | latin4_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_ascii | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_big5 | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_euc_cn | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_euc_jp | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_euc_kr | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_euc_tw | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_iso | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_koi8r | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_latin1 | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_latin2 | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_latin3 | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_latin4 | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_sjis | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_win1250 | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_win1251 | void | integer, integer, cstring, internal, integer | normalpg_catalog | mic_to_win866 | void | integer, integer, cstring, internal, integer | normalpg_catalog | shift_jis_2004_to_euc_jis_2004 | void | integer, integer, cstring, internal, integer | normalpg_catalog | shift_jis_2004_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | sjis_to_euc_jp | void | integer, integer, cstring, internal, integer | normalpg_catalog | sjis_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | sjis_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | uhc_to_utf8 | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_ascii | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_big5 | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_euc_cn | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_euc_jis_2004 | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_euc_jp | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_euc_kr | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_euc_tw | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_gb18030 | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_gbk | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_iso8859 | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_iso8859_1 | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_johab | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_koi8r | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_koi8u | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_shift_jis_2004 | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_sjis | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_uhc | void | integer, integer, cstring, internal, integer | normalpg_catalog | utf8_to_win | void | integer, integer, cstring, internal, integer | normalpg_catalog | win1250_to_latin2 | void | integer, integer, cstring, internal, integer | normalpg_catalog | win1250_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | win1251_to_iso | void | integer, integer, cstring, internal, integer | normalpg_catalog | win1251_to_koi8r | void | integer, integer, cstring, internal, integer | normalpg_catalog | win1251_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | win1251_to_win866 | void | integer, integer, cstring, internal, integer | normalpg_catalog | win866_to_iso | void | integer, integer, cstring, internal, integer | normalpg_catalog | win866_to_koi8r | void | integer, integer, cstring, internal, integer | normalpg_catalog | win866_to_mic | void | integer, integer, cstring, internal, integer | normalpg_catalog | win866_to_win1251 | void | integer, integer, cstring, internal, integer | normalpg_catalog | win_to_utf8 | void | integer, integer, cstring, internal, integer | normal
代码在 :
src/backend/utils/mb/conversion_procs
PostgreSQL服务端支持的字符集,以及客户端支持的字符集。
[参考]
src/backend/utils/mb/conversion_procs
src/include/mb/pg_wchar.h
src/backend/utils/mb/wchar.c