Unicode 11.0 字符集
参考文献
下载标准
CJK Extension A汉字范围 U+3400-4DBF
CJK Extension B汉字范围 U+20000-2A6DF
CJK Extension C汉字范围 U+2A700-2B73F
CJK Extension D汉字范围 U+2B740-2B81F
CJK Extension E汉字范围 U+2B820-2CEAF
CJK Extension F汉字范围 U+2CEB0-2EBEF
以上各扩展区实际已编码的汉字共计66899个(按各区最后一个已分配码位计算,而非区块末尾)
-- Compute, for each CJK extension range, its first code point (f1), its last
-- code point (f2) and the number of code points it spans (char_count).
-- Rows are listed in the order Extension A through Extension F.
-- NOTE(review): the upper bounds are lower than the block ends quoted in the
-- notes (e.g. 4DB5 vs 4DBF); presumably they are the last assigned code
-- points in Unicode 11.0 -- confirm against the standard.
with cte (f1, f2) as (
    values
        (x'00003400'::bit(32)::integer, x'00004DB5'::bit(32)::integer),  -- CJK Extension A
        (x'00020000'::bit(32)::integer, x'0002A6D6'::bit(32)::integer),  -- CJK Extension B
        (x'0002A700'::bit(32)::integer, x'0002B734'::bit(32)::integer),  -- CJK Extension C
        (x'0002B740'::bit(32)::integer, x'0002B81D'::bit(32)::integer),  -- CJK Extension D
        (x'0002B820'::bit(32)::integer, x'0002CEA1'::bit(32)::integer),  -- CJK Extension E
        (x'0002CEB0'::bit(32)::integer, x'0002EBE0'::bit(32)::integer)   -- CJK Extension F
)
select
    f1,
    f2,
    f2 - f1 + 1 as char_count  -- both bounds are inclusive, hence the +1
from cte
order by f1;  -- deterministic output: ascending by first code point
-- Generate one row per code point (i) together with its character (f1) for
-- CJK Extensions A through D, then count the generated rows.
-- NOTE(review): chr() on code points this large presumably requires a UTF8
-- database encoding -- confirm for the target server.
with code_point_range (first_cp, last_cp) as (
    values
        (x'00003400'::bit(32)::integer, x'00004DB5'::bit(32)::integer),  -- CJK Extension A
        (x'00020000'::bit(32)::integer, x'0002A6D6'::bit(32)::integer),  -- CJK Extension B
        (x'0002A700'::bit(32)::integer, x'0002B734'::bit(32)::integer),  -- CJK Extension C
        (x'0002B740'::bit(32)::integer, x'0002B81D'::bit(32)::integer)   -- CJK Extension D
        -- CJK Extension E and CJK Extension F contain many characters that are
        -- not widely supported, so they are left out; uncomment to include them:
        -- , (x'0002B820'::bit(32)::integer, x'0002CEA1'::bit(32)::integer)  -- CJK Extension E
        -- , (x'0002CEB0'::bit(32)::integer, x'0002EBE0'::bit(32)::integer)  -- CJK Extension F
),
cte as (
    select
        cp      as i,
        chr(cp) as f1
    from code_point_range as r
    cross join lateral generate_series(r.first_cp, r.last_cp) as cp
)
select count(*)
from cte;
-- The reduced set (Extensions A through D) totals 53664 characters.
实际的汉字正则表达式(\u只接受4位十六进制,BMP之外的码位需写成8位十六进制的\Uxxxxxxxx;字符类[]内的范围直接相连,不能用|分隔,否则|会被当作普通字符匹配)
[\u3400-\u4db5\U00020000-\U0002a6d6\U0002a700-\U0002b734\U0002b740-\U0002b81d\U0002b820-\U0002cea1\U0002ceb0-\U0002ebe0]+
([\u3400-\u4db5\U00020000-\U0002a6d6\U0002a700-\U0002b734\U0002b740-\U0002b81d\U0002b820-\U0002cea1\U0002ceb0-\U0002ebe0]+)
#精简版本
[\u3400-\u4db5\U00020000-\U0002a6d6\U0002a700-\U0002b734\U0002b740-\U0002b81d]+
([\u3400-\u4db5\U00020000-\U0002a6d6\U0002a700-\U0002b734\U0002b740-\U0002b81d]+)
绝大部分浏览器和系统能显示的汉字范围为
([\u3007\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA2D]+)