PostgreSQL 黑科技-递归二分法切分汉字

最新推荐文章于 2023-08-29 14:53:05 发布

kmblack1

最新推荐文章于 2023-08-29 14:53:05 发布

阅读量760

点赞数 1

文章标签： PostgreSQL 黑科技模糊查询全文检索正则表达式

本文链接：https://blog.csdn.net/kmblack1/article/details/78929862

版权

本文是PostgreSQL-模糊查询的改进版本，改进的地方是把数字、英文单词单独提取出来，不做切分处理。

1.创建清除文本中的标点符号函数

目前我能想到的标点符号是以下这些，欢迎补充。

-- Replaces every punctuation character in $1 with a single space.
-- Spaces, underscores and the hyphen-minus are deliberately NOT replaced
-- (underscores belong to identifiers, the hyphen-minus to negative numbers).
-- Covers ASCII punctuation plus common full-width CJK punctuation.
-- Returns NULL for NULL input (the function is declared strict).
drop function if exists clear_punctuation(text);
create or replace function clear_punctuation(text)
  returns text
as $$
    -- Inside the bracket expression only \ [ and ] need escaping; '' is one
    -- literal single quote. No '-' appears, so no accidental range is formed.
    select regexp_replace(
        $1,
        '[~`!@#$%^&*()+=|\\\[\]{};:"'',<.>/?：。；，“”（）、？《》！‘’【】…—]',
        ' ',
        'g'
    );
$$ language sql strict immutable;

2.黑科技-递归二分法切分汉字

汉字每两个字符作为一个词处理。例如“调用函数”经过切分后，结果为3个词，分别是“调用、用函、函数”。

/*
* Split a phone number into overlapping 3-character windows.
* $1: the complete phone number
* Purpose: when searching for a phone number people often remember only a few
* of its digits. Two-character keywords match too many rows, so every window
* of 3 characters is used as a keyword instead: remembering any 3 consecutive
* digits is enough to find all records containing them.
* Returns one row per window, duplicates included; inputs shorter than
* 3 characters (or NULL) yield no rows.
*/
drop function if exists split_mobile(text);
create or replace function split_mobile(text)
    returns table(val text)
as $$
	-- Each row holds the current window plus the start position of the next one.
	-- The walk stops as soon as a window comes back shorter than 3 characters.
	with recursive win(next_start, gram) as (
		select 2, substring($1 from 1 for 3)
		union all
		select w.next_start + 1,
		       substring($1 from w.next_start for 3)
		from win as w
		where char_length(w.gram) = 3
	)
	select gram
	from win
	where char_length(gram) = 3;  -- drop the short trailing window
$$ language sql;

/*
* Split a run of Chinese characters into overlapping 2-character words.
* $1: the text to split
* Every two adjacent characters form one word. For example the four-character
* input in the test below produces 3 words: characters 1-2, 2-3 and 3-4.
* Returns one row per word, duplicates included; inputs shorter than
* 2 characters (or NULL) yield no rows.
*/
drop function if exists dichotomy_split_sql(text);
create or replace function dichotomy_split_sql(text)
    returns table(val text)
as $$
	-- Each row holds the current pair plus the start position of the next one.
	-- The walk stops as soon as a pair comes back shorter than 2 characters.
	with recursive pairs(next_start, bigram) as (
		select 2, substring($1 from 1 for 2)
		union all
		select p.next_start + 1,
		       substring($1 from p.next_start for 2)
		from pairs as p
		where char_length(p.bigram) = 2
	)
	select bigram
	from pairs
	where char_length(bigram) = 2;  -- drop the short trailing pair
$$ language sql;

3.创建数组转换为行函数

-- Expands a text array into a row set, one row per array element.
-- $1: the array to expand; an empty or NULL array yields no rows.
drop function if exists array_to_rows(text[]);
create or replace function array_to_rows(text[])
    returns table(val text)
as $$
	select elem.val
	from unnest($1) as elem(val);
$$ language sql;

4.提取数字、英文单词和切分汉字结果

/*
* Build the distinct, sorted list of search tokens for $1.
* $1: the text to tokenise (normally already passed through clear_punctuation)
* Three kinds of tokens are produced:
*   1. every 3-character window of each 11-digit run starting with 1
*      (treated as a mobile number), via split_mobile;
*   2. every whole number (with an optional leading minus) and every run of
*      ASCII letters/underscores, kept unsplit;
*   3. every 2-character window of the remaining text, via dichotomy_split_sql.
* Any whitespace (not only the plain space) separates runs of remaining text,
* so tabs and line breaks never end up inside a token.
* Returns no rows for NULL or token-free input.
*/
drop function if exists get_matche_results(text);
create or replace function get_matche_results(text)
returns table(val text)
as $$
	with phone_grams as (
		-- the pattern has no capture group, so hit[1] is the whole match
		select g.val
		from regexp_matches($1, '1\d{10}', 'g') as m(hit)
		cross join lateral split_mobile(m.hit[1]) as g(val)
	), words as (
		-- numbers and English words are kept whole
		select m.hit[1] as val
		from regexp_matches($1, '-?\d+|[a-zA-Z_]+', 'g') as m(hit)
	), text_grams as (
		-- what is left between numbers, words and whitespace is cut into
		-- bigrams; pieces shorter than 2 characters contribute nothing
		select g.val
		from regexp_split_to_table($1, '-?\d+|[a-zA-Z_\s]+') as s(piece)
		cross join lateral dichotomy_split_sql(s.piece) as g(val)
	)
	select t.val
	from (
		select val from phone_grams
		union all
		select val from words
		union all
		select val from text_grams
	) as t
	group by t.val   -- de-duplicate
	order by t.val;
$$ language sql;

-- Usage: strip punctuation first, then list the tokens extracted from the text.
select tokens.val
from get_matche_results(
	clear_punctuation('调用函数 SRF_PERCALL_SETUP() and work 为使用FuncCallContext做恰当的设置以及清除-2239768任何前面的轮9987回里面身下的已返回的数据。13000000000')
) as tokens(val);

5.转换为tsvector

-- Converts text into a tsvector whose lexemes are the tokens returned by
-- get_matche_results($1), taken verbatim (no text-search configuration is applied).
-- $1: the text to index
drop function if exists dichotomy_split_tsv(text);
create or replace function dichotomy_split_tsv(text)
    returns tsvector
as $$
	select array_to_tsvector(array_agg(tok.val))
	from get_matche_results($1) as tok(val);
$$ language sql;

-- Usage: strip punctuation first, then build the tsvector for the text.
select dichotomy_split_tsv(
	clear_punctuation('调用函数 SRF_PERCALL_SETUP() and work 为使用FuncCallContext做恰当的设置以及清除-2239768任何前面的轮9987回里面身下的已返回的数据。13000000000')
);

6.转换为tsquery

/*
* Converts text into a tsquery built from the tokens of get_matche_results($1).
* $1: the search text
* $2: true (default) joins the tokens with & (all must match);
*     otherwise they are joined with | (any may match)
* Every token is written as a quoted lexeme, with embedded single quotes and
* backslashes doubled, so a token containing tsquery syntax characters or
* whitespace is taken literally instead of breaking the cast.
* Returns NULL when $1 yields no tokens.
*/
drop function if exists dichotomy_split_tsq(text,boolean);
create or replace function dichotomy_split_tsq(text,boolean default true)
    returns tsquery
as $$
	select string_agg(
	           -- E'' literals keep the backslash handling independent of
	           -- the standard_conforming_strings setting
	           '''' || replace(replace(tok.val, E'\\', E'\\\\'), '''', '''''') || '''',
	           (case when $2 then '&' else '|' end)
	       )::tsquery
	from get_matche_results($1) as tok(val);
$$ language sql;

-- Usage: strip punctuation first, then build the tsquery for the text
-- (second argument omitted, so its default applies).
select dichotomy_split_tsq(
	clear_punctuation('调用函数 SRF_PERCALL_SETUP() and work 为使用FuncCallContext做恰当的设置以及清除-2239768任何前面的轮9987回里面身下的已返回的数据。13000000000')
);