SqlServer-分词

Jackie_Mina

已于 2023-01-31 17:55:44 修改

阅读量817

点赞数 10

分类专栏：工作日常 SqlServer 文章标签： sqlserver 中文分词

于 2022-06-06 13:44:16 首次发布

本文链接：https://blog.csdn.net/weixin_36752088/article/details/125141424

版权

工作日常同时被 2 个专栏收录

5 篇文章 1 订阅

订阅专栏

SqlServer

1 篇文章 0 订阅

订阅专栏

字符串根据词性拆分分词

在工作中，有时需要将人工填写的地址与数据库中的地址进行匹配，有时需要对地址进行拆分、规整。但多数情况下，人工填写的地址并不规范，因此需要一个根据词性拆分字符串的方法，在拆分时起到辅助作用。
如：南塘浜路11-15号 (虚拟)

序号	分词	词性	位置
1	南塘浜路	cn	1
2	11	num	5
3	-	symbol	7
4	15	num	8
5	号	cn	10

通过拆分，可以获得“-”前后的数字和文本，这样也便于将字符串规整为连续型地址，如：南塘浜路11号，南塘浜路12号…南塘浜路15号等。注：本文仅介绍拆分成分词的方法。

1/思路

(拿中文文本举例)如下表所示，先判断每个字符的词性：如果该字符不是中文，则不参与中文字符的重新排序；如果是中文，则用“字符原来的位置 - 该字符在中文字符中的序号 + 1”得到归类值。连续的中文字符归类值相同，因此按归类值分组，就可以把这些字符拼接成一个分词。

字符	原顺序	中文字符再次排序	分词归类
南	1	1	1(=1-1+1)
塘	2	2	1(=2-2+1)
浜	3	3	1(=3-3+1)
路	4	4	1(=4-4+1)
1	5	-	-
1	6	-	-
-	7	-	-
1	8	-	-
5	9	-	-
号	10	5	6(=10-5+1)

2/创建表值函数-生成序列表

给字符串生成序列表，这样就获得了字符串原来的位置

/*
    dbo.fn_serial_numer: inline table-valued number generator.

    Parameter:
        @n  how many rows to return.

    Returns:
        One column, n, holding the consecutive integers 1 .. @n.

    The row source is built purely from cross joins of a two-row seed, so no
    physical table is read: 2 rows -> 16 -> 65,536 -> 4,294,967,296 candidate
    rows, of which TOP (@n) keeps the first @n after numbering.
*/
create function [dbo].[fn_serial_numer]
(@n int)
returns table as
return
with seed_2 as (
    select 1 as x
    union all
    select 1
)
,rows_16 as (
    select 1 as x
    from seed_2 as s1
    cross join seed_2 as s2
    cross join seed_2 as s3
    cross join seed_2 as s4
)
,rows_65536 as (
    select 1 as x
    from rows_16 as r1
    cross join rows_16 as r2
    cross join rows_16 as r3
    cross join rows_16 as r4
)
,rows_4g as (
    select 1 as x
    from rows_65536 as lo
    cross join rows_65536 as hi
)
select top (@n)
       row_number() over (order by (select 1)) as n
from rows_4g
order by n

-- Demo: explode a string into one row per character together with its
-- 1-based position. The N prefix keeps the Chinese characters intact whatever
-- the database code page is, and the outer ORDER BY makes the displayed row
-- order deterministic (a query without ORDER BY has no guaranteed order).
select substring(src.addr, seq.n, 1) as ch /* single character */
      ,seq.n                               /* original position */
from (select N'南塘浜路11-15号' as addr) as src
cross apply dbo.fn_serial_numer(len(src.addr)) as seq
order by seq.n

ch	n
南	1
塘	2
浜	3
路	4
1	5
1	6
-	7
1	8
5	9
号	10

3/创建表值函数-字符串根据词性拆分成分词

/*
    dbo.fn_split_by_property: split a string into tokens by character class.

    Every character of @str is classified as one of
        'cn'      unicode code point 19968..40869 (U+4E00..U+9FA5)
        'num'     matches [0-9]
        'en'      matches [a-zA-Z]
        'symbol'  anything else
    and each maximal run of consecutive characters of the same class becomes
    one token.

    Parameter:
        @str  the text to split.

    Returns one row per token:
        pkid      sequence number of the token, in order of appearance (1-based)
        keys      the token text, taken verbatim from @str
        property  the class of the token ('cn' / 'num' / 'en' / 'symbol')
        indexs    1-based position of the token's first character in @str

    NULL or empty input returns an empty table.

    Implementation notes:
      * Tokens are found with the gaps-and-islands technique: inside one class,
        (position - row_number within the class) is constant for consecutive
        characters, so grouping by (class, that difference) yields the runs.
      * The start position and the text of a token come straight from the run
        (min(n), substring(@str, min(n), count(*))). Nothing is re-located with
        CHARINDEX and nothing is concatenated through FOR XML, so repeated
        tokens, tokens that are substrings of other tokens, spaces and
        characters such as & < > are all reported with their exact text and
        position.
      * The string length is taken with LEN(), which ignores trailing spaces,
        so trailing spaces of @str do not produce a token.
      * Depends on dbo.fn_serial_numer to produce the positions 1 .. LEN(@str).

    NOTE(review): @str and keys are varchar; characters that the database code
    page cannot represent are already lost before this function runs. Consider
    nvarchar if the function must work on such databases.
*/
create function  [dbo].[fn_split_by_property]
(
@str varchar(max)
)

returns  @table table(
pkid int/* sequence number */
,keys varchar(max)/* token text */
,property varchar(32)/* character class of the token */
,indexs int/* start position in @str */)
as
begin

    -- Nothing to split: return the empty result table.
    if @str is null or len(@str) = 0
        return;

    with chars as (
        -- One row per character with its original 1-based position.
        select substring(@str, seq.n, 1) as ch
              ,seq.n
        from dbo.fn_serial_numer(len(@str)) as seq
    )
    ,classified as (
        -- Assign exactly one class to every position.
        select c.n
              ,property = case
                              when unicode(c.ch) between 19968 and 40869 then 'cn'
                              when c.ch like '[0-9]'                     then 'num'
                              when c.ch like '[a-zA-Z]'                  then 'en'
                              else 'symbol'
                          end
        from chars as c
    )
    ,islands as (
        -- Consecutive positions of the same class share the same grp value.
        select k.n
              ,k.property
              ,grp = k.n - row_number() over (partition by k.property order by k.n)
        from classified as k
    )
    ,tokens as (
        -- Collapse every run into its start position and length.
        select i.property
              ,start_pos = min(i.n)
              ,token_len = count(*)
        from islands as i
        group by i.property, i.grp
    )
    insert into @table
            ( pkid, keys, property, indexs )
    select row_number() over (order by t.start_pos)
          ,substring(@str, t.start_pos, t.token_len)
          ,t.property
          ,t.start_pos
    from tokens as t

    return

end

-- Demo: split an address list into tokens. Columns are listed explicitly and
-- the rows are ordered by pkid so the output always appears in token order.
select pkid, keys, property, indexs
from dbo.fn_split_by_property('南塘浜路9009号,南塘浜路9999号')
order by pkid

序号	分词	词性	位置
1	南塘浜路	cn	1
2	9009	num	5
3	号	cn	9
4	,	symbol	10
5	南塘浜路	cn	11
6	9999	num	15
7	号	cn	19