之前的函数,基本上满足了获取地域信息的需求,但是,存在的最严重的问题就是效率问题,每次调用PickCityId函数时,都要从新读取一遍dict_cities表,这个效率就很感人了,所以,优化势在必行
考虑到之前已经建立了正则clr,老顾觉得,用正则作为优化依据比较靠谱
比如这样一个正则“(?#1:0|215|1)北京|(?#3:0|155|3)上海”,可以直接进行匹配北京和上海的信息,而?#为正则注释,这个里边的内容不参与正则运算,但匹配后,我们可以对原始正则字符串再次匹配,比如匹配到北京,我们可以根据原始正则字符串,拿到注释里的内容,嗯,就是这么个思路,这样,每次使用PickCityId方法前,只需要读一次数据库,用来生成正则即可
CREATE FUNCTION [dbo].[CityIdReg]
(
)
RETURNS nvarchar(max)
AS
BEGIN
DECLARE @reg nvarchar(max)
select @reg = stuff((
select '|(?#'+convert(varchar,city_id)+':'+city_path+')'+n
from dict_cities ci with (nolock)
cross apply (
select isnull(
(case
when master.dbo.regexismatch(isnull(abbr,''),'[a-z]')=1
then city_name else abbr end
)
,master.dbo.regexreplace(city_name,'[矿新]区$|自治.*$|[县区]$','')) as n
) b
where city_del=0
and isnull(targetId,0)=0
and datalength(n)>0
order by n desc -- 为了避免出现异常,对条目进行排序,避免出现深圳匹配不到,却匹配到深县(因为县字已被删除)
for xml path('')
),1,1,'')
RETURN @reg
END
通过一个自定义函数,直接返回一个很长的正则字符串,除匹配内容外,还包含了地域ID和path,嗯,都在注释里
然后,改造一下我们的PickCityId函数
ALTER FUNCTION [dbo].[PickCityId]
(
@xml xml,@reg nvarchar(max)
)
RETURNS int
AS
BEGIN
DECLARE @id int
declare @d table(addr nvarchar(max),wsn int)
insert into @d
select
replace(match,'杨凌','杨陵') as addr
,sn as wsn
from master.dbo.regexmatches(convert(nvarchar(max),@xml),'(?<=\w+=")[^"]+')
declare @t2 table(
addr nvarchar(max),wsn int,city_id int,city_path varchar(100),loc nvarchar(100)
,sn int,match nvarchar(100),pos int,mlen int
)
insert into @t2
select addr,wsn,city_id,city_path,b.match,b.sn,b.match,b.[index],datalength(b.match)
from @d a
cross apply (
select * from master.dbo.RegexMatches(addr,@reg)
) b
cross apply (
-- 因为有多个类似朝阳、钟山、海南这样的结果,所以将所有结果都提取出来
select match as n
from master.dbo.RegexMatches(@reg,'(?<=^|[\|])\([^\)]+\)'+b.match+'(?![东南西北中]?大?[路街村镇乡]|大道)'+'(?=$|[\|])')
) c
cross apply (
select master.dbo.RegexMatch(n,'(?<=\(\?#)\d+') as city_id
,master.dbo.RegexMatch(n,'(?<=\(\?#\d+:)[\d\|]+') as city_path
) d
declare @t3 table(city_id int,city_path varchar(100),sn int,pos int,mlen int,lv int,wsn int,match nvarchar(100))
insert into @t3
select city_id,city_path,sn,pos,mlen,lv,wsn,match
from @t2
cross apply (
select count(0) as lv
from master.dbo.splitstr(city_path,'|')
) b
where lv>2 and master.dbo.regexismatch(match,'^[东南西北中]$')=0
declare @t4 table(city_id int,city_path varchar(100),sn int,pos int,mlen int,lv int,wsn int,match nvarchar(100))
insert into @t4
select top 1 * from @t3 order by pos,mlen desc,lv
declare @t5 table(city_id int,city_path varchar(100),sn int,pos int,mlen int,lv int,wsn int,match nvarchar(100))
insert into @t5
select t3.*
from @t4 t4
left join @t3 t3 on charindex(t4.city_path,t3.city_path)>0
where not (t3.lv<>t4.lv and t3.pos=t4.pos and t3.mlen<=t4.mlen and t3.wsn=t4.wsn)
;with t6 as (
select *,(case
when exists(select top 1 1 from @t5 where match=a.match and lv<a.lv) then 1
when exists(select top 1 1 from @t5 where charindex(a.match,match)>0 and lv<a.lv) then 2
else 0 end
) as same
from @t5 a
cross apply (
select count(0) as p
from @t5
where city_id<>a.city_id and charindex(city_path,a.city_path)>0
) b
where not exists(
select top 1 1
from @t5
where pos=a.pos and lv<a.lv and wsn=a.wsn and match=a.match and lv>3
)
)
select @id = (
select top 1 city_id
from t6 a
order by a.p desc,same,pos,mlen desc
)
RETURN @id
END
好了,第一次优化完成
效率从原来匹配100个地域信息需要3分钟多,变成了现在需要40秒左右。。。。。
嗯,继续求各位大佬提供优化思路