本文实现了一个按照指定百分位的数据生成等距的序列值,分位数的数据按照每增长一个百分点分一组,如果分组的数据没有值自动会合并到下一组,直到满足条件的分组,上限100组哦.
TODO优化:动态分组,并且不限制分组数量.
思路1 将分组后的数据进行自关联(需要转成行记录),拿到本组以及下一个分组的数值,然后和需要分组的表进行关联
当需要分组的字段落地对应的分组区间的时候(此处采用不等值连接,不知道pgsql是否支持,如果不支持采用where过滤吧)
思路2 使用存储过程或者UDF函数,进行数据的遍历判断数值所在的分组,
思路3 将分组转换成字典或者是Map,将数值映射成整数值(四舍五入)假定为m,然后从分组表中取出字典对应的数值,如果有值则取,否则就看下是取最小分组还是取最大分组. 此方法时间复杂度上是最优的,时间复杂度为O(1).
在判断某个元素是否在某个序列中的时候,dict是O(1),list需要遍历,所以是O(n),这时候尽量不要用list,能够用字典进行存储,尽量不要用list。
注意所有细节部分,需要测试,避免出错.
看得懂的拿走,不谢!
CREATE OR REPLACE FUNCTION cust_fdss_calc_fzld_group_fun_step2(current_view_id varchar,current_col varchar) RETURNS void AS
$BODY$
Declare sqlstring varchar ;
BEGIN
-- 删除历史数据
EXECUTE 'delete from cust_fdss_calc_fzld_branch_percent where view_id =''' || current_view_id || ''' and branch=''' || current_col || ''';' ;
-- step 4.1.1 计算分位数对应的数值
sqlstring:='insert into cust_fdss_calc_fzld_branch_percent
select
array_agg(ix) as percent -- 将序列值转成数组 ,并计算数值的最大值
,max(ix) as max_percent
, ' || quote_literal(current_view_id ) || '::varchar as view_id
, ' || quote_literal(current_col )|| '::varchar as branch
from (
select generate_series(cast(percentile_disc[1]*100 as integer),cast(percentile_disc[2]*100 as integer), 1)/100.0::numeric as ix -- 根据数值的最小和最大值生成序列值
from ( SELECT percentile_disc(ARRAY[0.005,0.995]) WITHIN GROUP (ORDER BY ' || quote_ident(current_col ) || ' ) -- 计算分位数对应的数值
FROM cust_fdss_calc_fzld_s2
where view_id =''' || current_view_id || '''
) percent
) a
;' ;
EXECUTE sqlstring ;
END;
$BODY$
LANGUAGE plpgsql;
================= 完整存储过程 ==================
CREATE OR REPLACE FUNCTION cust_fdss_calc_fzld_group_fun_step2(current_view_id varchar,current_col varchar) RETURNS void AS
$BODY$
Declare sqlstring varchar ;
BEGIN
-- 删除历史数据
EXECUTE 'delete from cust_fdss_calc_fzld_branch_percent where view_id =''' || current_view_id || ''' and branch=''' || current_col || ''';' ;
-- step 4.1.1 计算分位数对应的数值
sqlstring:='insert into cust_fdss_calc_fzld_branch_percent
select
array_agg(ix) as percent -- 将序列值转成数组 ,并计算数值的最大值
,max(ix) as max_percent
, ''' || current_view_id || '''::varchar as view_id
, ''' || current_col || '''::varchar as branch
from (
select generate_series(cast(percentile_disc[1]*100 as integer),cast(percentile_disc[2]*100 as integer), 1)/100::numeric as ix -- 根据数值的最小和最大值生成序列值
from ( SELECT percentile_disc(ARRAY[0.005,0.995]) WITHIN GROUP (ORDER BY "' || current_col || '") -- 计算分位数对应的数值
FROM cust_fdss_calc_fzld_s2
where view_id =''' || current_view_id || '''
) percent
) a
;' ;
EXECUTE sqlstring ;
-- step 4.1.2 对预估赔付率进行分组
-- EXECUTE 'drop table if exists cust_fdss_calc_fzld_branch_group ;' ;
-- EXECUTE 'delete from cust_fdss_calc_fzld_branch_group where view_id =''' || current_view_id || ''' ;' ;
EXECUTE 'delete from cust_fdss_calc_fzld_branch_group where view_id =''' || current_view_id || ''' and branch=''' || current_col || ''';' ;
sqlstring:='insert into cust_fdss_calc_fzld_branch_group
select
a.view_id
,policy_id
,coins_premium_ply
,b.branch as branch -- yugu_pfl
,' || current_col || '
,case
-- when yugu_pfl < percent [01 ] then percent[01 ] -- 剔除第一分组的人
when ' || current_col || ' <= percent [02 ] then percent[02 ]
when ' || current_col || ' <= percent [03 ] then percent[03 ]
when ' || current_col || ' <= percent [04 ] then percent[04 ]
when ' || current_col || ' <= percent [05 ] then percent[05 ]
when ' || current_col || ' <= percent [06 ] then percent[06 ]
when ' || current_col || ' <= percent [07 ] then percent[07 ]
when ' || current_col || ' <= percent [08 ] then percent[08 ]
when ' || current_col || ' <= percent [09 ] then percent[09 ]
when ' || current_col || ' <= percent [10 ] then percent[10 ]
when ' || current_col || ' <= percent [11 ] then percent[11 ]
when ' || current_col || ' <= percent [12 ] then percent[12 ]
when ' || current_col || ' <= percent [13 ] then percent[13 ]
when ' || current_col || ' <= percent [14 ] then percent[14 ]
when ' || current_col || ' <= percent [15 ] then percent[15 ]
else max_percent end as branch_g -- 此处将分组分组加1
from cust_fdss_calc_fzld_s2 a join cust_fdss_calc_fzld_branch_percent b on 1=1 and a.view_id=b.view_id and b.branch=''' || current_col || '''
where a.view_id =''' || current_view_id || '''
-- limit 90
;' ;
EXECUTE sqlstring ;
-- 4.1.3 预估赔付率 分组统计
EXECUTE 'delete from cust_fdss_calc_fzld_branch_static where view_id =''' || current_view_id || ''' and branch=''' || current_col || ''';' ;
sqlstring:='insert into cust_fdss_calc_fzld_branch_static
select
view_id
,branch
,branch_g
,row_number() over(order by branch_g ) as rw
,sum(coins_premium_ply) as coins_premium_ply
,count(1) as g_cnt
from cust_fdss_calc_fzld_branch_group
where branch_g is not null and view_id=''' || current_view_id || ''' and branch=''' || current_col || '''
group by view_id ,branch ,branch_g
; ' ;
EXECUTE sqlstring ;
END;
$BODY$
LANGUAGE plpgsql;
# 后期采用json形式进行优化后,效率提升50%,记录下
delete from tmp_cust_fdss_calc_fzld_branch_percent where view_id='4049fa36b4cb4b39a7d2dfa900a95ec2';
insert into tmp_cust_fdss_calc_fzld_branch_percent
select
concat('{', array_to_string( array_agg(v),',') ,'}' )::json as percents
, min(ix) as min_branch_value
, max(ix) as max_branch_value
,'yugu_pfl'::varchar as branch
,'4049fa36b4cb4b39a7d2dfa900a95ec2'::VARCHAR as view_id
from (
SELECT concat('"',cast(round(ix,2)*1 as numeric)::text ,'":',round(ix,2)::text,'') as v ,ix
from(
select generate_series(cast(PERCENTILE_CONT[1]*100 as integer),cast(PERCENTILE_CONT[2]*100 as integer), 1)/100::numeric as ix -- 根据数值的最小和最大值生成序列值
from (
SELECT PERCENTILE_CONT(ARRAY[0.005,0.995]) WITHIN GROUP (ORDER BY yugu_pfl ) -- 计算分位数对应的数值
FROM cust_fdss_calc
where view_id = '01e5bc65ac054e8b8c22ef84b84cc237'
) percent
)aa
) b
;
select
a.view_id
,policy_id
,calc_coins_premium_ply as coins_premium_ply
,b.branch as branch
,yugu_pfl as branch_value
, case
when yugu_pfl <= min_branch_value then min_branch_value::varchar -- 剔除第一分组的人
when yugu_pfl >= max_branch_value then max_branch_value::varchar -- 剔除第一分组的人
else percents->>(round(cast(yugu_pfl as numeric) ,2))::varchar end as branch_g
-- percents->>(round(cast(yugu_pfl as numeric) ,2))::varchar as branch_g
from cust_fdss_calc a
join tmp_cust_fdss_calc_fzld_branch_percent b on 1=1 and a.view_id=b.view_id and b.branch= 'yugu_pfl'
where a.view_id ='4049fa36b4cb4b39a7d2dfa900a95ec2'
-- limit 10
;