需求描述
- 业务数据有30条,假设已按照某列升序规则排序:data_id:1~30;
- 计划等距抽样其中5条数据:seq_id:1~5;
- 抽样结果示例如下(抽取因子 = 30 / 5 = 6):seq_id 1~5 依次对应 data_id 6、12、18、24、30。
SQL 算法实现
/*
2019-06-30 00:14:04 @hury
Systematic (equal-interval) sampling simulation - generic version.

Picks one row per "factor" rows from an ordered data set, where
  factor = (number of data rows) / (number of rows to sample).
With the simulated 30 data rows and 5 expected samples the factor is 6,
so data_id 6, 12, 18, 24 and 30 are returned as seq_id 1..5.

Oracle dialect: DUAL and CONNECT BY LEVEL are used as row generators.

NOTE: if the expected sample size exceeds the number of data rows, the
factor drops below 1; round() can then map several seq_id values to the
same data_id, or to 0, which matches no data row.
*/
with tb_expect_seq(seq_id) as /* sequence numbers of the expected sample: 5 rows assumed */
(select level as seq_id
   from dual
connect by level <= 5),
tb_data(data_id, name) as /* simulated business data: 30 rows assumed */
(select level as data_id,
        'data:' || cast(level as varchar(10)) as name
   from dual
connect by level <= 30),
tb_factor(factor) as /* sampling factor = total data rows / total sample rows */
(select d.sl / s.sl as factor
   from (select count(*) as sl from tb_data) d
  cross join (select count(*) as sl from tb_expect_seq) s),
tb_data_id(seq_id, data_id) as /* data key to pick for each sample sequence number */
/* tb_factor holds exactly one row, so the cross join keeps one row per seq_id */
(select s.seq_id,
        round(s.seq_id * f.factor) as data_id
   from tb_expect_seq s
  cross join tb_factor f)
/* final result: the sampled data rows, labelled with their sample sequence number */
select p.seq_id,
       d.data_id,
       d.name
  from tb_data d
 inner join tb_data_id p
    on d.data_id = p.data_id
 order by p.seq_id, d.data_id;
–END–