Hive实现大数据集有放回抽样(方法一)

------ 特点:效率高


------  table(1)  数据全集




use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020;
create  table  databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020 as
select a.*,b.city
from
(select *
from databasename.CiiFcst_hotel_Cii_basetable
where d='2016-08-29') a
inner join 
(select distinct masterhotel, city  from databasename.CiiFrcst_Hotel_Info  ---有重复的数据
where d='2016-10-19') b on a.hotelid=b.masterhotel
where b.city in (375
,28
,258
,38
,660
,3996
,3997
,4004
,4029
,4125
,4130
,4131
,4137
,4139
,4144);




----- 数据行数:261051


--- select hotelid, count(*)
--- from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020
--- group by hotelid;
------   各组均为31




------ table (2.1)  关键表1




use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1;
create  table  databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1 as
select a.*, 
rank()over(partition by hotelid order by starttime asc) rank   ----- 用row_num()会导致重复,为何?  
---,sum(notcancelcii)over(partition by hotelid) /sum(notcancelcii)over() r1, round(100000*notcancelcii/sum(notcancelcii)over()) cnt
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020 a;








------ table (2.2)




--use databasename;
--drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1;
--create  table  databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1 as
--select a.*, 
---sum(notcancelcii)over(partition by hotelid) /sum(notcancelcii)over() r1, round(100000*notcancelcii/sum(notcancelcii)over()) cnt   --语句有误
--from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_0 a;










------  table (3)  获得每个酒店抽样个数




use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_2;
create  table  databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_2 as
select hotelid, cnt, cnt/sum(cnt)over() r3, round(2*100000*cnt/sum(cnt)over()) sampling_num
from
(select hotelid, sum(notcancelcii) cnt
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020
group by hotelid) a;


------- sampling_num 最大值






----- table (4) 生成酒店随机序列表  关键表2
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_3;
create  table  databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_3 as
select  hotelid, cnt, r3, sampling_num, b.x seq, int(floor(rand()*31)+1) rand_value  ----注意:值类型不一致的问题
from
(select x, 1 label  from databasename.ym_dual where x<=2*621) b  ----添加过滤,及调换顺序,效率大幅提升. databasename.ym_dual  类似与oracle的dual表
inner join
(select hotelid, cnt, r3, sampling_num, 1 label 
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_2) a
on b.label=a.label 
where a.sampling_num>=b.x;




----- rank取值 1-31
----- 1分钟结束






------ table (5) 两表关联,完成随机抽样
-----  最终表




use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_4;
create  table  databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_4 as
select b.*
from  databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_3 a
inner join databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1 b
on a.hotelid=b.hotelid and  a.rand_value=b.rank;
------ 1分钟


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值