------ 特点:效率高
------ table(1) 数据全集
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020 as
select a.*,b.city
from
(select *
from databasename.CiiFcst_hotel_Cii_basetable
where d='2016-08-29') a
inner join
(select distinct masterhotel, city from databasename.CiiFrcst_Hotel_Info ---有重复的数据
where d='2016-10-19') b on a.hotelid=b.masterhotel
where b.city in (375
,28
,258
,38
,660
,3996
,3997
,4004
,4029
,4125
,4130
,4131
,4137
,4139
,4144);
----- 数据行数:261051
--- select hotelid, count(*)
--- from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020
--- group by hotelid;
------ 各组均为31
------ table (2.1) 关键表1
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1 as
select a.*,
rank()over(partition by hotelid order by starttime asc) rank ----- 用row_num()会导致重复,为何?
---,sum(notcancelcii)over(partition by hotelid) /sum(notcancelcii)over() r1, round(100000*notcancelcii/sum(notcancelcii)over()) cnt
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020 a;
------ table (2.2)
--use databasename;
--drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1;
--create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1 as
--select a.*,
---sum(notcancelcii)over(partition by hotelid) /sum(notcancelcii)over() r1, round(100000*notcancelcii/sum(notcancelcii)over()) cnt --语句有误
--from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_0 a;
------ table (3) 获得每个酒店抽样个数
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_2;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_2 as
select hotelid, cnt, cnt/sum(cnt)over() r3, round(2*100000*cnt/sum(cnt)over()) sampling_num
from
(select hotelid, sum(notcancelcii) cnt
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020
group by hotelid) a;
------- sampling_num 最大值
----- table (4) 生成酒店随机序列表 关键表2
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_3;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_3 as
select hotelid, cnt, r3, sampling_num, b.x seq, int(floor(rand()*31)+1) rand_value ----注意:值类型不一致的问题
from
(select x, 1 label from databasename.ym_dual where x<=2*621) b ----添加过滤,及调换顺序,效率大幅提升. databasename.ym_dual 类似与oracle的dual表
inner join
(select hotelid, cnt, r3, sampling_num, 1 label
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_2) a
on b.label=a.label
where a.sampling_num>=b.x;
----- rank取值 1-31
----- 1分钟结束
------ table (5) 两表关联,完成随机抽样
----- 最终表
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_4;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_4 as
select b.*
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_3 a
inner join databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1 b
on a.hotelid=b.hotelid and a.rand_value=b.rank;
------ 1分钟
------ table(1) 数据全集
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020 as
select a.*,b.city
from
(select *
from databasename.CiiFcst_hotel_Cii_basetable
where d='2016-08-29') a
inner join
(select distinct masterhotel, city from databasename.CiiFrcst_Hotel_Info ---有重复的数据
where d='2016-10-19') b on a.hotelid=b.masterhotel
where b.city in (375
,28
,258
,38
,660
,3996
,3997
,4004
,4029
,4125
,4130
,4131
,4137
,4139
,4144);
----- 数据行数:261051
--- select hotelid, count(*)
--- from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020
--- group by hotelid;
------ 各组均为31
------ table (2.1) 关键表1
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1 as
select a.*,
rank()over(partition by hotelid order by starttime asc) rank ----- 用row_num()会导致重复,为何?
---,sum(notcancelcii)over(partition by hotelid) /sum(notcancelcii)over() r1, round(100000*notcancelcii/sum(notcancelcii)over()) cnt
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020 a;
------ table (2.2)
--use databasename;
--drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1;
--create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1 as
--select a.*,
---sum(notcancelcii)over(partition by hotelid) /sum(notcancelcii)over() r1, round(100000*notcancelcii/sum(notcancelcii)over()) cnt --语句有误
--from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_0 a;
------ table (3) 获得每个酒店抽样个数
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_2;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_2 as
select hotelid, cnt, cnt/sum(cnt)over() r3, round(2*100000*cnt/sum(cnt)over()) sampling_num
from
(select hotelid, sum(notcancelcii) cnt
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020
group by hotelid) a;
------- sampling_num 最大值
----- table (4) 生成酒店随机序列表 关键表2
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_3;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_3 as
select hotelid, cnt, r3, sampling_num, b.x seq, int(floor(rand()*31)+1) rand_value ----注意:值类型不一致的问题
from
(select x, 1 label from databasename.ym_dual where x<=2*621) b ----添加过滤,及调换顺序,效率大幅提升. databasename.ym_dual 类似与oracle的dual表
inner join
(select hotelid, cnt, r3, sampling_num, 1 label
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_2) a
on b.label=a.label
where a.sampling_num>=b.x;
----- rank取值 1-31
----- 1分钟结束
------ table (5) 两表关联,完成随机抽样
----- 最终表
use databasename;
drop table if exists databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_4;
create table databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_4 as
select b.*
from databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_3 a
inner join databasename.Tmp_CiiFcst_hotel_Cii_up_sample_1020_1 b
on a.hotelid=b.hotelid and a.rand_value=b.rank;
------ 1分钟