对1.7亿的有效OFFER,按照11个不同的GROUP BY SET组合分组,每个组合按照6个不同的字段做COUNT DISTINCT以及COUNT;
使用GREENPLUM MAP REDUCE解决GROUP BY DISTINCT GROUP BY SET的问题
之前单纯用SQL实现,在9个节点的GP测试环境中,耗时4万秒;
用GP MAP REDUCE 重写JOB,在同样的环境中整个过程耗时1200秒左右(20分钟);
默认的GREENPLUM GROUP BY DISTINCT GROUP SETS
INSERT INTO xxx (
yyyymmdd
,offer_category_id
,offer_category_id_1
,offer_category_id_2
,offer_category_id_3
,member_city_id
,offer_score_level
,seller_active_level
,grouping_src
,valid_sale_offer_cnt
,valid_smart_offer_cnt
,new_sale_offer_cnt
,repost_sale_offer_cnt
,new_smart_offer_cnt
,repost_smart_offer_cnt
,sale_offer_memb_cnt
,smart_offer_memb_cnt
,new_sale_memb_cnt
,repost_sale_memb_cnt
,new_smart_memb_cnt
,repost_smart_memb_cnt
)
SELECT /*+ parallel(a */
‘20100516′::DATE
,offer_category_id
,offer_category_id_1
,offer_category_id_2
,NULL
,member_city_id
,offer_score_level
,seller_active_level
,1
,COUNT(CASE WHEN is_valid_sale_offer = ‘Y’ THEN offer_id ELSE NULL END) AS valid_sale_offer_cnt
,COUNT(CASE WHEN is_valid_smart_offer = ‘Y’ THEN offer_id ELSE NULL END) AS valid_smart_offer_cnt
,COUNT(CASE WHEN is_new_sale_offer = ‘Y’ THEN offer_id ELSE NULL END) AS new_sale_offer_cnt
,COUNT(CASE WHEN is_repost_sale_offer = ‘Y’ THEN offer_id ELSE NULL END) AS repost_sale_offer_cnt
,COUNT(CASE WHEN is_new_smart_offer = ‘Y’ THEN offer_id ELSE NULL END) AS new_smart_offer_cnt
,COUNT(CASE WHEN is_repost_smart_offer = ‘Y’ THEN offer_id ELSE NULL END) AS repost_smart_offer_cnt
,COUNT(distinct CASE WHEN is_valid_sale_offer = ‘Y’ THEN member_id ELSE NULL END) AS sale_offer_memb_cnt
,COUNT(distinct CASE WHEN is_valid_smart_offer = ‘Y’ THEN member_id ELSE NULL END) AS smart_offer_memb_cnt
,COUNT(distinct CASE WHEN is_new_sale_offer = ‘Y’ THEN member_id ELSE NULL END) AS new_sale_memb_cnt
,COUNT(distinct CASE WHEN is_repost_sale_offer = ‘Y’ THEN member_id ELSE NULL END) AS repost_sale_memb_cnt
,COUNT(distinct CASE WHEN is_new_smart_offer = ‘Y’ THEN member_id ELSE NULL END) AS new_smart_memb_cnt
,COUNT(distinct CASE WHEN is_repost_smart_offer = ‘Y’ THEN member_id ELSE NULL END) AS repost_smart_memb_cnt
FROM xxx a
GROUP BY GROUPing sets
(
(
offer_category_id_1
,member_city_id
,seller_active_level
,offer_score_level
),
(
offer_category_id_2
,member_city_id
,seller_active_level
,offer_score_level
),
(
offer_category_id
,member_city_id
,seller_active_level
,offer_score_level
),
(
offer_category_id_1
,member_city_id
,seller_active_level
),
(
offer_category_id_2
,member_city_id
,seller_active_level
),
(
offer_category_id
,member_city_id
,seller_active_level
),
(
member_city_id
,seller_active_level
,offer_score_level
),
(
member_city_id
,seller_active_level
),
(
offer_category_id_1
,offer_score_level
),
(
offer_category_id_2
,offer_score_level
),
(
offer_category_id
,offer_score_level)
);
查询成功: 共计 …行受到影响,耗时: 40572505 毫秒(ms)。
gpadmin@hadoop1:/home/gpadmin/yml>./a.sh
TRUNCATE TABLE
mapreduce_26720_run_1
DONE
real 16m14.025s
user 0m0.003s
sys 0m0.001s
select count(1) from mp_result
aligputf8=# select count(1) from mp_result;
count
———–
653862840
(1 row)
Time: 9151.286 ms
aligputf8=# create table rd_result_temp with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN)
aligputf8-# as
aligputf8-# select key1,
aligputf8-# key2,
aligputf8-# key3,
aligputf8-# key4,
aligputf8-# group_set,
aligputf8-# value_type,
aligputf8-# value,
aligputf8-# count(1) as cnt_offer
aligputf8-# from mp_result
aligputf8-# group by key1, key2, key3, key4, group_set, value_type, value
aligputf8-# distributed by (key1,key2,key3,key4);
SELECT 34098099
Time: 112664.701 ms
aligputf8=# create table res_t1 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key1, key2, key3, key4,group_set
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count(case when value_type=1 then value else null end) as type1count
aligputf8-# ,count(case when value_type=2 then value else null end) as type2count
aligputf8-# ,count(case when value_type=3 then value else null end) as type3count
aligputf8-# from rd_result_temp
aligputf8-# group by (key1, key2, key3, key4,group_set)
aligputf8-# distributed by(key1, key2, key3, key4);
Time: 8616.959 ms
aligputf8=# create table res_t2 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key1, key2, key3,group_set
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count(case when value_type=1 then value else null end) as type1count
aligputf8-# ,count(case when value_type=2 then value else null end) as type2count
aligputf8-# ,count(case when value_type=3 then value else null end) as type3count
aligputf8-# from (select key1,
aligputf8(# key2,
aligputf8(# key3,
aligputf8(# group_set,
aligputf8(# value_type,
aligputf8(# value,
aligputf8(# sum(cnt_offer) as cnt_offer
aligputf8(# from rd_result_temp
aligputf8(# group by key1, key2, key3, group_set, value_type, value) as a
aligputf8-# group by (key1, key2, key3, group_set)
aligputf8-# distributed by(key1, key2, key3);
Time: 22556.645 ms
aligputf8=# create table res_t3 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key1, key4,group_set
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count(case when value_type=1 then value else null end) as type1count
aligputf8-# ,count(case when value_type=2 then value else null end) as type2count
aligputf8-# ,count(case when value_type=3 then value else null end) as type3count
aligputf8-# from (select key1,
aligputf8(# key4,
aligputf8(# group_set,
aligputf8(# value_type,
aligputf8(# value,
aligputf8(# sum(cnt_offer) as cnt_offer
aligputf8(# from rd_result_temp
aligputf8(# group by key1, key4, group_set, value_type, value) as a
aligputf8-# group by (key1, key4, group_set)
aligputf8-# distributed by(key1, key4);
Time: 19043.426 ms
aligputf8=# create table res_t4 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key2, key3,key4,group_set
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count( case when value_type=1 then value else null end) as type1count
aligputf8-# ,count( case when value_type=2 then value else null end) as type2count
aligputf8-# ,count( case when value_type=3 then value else null end) as type3count
aligputf8-# from (select key2,
aligputf8(# key3,
aligputf8(# key4,
aligputf8(# group_set,
aligputf8(# value_type,
aligputf8(# value,
aligputf8(# sum(cnt_offer) as cnt_offer
aligputf8(# from rd_result_temp
aligputf8(# where group_set=’G3′
aligputf8(# group by key2, key3,key4, group_set, value_type, value) as a
aligputf8-# group by(key2, key3,key4, group_set)
aligputf8-# distributed by(key2, key3,key4);
SELECT 15527
Time: 5547.497 ms
aligputf8=# create table res_t5 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key2, key3
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count( case when value_type=1 then value else null end) as type1count
aligputf8-# ,count( case when value_type=2 then value else null end) as type2count
aligputf8-# ,count( case when value_type=3 then value else null end) as type3count
aligputf8-# from (select key2,
aligputf8(# key3,
aligputf8(# value_type,
aligputf8(# value,
aligputf8(# sum(cnt_offer) as cnt_offer
aligputf8(# from rd_result_temp
aligputf8(# where group_set=’G3′
aligputf8(# group by key2, key3, value_type, value) as a
aligputf8-# group by(key2, key3)
aligputf8-# distributed by(key2, key3);
Time: 4551.675 ms
aligputf8=#
%YAML 1.1
—
VERSION: 1.0.0.1
DATABASE: aligputf8
USER: gpadmin
DEFINE:
- INPUT:
NAME: offer
TABLE: t2
- OUTPUT:
NAME: mp
TABLE: mp_result
MODE: APPEND
#drop table mp_result;
#create table mp_result1(key1 text,key2 text,key3 text,key4 text,group_set text,value_type text,value text) with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) DISTRIBUTED by(key1,key2,key3,key4);
- MAP:
NAME: map_offer
LANGUAGE: python
FUNCTION: |
key_group1 = [offer_category_id_1, member_city_id, seller_active_level, offer_score_level, 'G1']
key_group2 = [offer_category_id_2, member_city_id, seller_active_level, offer_score_level, 'G2']
key_group3 = [offer_category_id, member_city_id, seller_active_level, offer_score_level, 'G3']
#key_group4 = [offer_category_id_1, member_city_id, seller_active_level, '', 'G4']
#key_group5 = [offer_category_id_2, member_city_id, seller_active_level, '', 'G5']
#key_group6 = [offer_category_id, member_city_id, seller_active_level, '', 'G6']
#key_group7 = [offer_category_id_1, offer_score_level, '', '', 'G7']
#key_group8 = [offer_category_id_2, offer_score_level, '', '', 'G8']
#key_group9 = [offer_category_id, offer_score_level, '', '', 'G9']
#key_group10= [member_city_id, seller_active_level, offer_score_level, '', 'G10']
#key_group11= [member_city_id, seller_active_level, '', '', 'G11']
key_sets=[key_group1,key_group2,key_group3]
flags =[is_valid_sale_offer, is_valid_smart_offer,is_new_sale_offer,is_repost_sale_offer,is_new_smart_offer,is_repost_smart_offer]
for index,flag in enumerate(flags):
if flag == ‘Y’:
for key_set in key_sets:
yield[key_set[0],key_set[1],key_set[2],key_set[3],key_set[4],index+1,member_id]
OPTIMIZE: STRICT IMMUTABLE
PARAMETERS:
- offer_category_id text
- offer_category_id_1 text
- offer_category_id_2 text
- offer_category_id_3 text
- member_city_id text
- offer_score_level text
- seller_active_level text
- offer_id text
- is_valid_sale_offer text
- is_valid_smart_offer text
- is_new_sale_offer text
- is_repost_sale_offer text
- is_new_smart_offer text
- is_repost_smart_offer text
- member_id text
RETURNS:
- key1 text
- key2 text
- key3 text
- key4 text
- group_set text
- value_type text
- value text
EXECUTE:
- RUN:
SOURCE: offer
MAP: map_offer
TARGET: mp
#