使用GP MAP REDUCE解决GROUP BY SET DISTINCT

对1.7亿的有效OFFER,按照11个不同的GROUP BY SET组合分组,每个组合按照6个不同的字段做COUNT DISTINCT以及COUNT;
使用GREENPLUM MAP REDUCE解决GROUP BY DISTINCT GROUP BY SET的问题
之前单纯用SQL实现,在9个节点的GP测试环境中,耗时4万秒;
用GP MAP REDUCE 重写JOB,在同样的环境中整个过程耗时1200秒左右(20分钟);

 

默认的GREENPLUM GROUP BY DISTINCT GROUP SETS
INSERT INTO xxx                (
yyyymmdd
,offer_category_id
,offer_category_id_1
,offer_category_id_2
,offer_category_id_3
,member_city_id
,offer_score_level
,seller_active_level
,grouping_src
,valid_sale_offer_cnt
,valid_smart_offer_cnt
,new_sale_offer_cnt
,repost_sale_offer_cnt
,new_smart_offer_cnt
,repost_smart_offer_cnt
,sale_offer_memb_cnt
,smart_offer_memb_cnt
,new_sale_memb_cnt
,repost_sale_memb_cnt
,new_smart_memb_cnt
,repost_smart_memb_cnt
)
SELECT /*+ parallel(a 8) */
‘20100516′::DATE
,offer_category_id
,offer_category_id_1
,offer_category_id_2
,NULL
,member_city_id
,offer_score_level
,seller_active_level
,1
,COUNT(CASE WHEN is_valid_sale_offer = ‘Y’   THEN offer_id ELSE NULL END) AS valid_sale_offer_cnt
,COUNT(CASE WHEN is_valid_smart_offer = ‘Y’  THEN offer_id ELSE NULL END) AS valid_smart_offer_cnt
,COUNT(CASE WHEN is_new_sale_offer = ‘Y’     THEN offer_id ELSE NULL END) AS new_sale_offer_cnt
,COUNT(CASE WHEN is_repost_sale_offer = ‘Y’  THEN offer_id ELSE NULL END) AS repost_sale_offer_cnt
,COUNT(CASE WHEN is_new_smart_offer = ‘Y’    THEN offer_id ELSE NULL END) AS new_smart_offer_cnt
,COUNT(CASE WHEN is_repost_smart_offer = ‘Y’ THEN offer_id ELSE NULL END) AS repost_smart_offer_cnt
,COUNT(distinct CASE WHEN is_valid_sale_offer = ‘Y’    THEN member_id ELSE NULL END) AS sale_offer_memb_cnt
,COUNT(distinct CASE WHEN is_valid_smart_offer = ‘Y’   THEN member_id ELSE NULL END) AS smart_offer_memb_cnt
,COUNT(distinct CASE WHEN is_new_sale_offer = ‘Y’      THEN member_id ELSE NULL END) AS new_sale_memb_cnt
,COUNT(distinct CASE WHEN is_repost_sale_offer = ‘Y’   THEN member_id ELSE NULL END) AS repost_sale_memb_cnt
,COUNT(distinct CASE WHEN is_new_smart_offer = ‘Y’     THEN member_id ELSE NULL END) AS new_smart_memb_cnt
,COUNT(distinct CASE WHEN is_repost_smart_offer = ‘Y’  THEN member_id ELSE NULL END) AS repost_smart_memb_cnt
FROM        xxx a
GROUP BY GROUPing sets
(
(
offer_category_id_1
,member_city_id
,seller_active_level
,offer_score_level
),
(
offer_category_id_2
,member_city_id
,seller_active_level
,offer_score_level
),
(
offer_category_id
,member_city_id
,seller_active_level
,offer_score_level
),
(
offer_category_id_1
,member_city_id
,seller_active_level
),
(
offer_category_id_2
,member_city_id
,seller_active_level
),
(
offer_category_id
,member_city_id
,seller_active_level
),
(
member_city_id
,seller_active_level
,offer_score_level
),
(
member_city_id
,seller_active_level
),
(
offer_category_id_1
,offer_score_level
),
(
offer_category_id_2
,offer_score_level
),
(
offer_category_id
,offer_score_level)
);

查询成功: 共计 …行受到影响,耗时: 40572505 毫秒(ms)。

gpadmin@hadoop1:/home/gpadmin/yml>./a.sh
TRUNCATE TABLE
mapreduce_26720_run_1
DONE
real    16m14.025s
user    0m0.003s
sys     0m0.001s
select count(1) from mp_result
aligputf8=# select count(1) from mp_result;
count
———–
653862840
(1 row)
Time: 9151.286 ms
aligputf8=# create table rd_result_temp with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN)
aligputf8-#  as
aligputf8-#  select        key1,
aligputf8-#                key2,
aligputf8-#                key3,
aligputf8-#                key4,
aligputf8-#                group_set,
aligputf8-#                value_type,
aligputf8-#                value,
aligputf8-#                count(1) as cnt_offer
aligputf8-#           from mp_result
aligputf8-#          group by key1, key2, key3, key4, group_set, value_type, value
aligputf8-#  distributed by (key1,key2,key3,key4);
SELECT 34098099
Time: 112664.701 ms
aligputf8=# create table res_t1 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key1, key2, key3, key4,group_set
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count(case when value_type=1 then value else null end) as type1count
aligputf8-# ,count(case when value_type=2 then value else null end) as type2count
aligputf8-# ,count(case when value_type=3 then value else null end) as type3count
aligputf8-# from rd_result_temp
aligputf8-# group by (key1, key2, key3, key4,group_set)
aligputf8-# distributed by(key1, key2, key3, key4);
Time: 8616.959 ms
aligputf8=# create table res_t2 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key1, key2, key3,group_set
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count(case when value_type=1 then value else null end) as type1count
aligputf8-# ,count(case when value_type=2 then value else null end) as type2count
aligputf8-# ,count(case when value_type=3 then value else null end) as type3count
aligputf8-#   from (select key1,
aligputf8(#                key2,
aligputf8(#                key3,
aligputf8(#                group_set,
aligputf8(#                value_type,
aligputf8(#                value,
aligputf8(#                sum(cnt_offer) as cnt_offer
aligputf8(#           from rd_result_temp
aligputf8(#          group by key1, key2, key3, group_set, value_type, value) as a
aligputf8-#  group by (key1, key2, key3, group_set)
aligputf8-# distributed by(key1, key2, key3);
Time: 22556.645 ms
aligputf8=# create table res_t3 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key1, key4,group_set
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count(case when value_type=1 then value else null end) as type1count
aligputf8-# ,count(case when value_type=2 then value else null end) as type2count
aligputf8-# ,count(case when value_type=3 then value else null end) as type3count
aligputf8-#   from (select key1,
aligputf8(#                key4,
aligputf8(#                group_set,
aligputf8(#                value_type,
aligputf8(#                value,
aligputf8(#                sum(cnt_offer) as cnt_offer
aligputf8(#           from rd_result_temp
aligputf8(#          group by key1, key4, group_set, value_type, value) as a
aligputf8-#  group by (key1, key4, group_set)
aligputf8-# distributed by(key1, key4);
Time: 19043.426 ms
aligputf8=# create table res_t4 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key2, key3,key4,group_set
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count( case when value_type=1 then value else null end) as type1count
aligputf8-# ,count( case when value_type=2 then value else null end) as type2count
aligputf8-# ,count( case when value_type=3 then value else null end) as type3count
aligputf8-#   from (select key2,
aligputf8(#                key3,
aligputf8(#                key4,
aligputf8(#                group_set,
aligputf8(#                value_type,
aligputf8(#                value,
aligputf8(#                sum(cnt_offer) as cnt_offer
aligputf8(#           from rd_result_temp
aligputf8(#           where group_set=’G3′
aligputf8(#          group by key2, key3,key4, group_set, value_type, value) as a
aligputf8-#  group by(key2, key3,key4, group_set)
aligputf8-#  distributed by(key2, key3,key4);
SELECT 15527
Time: 5547.497 ms
aligputf8=# create table res_t5 with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) as
aligputf8-# select key2, key3
aligputf8-# ,sum(case when value_type=1 then cnt_offer else null end) as type1sum
aligputf8-# ,sum(case when value_type=2 then cnt_offer else null end) as type2sum
aligputf8-# ,sum(case when value_type=3 then cnt_offer else null end) as type3sum
aligputf8-# ,count( case when value_type=1 then value else null end) as type1count
aligputf8-# ,count( case when value_type=2 then value else null end) as type2count
aligputf8-# ,count( case when value_type=3 then value else null end) as type3count
aligputf8-#   from (select key2,
aligputf8(#                key3,
aligputf8(#                value_type,
aligputf8(#                value,
aligputf8(#                sum(cnt_offer) as cnt_offer
aligputf8(#           from rd_result_temp
aligputf8(#           where group_set=’G3′
aligputf8(#          group by key2, key3, value_type, value) as a
aligputf8-#  group by(key2, key3)
aligputf8-#  distributed by(key2, key3);
Time: 4551.675 ms
aligputf8=#
%YAML 1.1

VERSION:         1.0.0.1
DATABASE:        aligputf8
USER:            gpadmin
DEFINE:
- INPUT:
NAME:   offer
TABLE:  t2
- OUTPUT:
NAME:   mp
TABLE:  mp_result
MODE:   APPEND
#drop table mp_result;
#create table mp_result1(key1 text,key2 text,key3 text,key4 text,group_set text,value_type text,value text) with(appendonly=true,compresslevel=5,ORIENTATION=COLUMN) DISTRIBUTED by(key1,key2,key3,key4);
- MAP:
NAME:      map_offer
LANGUAGE:  python
FUNCTION:  |
key_group1 = [offer_category_id_1, member_city_id,      seller_active_level, offer_score_level,   'G1']
key_group2 = [offer_category_id_2, member_city_id,      seller_active_level, offer_score_level,   'G2']
key_group3 = [offer_category_id,   member_city_id,      seller_active_level, offer_score_level,   'G3']
#key_group4 = [offer_category_id_1, member_city_id,      seller_active_level, '',                  'G4']
#key_group5 = [offer_category_id_2, member_city_id,      seller_active_level, '',                  'G5']
#key_group6 = [offer_category_id,   member_city_id,      seller_active_level, '',                  'G6']
#key_group7 = [offer_category_id_1, offer_score_level,   '',                  '',                  'G7']
#key_group8 = [offer_category_id_2, offer_score_level,   '',                  '',                  'G8']
#key_group9 = [offer_category_id,   offer_score_level,   '',                  '',                  'G9']
#key_group10= [member_city_id,      seller_active_level, offer_score_level,   '',                  'G10']
#key_group11= [member_city_id,      seller_active_level, '',                  '',                  'G11']
key_sets=[key_group1,key_group2,key_group3]
flags =[is_valid_sale_offer, is_valid_smart_offer,is_new_sale_offer,is_repost_sale_offer,is_new_smart_offer,is_repost_smart_offer]
for index,flag in enumerate(flags):
if flag == ‘Y’:
for key_set in key_sets:
yield[key_set[0],key_set[1],key_set[2],key_set[3],key_set[4],index+1,member_id]
OPTIMIZE:   STRICT IMMUTABLE
PARAMETERS:
- offer_category_id     text
- offer_category_id_1   text
- offer_category_id_2   text
- offer_category_id_3   text
- member_city_id        text
- offer_score_level     text
- seller_active_level   text
- offer_id              text
- is_valid_sale_offer   text
- is_valid_smart_offer  text
- is_new_sale_offer     text
- is_repost_sale_offer  text
- is_new_smart_offer    text
- is_repost_smart_offer text
- member_id             text
RETURNS:
- key1 text
- key2 text
- key3 text
- key4 text
- group_set text
- value_type text
- value text
EXECUTE:
- RUN:
SOURCE:    offer
MAP:       map_offer
TARGET:    mp
#

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值