已知一个表STG.ORDER,有如下字段:Date,Order_id,User_id,amount。请给出sql进行统计:数据样例:2017-01-01,10029028,1000003251,33.57。
1)给出 2017年每个月的订单数、用户数、总成交金额。
2)给出2017年11月的新客数(指在11月才有第一笔订单)
建表语句
create table second_order(`Date` String,Order_id String,User_id String,amount double);
样例数据
同一个用户,相同月份
insert into table second_order values (‘2017-01-01’,‘10029028’,‘1000003251’,33.57);
insert into table second_order values (‘2017-01-01’,‘10029029’,‘1000003251’,33.57);
不同用户,相同月份
insert into table second_order values (‘2017-01-01’,‘100290288’,‘1000003252’,33.57);
不同月份
insert into table second_order values (‘2017-02-02’,‘10029088’,‘1000003251’,33.57);
insert into table second_order values (‘2017-02-02’,‘100290281’,‘1000003251’,33.57);
insert into table second_order values (‘2017-02-02’,‘100290282’,‘1000003253’,33.57);
insert into table second_order values (‘2017-11-02’,‘10290282’,‘100003253’,234);
insert into table second_order values (‘2017-11-02’,‘10290282’,‘100003243’,234);
1、实现的sql语句
select
a.month,
a.ordercount,
b.uc,
a.totalcount
from
(select
tmp2.month,
count(1) ordercount,
sum(amount) totalcount
from
(
select
from_unixtime(unix_timestamp(`Date`,'yyyy-mm-dd'),'yyyy-mm') month,
order_id,
user_id,
amount
from orders
)tmp2
group by month
)a
join
(
select
tmp1.month,
count(1) uc
from (
select
month,
order_id,
user_id,
row_number() over(partition by tmp.month,tmp.user_id order by tmp.amount)con
from (
select
from_unixtime(unix_timestamp(`Date`,'yyyy-mm-dd'),'yyyy-mm') month,
order_id,
user_id,
amount
from orders
) tmp
)tmp1
where tmp1.con=1
group by tmp1.month
)b
on a.month=b.month
简便写法
select
substring(`Date`,1,7) month,
count(order_id),
count(DISTINCT(user_id)),
sum(amount) totalcount
from orders
where substring(`Date`,1,4) ='2017'
group by substring(`Date`,1,7);
但是这种简便的写法,在数据巨多的情况下,恐怕会发生数据倾斜!尽量实际的生产过程中不要使用distinct,这样会把相同user_id shuffle到同一个reduce里。这就会造成数据倾斜。采用上面的思路把用户数拆分出来!最后join。
第一步采用group by是去重。
select
substring(`Date`,1,7) month,
user_id
from orders
group by substring(`Date`,1,7),user_id;
结果是:
month user_id
2017-01 1000003251
2017-01 1000003252
2017-02 1000003251
2017-02 1000003253
2017-11 100003243
2017-11 100003253
select
a.month,
count(a.user_id)
from
(
select
substring(`Date`,1,7) month,
user_id
from orders
group by substring(`Date`,1,7),user_id
) a
group by a.month;
最终结果为
select
a.month,
a.ordercount,
b.usercount,
a.totalcount
from
(
select
substring(`Date`,1,7) month,
count(order_id) ordercount,
sum(amount) totalcount
from orders
where substring(`Date`,1,4) ='2017'
group by substring(`Date`,1,7)
)a
join
(
select
c.month,
count(c.user_id) usercount
from
(
select
substring(`Date`,1,7) month,
user_id
from orders
group by substring(`Date`,1,7),user_id
) c
group by c.month
)b
on a.mount=b.month;
第二种方式去重
select
substring(Date
,1,7) month,
user_id,
row_number() over(partition by substring(Date
,1,7),user_id order by amount) con
from orders
结果是:
Total MapReduce CPU Time Spent: 6 seconds 550 msec
OK
month user_id con
2017-01 1000003251 1
2017-01 1000003251 2
2017-01 1000003252 1
2017-02 1000003251 1
2017-02 1000003251 2
2017-02 1000003253 1
2017-11 100003243 1
2017-11 100003253 1
最终的sql不在写了!(自我补充)
2、小题
select
count(*) as total_num
from
(
select
user_id
from
(
select
user_id,
`date`,
row_number() over(partition by user_id order by `date` desc) shop_count
from second_order
) t1
where subString(`date`,1,7)='2017-11' and shop_count=1
) t2;
结果为:
total_num
2
Time taken: 17.826 seconds, Fetched: 1 row(s)