group by cumsum: SQL, pandas, Spark, Tableau
Let's just get straight to it. First, set up some test data:
drop table if exists group_by_cumsum;
create table if not exists group_by_cumsum(create_time datetime, city varchar(20), sales bigint);
insert into group_by_cumsum values('2018-09-01','上海',1),
('2018-10-01','上海',3),
('2018-11-01','上海',2),
('2018-09-01','北京',1),
('2018-10-01','北京',2),
('2018-11-01','北京',4),
('2018-09-01','纽约',1),
('2018-10-01','纽约',1),
('2018-11-01','纽约',2);
select * from group_by_cumsum order by city,create_time;
1. Plain SQL
Aggregate to monthly totals in a subquery, then apply the window function on top. Note the window needs its own order by: a rows frame without one depends on whatever order the rows happen to arrive in.
select
create_month,
city,
total_sales,
sum(total_sales) over(partition by city order by create_month rows between unbounded preceding and current row) as cum_sales
from(
select
date_format(create_time,'%Y-%m') as create_month,
city,
sum(sales) as total_sales
from group_by_cumsum
group by date_format(create_time,'%Y-%m'),city
)t1
order by city,create_month;
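For reference, these are the running totals implied by the data above, worked out by hand. Both this query and the next one should return them (the relative order of the city blocks depends on your collation):
create_month  city  total_sales  cum_sales
2018-09  上海  1  1
2018-10  上海  3  4
2018-11  上海  2  6
2018-09  北京  1  1
2018-10  北京  2  3
2018-11  北京  4  7
2018-09  纽约  1  1
2018-10  纽约  1  2
2018-11  纽约  2  4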
2. Window function on top of GROUP BY
Window functions are evaluated after the GROUP BY, so sum(sum(sales)) over(...) is legal and the subquery can be dropped:
select
date_format(create_time,'%Y-%m') as create_month,
city,
sum(sum(sales)) over(partition by city order by date_format(create_time,'%Y-%m') asc) as cum_sales
from group_by_cumsum
group by date_format(create_time,'%Y-%m'),city
order by city,create_month;
3. pandas
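A minimal sketch of the same data as a DataFrame; the column names date, city, and sales are assumptions that mirror the SQL table:
import pandas as pd

# hypothetical frame matching the SQL table above, already at monthly grain
df = pd.DataFrame({
    'date': ['2018-09', '2018-10', '2018-11'] * 3,
    'city': ['上海'] * 3 + ['北京'] * 3 + ['纽约'] * 3,
    'sales': [1, 3, 2, 1, 2, 4, 1, 1, 2],
})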
# aggregate to monthly totals per city, then take the running sum within each city
monthly = df.groupby(['date', 'city'], as_index=False)['sales'].sum()
monthly['cum_sales'] = monthly.groupby('city')['sales'].cumsum()
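Because groupby sorts its keys, the rows come back in date order within each city, so the per-city cumsum is a true running total; printing monthly should reproduce the table shown after the SQL queries.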
4. Spark
#pyspark group by cumsum
from pyspark.sql import Window
from pyspark.sql import functions as F
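# assumed setup: an active SparkSession `spark`, loading the same data as above
df = spark.createDataFrame(
    [('2018-09', '上海', 1), ('2018-10', '上海', 3), ('2018-11', '上海', 2),
     ('2018-09', '北京', 1), ('2018-10', '北京', 2), ('2018-11', '北京', 4),
     ('2018-09', '纽约', 1), ('2018-10', '纽约', 1), ('2018-11', '纽约', 2)],
    ['date', 'city', 'sales'])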
# frame from the start of each city's partition up to the current row
windowval = (Window.partitionBy('city').orderBy('date')
             .rangeBetween(Window.unboundedPreceding, Window.currentRow))
df_w_cumsum = df.withColumn('cum_sales', F.sum('sales').over(windowval))
df_w_cumsum.show()
5. Tableau
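In Tableau this is a table calculation rather than a query. A minimal sketch, assuming a [Sales] measure with the month on one axis and the city on the other: create a calculated field

RUNNING_SUM(SUM([Sales]))

and set its Compute Using so the running sum advances along the month and restarts for each city.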