请查出整个数据集中,那些有过连续3天销售记录的店;
A,2017-10-11,300
A,2017-10-12,200
B,2017-10-11,400
B,2017-10-12,200
A,2017-10-13,100
B,2017-10-15,600
A,2017-10-15,100
C,2017-10-11,350
A,2017-10-16,300
C,2017-10-13,250
A,2017-10-17,150
C,2017-10-14,300
C,2017-10-15,400
A,2017-10-18,340
A,2017-10-19,360
C,2017-10-16,200
D,2017-10-13,500
E,2017-10-14,600
E,2017-10-15,500
D,2017-10-14,600
思路:
按name分组,dt进行排序,得到rank(1,2,3,4…),使用date_sub(string date,int i)获取new_dt=dt-rank的值,
按new_dt进行分组查询,统计count,若cont>=3,则有过连续三天的销售记录
创建表,并导入数据
create table u_homework(name string,dt string,records int) row format delimited fields terminated by ',';
load data local inpath '/root/txt/homework.txt' into table u_homework;
1)按name分组,dt进行排序,得到rank(1,2,3,4…)
select name,dt,records,
row_number() over(partition by name order by dt) as rank
from u_homework;
+-------+-------------+----------+-------+--+
| name | dt | records | rank |
+-------+-------------+----------+-------+--+
| A | 2017-10-11 | 300 | 1 |
| A | 2017-10-12 | 200 | 2 |
| A | 2017-10-13 | 100 | 3 |
| A | 2017-10-15 | 100 | 4 |
| A | 2017-10-16 | 300 | 5 |
| A | 2017-10-17 | 150 | 6 |
| A | 2017-10-18 | 340 | 7 |
| A | 2017-10-19 | 360 | 8 |
| B | 2017-10-11 | 400 | 1 |
| B | 2017-10-12 | 200 | 2 |
| B | 2017-10-15 | 600 | 3 |
| C | 2017-10-11 | 350 | 1 |
| C | 2017-10-13 | 250 | 2 |
| C | 2017-10-14 | 300 | 3 |
| C | 2017-10-15 | 400 | 4 |
| C | 2017-10-16 | 200 | 5 |
| D | 2017-10-13 | 500 | 1 |
| D | 2017-10-14 | 600 | 2 |
| E | 2017-10-14 | 600 | 1 |
| E | 2017-10-15 | 500 | 2 |
+-------+-------------+----------+-------+--+
2)获取new_dt
select name,records,dt,rank,date_sub(dt,rank) as new_dt
from
(select name,dt,records,
row_number() over(partition by name order by dt) as rank
from u_homework) tmp;
+-------+----------+-------------+-------+-------------+--+
| name | records | dt | rank | new_dt |
+-------+----------+-------------+-------+-------------+--+
| A | 300 | 2017-10-11 | 1 | 2017-10-10 |
| A | 200 | 2017-10-12 | 2 | 2017-10-10 |
| A | 100 | 2017-10-13 | 3 | 2017-10-10 |
| A | 100 | 2017-10-15 | 4 | 2017-10-11 |
| A | 300 | 2017-10-16 | 5 | 2017-10-11 |
| A | 150 | 2017-10-17 | 6 | 2017-10-11 |
| A | 340 | 2017-10-18 | 7 | 2017-10-11 |
| A | 360 | 2017-10-19 | 8 | 2017-10-11 |
| B | 400 | 2017-10-11 | 1 | 2017-10-10 |
| B | 200 | 2017-10-12 | 2 | 2017-10-10 |
| B | 600 | 2017-10-15 | 3 | 2017-10-12 |
| C | 350 | 2017-10-11 | 1 | 2017-10-10 |
| C | 250 | 2017-10-13 | 2 | 2017-10-11 |
| C | 300 | 2017-10-14 | 3 | 2017-10-11 |
| C | 400 | 2017-10-15 | 4 | 2017-10-11 |
| C | 200 | 2017-10-16 | 5 | 2017-10-11 |
| D | 500 | 2017-10-13 | 1 | 2017-10-12 |
| D | 600 | 2017-10-14 | 2 | 2017-10-12 |
| E | 600 | 2017-10-14 | 1 | 2017-10-13 |
| E | 500 | 2017-10-15 | 2 | 2017-10-13 |
+-------+----------+-------------+-------+-------------+--+
3)分组查询,获取count(*)
select name,count(*) as counts from
(select name,records,dt,rank,date_sub(dt,rank) as new_dt
from
(select name,dt,records,
row_number() over(partition by name order by dt) as rank
from u_homework) tmp) tmp2
group by name,new_dt;
+-------+------+--+
| name | _c1 |
+-------+------+--+
| A | 3 |
| A | 5 |
| B | 2 |
| B | 1 |
| C | 1 |
| C | 4 |
| D | 2 |
| E | 2 |
+-------+------+--+
4)获取count>=3的数据,并去重
select distinct tmp3.name from
(select name,count(*) as counts from
(select name,records,dt,rank,date_sub(dt,rank) as new_dt
from
(select name,dt,records,
row_number() over(partition by name order by dt) as rank
from u_homework) tmp) tmp2
group by name,new_dt) tmp3 where counts>=3;
+------------+--+
| tmp3.name |
+------------+--+
| A |
| C |
+------------+--+