类别 | 解释 |
DDL(数据定义语言,Data Definition Language) | 对数据库的操作:创建、修改数据库 |
对数据表的操作:内部表、外部表、分区表、分桶表 | |
DQL(数据查询语言,Data Query Language) | 单表查询、多表查询 |
常用函数:聚合函数、条件函数、日期函数、字符串函数等 | |
高级函数:多行合并为一行,一行拆分为多行 | |
窗口函数、增强聚合函数 | |
DML(数据操纵语言,Data Manipulation Language) | insert into |
insert overwrite | |
update,不支持 | |
delete,不支持 | |
DCL(数据控制语言,Data Control Language) | grant,授权 |
revoke,撤销 |
一、hive的DDL语法
1.数据库的操作
-
创建数据库
create database if not exists 库名 comment '数据库注释';
-
修改数据库
-- 切换数据库
use 库名;
-- 展示所有库
show databases;
-- 展示当前库下所有表
show tables;
-- 修改数据库
alter database 库名 set dbproperties('属性名称'='属性值'); -- 不支持修改库名
-
查看数据库的详细信息
-- 查看数据库的基本信息
desc database 库名;
-- 查看数据库的详细信息
desc database extended 库名;
-
删除数据库
drop database 库名; -- 只能删除空白库(数据库中没有表),库内存在数据表会报错
drop database 库名 cascade; -- 强制删除数据库(不管库内是否存在数据表)
2.数据表的操作
2.1内部表的操作
-
创建内部表
-- Create a managed (internal) table; dropping it deletes both metadata and data.
create table if not exists test.student(id int, name string);
-- Insert a single row (Hive requires the keyword VALUES; VALUE is not accepted).
insert into test.student values(1,'zhangsan');
-- Insert multiple rows in one statement.
insert into test.student values(2,'lisi'),(3,'wangwu');
-- Append rows to an existing table from a query.
insert into test.student_new select * from test.student;
-- Create a new table from a query result (CTAS).
create table test.student_select as select * from test.student;
-- Create an empty table with the same structure as an existing one.
create table test.student_like like test.student;
-- 查看表的基本信息
desc 表名;
-- 查看表的详细信息
desc formatted 表名;
-- 查看表的建表语句
show create table 表名;
2.2外部表的操作
-
创建外部表
-- Create an external table; dropping it removes only the metadata,
-- the underlying data files on HDFS are kept.
create external table if not exists test.student(
    id int,
    name string)
row format delimited
fields terminated by ',';
-
从本地文件系统加载数据
-- 从本地加载数据,追加操作
load data local inpath '服务器数据表路径'
into table 库名.表名;
-- 从本地加载数据,覆盖操作
load data local inpath '服务器数据表路径'
overwrite into table 库名.表名;
-
从hdfs文件系统加载数据
-- 将本地文件上传至hdfs
hadoop fs -put 数据文件路径 目标路径 -- 在FinalShell中输入运行
-- 修改文件夹的所有者
hadoop fs -chown -R root 目标路径 -- 在FinalShell中输入运行
-- 从hdfs加载数据,追加操作
load data inpath '文件路径'
into table 库名.表名
-- 从hdfs加载数据,覆盖操作
load data inpath '文件路径'
overwrite into table 库名.表名
-- 注意:从分布式文件系统中加数据会移动文件的位置
2.3分区表的操作
-
创建分区表,一个分区
create table 库名.表名(字段名 字段格式, ...)
partitioned by (分区名 string)
row format delimited
fields terminated by '分隔符';
-- 从本地加载数据,并指定分区(文件夹)
load data local inpath '数据文件地址'
into table 库名.表名 partition(分区名='生成的文件夹名');
-
创建分区表,多个分区
create table 库名.表名(字段名 字段格式, ...)
partitioned by (分区名1 string, 分区名2 string)
row format delimited
fields terminated by '分隔符';
-- 从本地加载数据,并指定分区(文件夹)
load data local inpath '数据文件地址'
into table 库名.表名 partition(分区名1='生成的文件夹名1', 分区名2='生成的文件夹名2'); -- 生成二级文件目录
-
查看分区/添加分区/删除分区
-- 查看分区
show partitions 库名.表名;
-- 添加一个分区(表下新增一个文件夹)
alter table 库名.表名 add partition(分区名1='生成的文件夹名');
-- 添加多个分区(表下新增多个文件夹)
alter table 库名.表名
add partition(分区名1='生成的文件夹名1')
add partition(分区名2='生成的文件夹名2');
-- 删除分区
alter table 库名.表名 drop partition(分区名='文件夹名')
2.4分桶表的操作
-- 开启 hive 的桶表功能
set hive.enforce.bucketing=true;
-- 创建分桶表
create table 库名.表名(字段名 字段格式, ...)
clustered by (分桶字段名) into 分为几个 buckets;
-- 分桶表的数据加载
-- 通过 hadoop fs -put 或 load data 加载,没有分桶效果
-- 所以只能通过 insert overwrite 加载
-- 必须先创建普通表,再通过 insert overwrite 将普通表的数据加载到分桶表
-- 创建普通表
create table 库名.表名(字段名 字段格式, ...)
row format delimited
fields terminated by '字段分隔符';
-- 将数据加载到普通表
load data local inpath '数据表地址'
into table 库名.表名;
-- 将数据从普通表加载到分桶表
insert overwrite table 库名.表名(分桶表)
select * from 库名.表名(普通表)
cluster by (分桶字段)
2.5修改表和删除表
-- 修改表名称
alter table 库名.表名 rename to 库名.新表名;
-- 新增表字段
alter table 库名.表名 add columns(新字段名 字段格式, ...);
-- 修改表的字段类型
alter table 库名.表名 change column 字段名 新字段名(可以不修改) 字段新类型;
-- 删除表的字段,通过替换来删除,不支持 drop column
alter table 库名.表名 replace columns(在此重新定义表的字段,不定义要删除的字段);
-- 删除表(外部表只会删除元数据不会删除数据文件)
drop table 库名.表名;
-- 清空表数据
truncate table 库名.表名;
2.6表数据加载
-
通过insert into方式,向分区表加载数据
-- 创建分区表
create table 库名.表名(字段名 字段类型, ...)
partitioned by (分区名 数据类型,如string)
row format delimited
fields terminated by ',';
-- 追加更新
insert into 库名.表名 partition(分区名='分区条件值') values(值1,值2, ...);
-- 覆盖更新
insert overwrite table 库名.表名 partition(分区名='分区条件值') values(值1,值2, ...);
-
通过load data方式,向分区表加载数据
-- 从本地加载数据,并指定分区(文件夹)
load data local inpath '数据文件地址'
into table 库名.表名 partition(分区名1='生成的文件夹名1', 分区名2='生成的文件夹名2'); -- 生成二级文件目录
-
通过查询方式,向分区表加载数据
-- 追加更新
insert into table 库名.表名 partition(分区名='分区条件值')
select 字段1,字段2,字段3, ... from 库名1.表名1;
-- 覆盖更新
insert overwrite table 库名.表名 partition(分区名='分区条件值')
select 字段1,字段2,字段3, ... from 库名1.表名1;
-
通过location方式,从hdfs加载数据(外部表)
-- 创建外部表,external 关键字
create external table if not exists 库名.表名(字段1 字段格式1, ...)
row format delimited
fields terminated by ','
-- 指定 hdfs 存储位置(location 是建表语句的子句,不能单独执行)
location '目标地址';
-
表的export导出与import导入(内部表)
-- 创建目录
hadoop fs -mkdir 创建地址
-- 更改文件的归属
hadoop fs -chown -R root 文件地址
-- export 导出表数据到hdfs
export table 库名.表名 to '目标地址';
-- import 导入数据
-- 必须是export导出的数据,目录下必须得有元数据文件和数据文件
import table 库名.表名 from '目标地址';
2.7表数据导出
-
insert导出
-- 将查询结果导出至本地
insert overwrite local directory '目标地址'
select * from 库名.表名;
-- 将查询结果格式化后导出至本地
insert overwrite local directory '目标地址'
row format delimited fields terminated by '分隔符,如\t'
select * from 库名.表名;
-- 将查询结果格式化后导出至 hdfs
insert overwrite directory '目标地址'
row format delimited fields terminated by '分隔符,如\t'
select * from 库名.表名;
-
Hadoop shell命令导出到本地
hadoop fs -get 目标源表路径 本地路径(导出的目标路径)
-
Hive shell命令导出到本地
hive -e "select * from 库名.表名;" > 导出目标文件路径
二、hive的DQL语法
1.单表查询
select all | distinct 字段1,字段2, ...
from 库名.表名
join 库名1.表名1 on 连接条件
where 分组前筛选
group by 分组字段
having 分组后筛选
cluster by 分区及局部排序字段 | distribute by 分区字段
sort by 局部排序字段 | order by 全局排序字段
limit 限制行数
-- cluster by 和 distribute by 二选一
-- sort by 和 order by 二选一
-- cluster by 具有 distribute by 分区功能,也具有sort by 局部排序的功能
-- cluster by 不能指定排序规则,只能升序
-- distribute by 和 sort by 为同一字段,那么 cluster by 等价于 distribute by + sort by
-
where 筛选
select *
from 库名.表名
where 筛选条件 ;
-
group by 分组
select *
from 库名.表名
group by 分组字段
having 筛选条件 ;
2.多表查询
-
join 连接
-- 内连接
select *
from 库名.表名 as a
inner join 库名1.表名1 as b on a.字段1 = b.字段2
-- 左连接
select *
from 库名.表名 as a
left join 库名1.表名1 as b on a.字段1 = b.字段2
-- 右连接
select *
from 库名.表名 as a
right join 库名1.表名1 as b on a.字段1 = b.字段2
-- 外连接(注意:Hive 不支持单独的 outer join,外连接需写作 full outer join)
select *
from 库名.表名 as a
full outer join 库名1.表名1 as b on a.字段1 = b.字段2
-- 全连接
select *
from 库名.表名 as a
full join 库名1.表名1 as b on a.字段1 = b.字段2
-
order by 全局排序
order by 排序字段 asc | desc
-- 全局排序,无论设置reduce个数为多少,都只有1个reduce执行排序
-
sort by 局部排序
sort by 排序字段 asc | desc
-- 设置reduce个数为3,3个reduce内部排序,对全局数据集来说是无序的
-- 设置reduce个数为1,局部排序和全局排序结果一致
-
distribute by + sort by 分区排序
-- 设置reduce个数为5
set mapreduce.job.reduces=5;
-- 查看reduce个数
set mapreduce.job.reduces;
select * from score distribute by c_id sort by s_score asc;
-- 设置reduce个数为5,将每个c_id划分到对应reduce中
-- 5个分区内部排序,对全局数据集来说是无序的
-
cluster by 分区排序
-- 设置reduce个数为1
set mapreduce.job.reduces=1;
-- 查看reduce个数
set mapreduce.job.reduces;
select * from score distribute by c_id sort by s_score;
select * from score order by s_score;
-- 设置reduce个数为1,分区排序和全局排序结果一致
三、窗口函数
1.多表关联
-- 题目:统计各产品子类的退货数量、退货率
-- Build a wide denormalized order table by left-joining order detail with
-- order/customer/product/region/return info.
-- Hive CTAS requires the AS keyword before the query (it was missing).
create table sm.sm_order_total as
-- NOTE(review): select * over these joins repeats the join-key columns
-- (order_id, product_id, ...), and Hive CTAS rejects duplicate column names —
-- an explicit column list is safer. TODO confirm against the actual schemas.
select *
from sm.sm_order_detail sod
left join sm.sm_order_info soi on soi.order_id = sod.order_id
left join sm.sm_customer_info sci on sci.customer_id = soi.customer_id
left join sm.sm_product_info spi on spi.product_id = sod.product_id
left join sm.sm_region_info sri on sri.region_id = soi.region_id
left join sm.sm_return_info sri2 on sri2.order_id = sod.order_id
    and sri2.product_id = sod.product_id;
-- Per product subclass: units sold, units returned, and return rate.
-- A non-null return_date marks a returned line item.
select
    t.subclass as `产品子类`
    , sum(t.quantity) as `销售数量`
    , sum(case when t.return_date is not null then t.quantity else 0 end) as `退货数量`
    , sum(case when t.return_date is not null then t.quantity else 0 end) / sum(t.quantity) as `退货率`
from sm.sm_order_total t
group by t.subclass;
2.不等值筛选
-- 题目:筛选年龄不等于30的订单
-- 比较运算符的筛选不包含 null
select *
from sm.sm_order_total
where age <> 30 or age is null;
select *
from sm.sm_order_total
where nvl(age, '') <> '30';
select *
from sm.sm_order_total
where COALESCE(age, gender, '') <> '30';
3.分组聚合_数值
-- 以区域、省份为分组,统计销售数量,销售金额,利润金额,并以销售金额降序
-- region|province|quantity|amount |profit |
-- ------+--------+--------+----------+---------+
-- 华东 |山东 | 3460|1380451.11| 260285.3|
-- 华东 |江苏 | 2070| 713635.19|-17808.62|
-- 华东 |浙江 | 1636| 632794.95| 1322.52|
-- Per region and province: total quantity, sales amount, and profit,
-- sorted by sales amount descending (matches the sample output above:
-- region | province | quantity | amount | profit).
select
    t.region
    , t.province
    , sum(t.quantity) as quantity
    , sum(t.quantity * t.price) as amount
    , sum(t.profit) as profit
    -- profit margin = profit / amount (the original alias 利用率
    -- "utilization rate" mislabeled this metric)
    , sum(t.profit) / sum(t.quantity * t.price) as `利润率`
from sm.sm_order_total t
group by t.region, t.province
-- Hive's order by can reference select-list aliases
order by amount desc;
4.分组聚合_文本
-- 以产品大类为分组,统计销售数量,同时聚合产品子类(去重,多行转一行)
-- collect_set,去重汇总,返回array数据类型
-- collect_list,不去重汇总,返回array数据类型
-- Hive不支持group_concat()函数
-- MySQL支持group_concat()函数
-- Hive支持concat_ws()函数
-- Hive的文本聚合
select
t.category
, concat_ws(',',collect_set(t.subclass)) as subclass
, sum(t.quantity) as quantity
from sm.sm_order_total t
group by t.category;
-- MySQL的文本聚合
select
sot.category as `产品大类`
, group_concat(
distinct sot.subclass
order by sot.subclass desc
separator '/') as `产品子类`
, sum(sot.quantity) as `销售数量`
from db.sm_order_total sot
group by sot.category
-- 产品大类 产品子类 销售数量
-- 办公 用品,收纳具,装订机,系固件,纸张,器具,标签,信封,美术 21690
-- 家具 椅子,用具,书架,桌子 8029
-- 技术 设备,配件,电话,复印机 7655
5.数据透视
-- 以区域为分组,统计各产品大类的销售数量
select
t.region
, sum(case when t.category = '办公' then t.quantity else 0 end) as `办公`
, sum(case when t.category = '技术' then t.quantity else 0 end) as `技术`
, sum(case when t.category = '家具' then t.quantity else 0 end) as `家具`
from sm.sm_order_total t
group by t.region;
-- 以区域为分组,统计各产品大类的订单编号数量(去重)
select
t.region
, count(distinct case when t.category = '办公' then t.order_id else null end) as `办公`
, count(distinct case when t.category = '技术' then t.order_id else null end) as `技术`
, count(distinct case when t.category = '家具' then t.order_id else null end) as `家具`
from sm.sm_order_total t
group by t.region;
6.分组排名
-- 统计各产品大类中销售金额最高的产品子类及对应销售金额
-- 方法一
select t1.*
from
(select
t.category
, t.subclass
, round(sum(t.quantity * t.price),2) as amount
from sm.sm_order_total t
group by t.category , t.subclass
) as t1
, (
select t1.category, max(t1.amount) as amount
from
(select
t.category
, t.subclass
, round(sum(t.quantity * t.price),2) as amount
from sm.sm_order_total t
group by t.category , t.subclass
) as t1
group by t1.category) as t2
where t1.category = t2.category and t1.amount = t2.amount;
-- 方法二
select category, subclass,amount
from
(select t1.category, t1.subclass, t1.amount
, row_number() over(partition by t1.category order by t1.amount desc) as rn
from
(select
t.category
, t.subclass
, round(sum(t.quantity * t.price),2) as amount
from sm.sm_order_total t
group by t.category , t.subclass
) as t1
) as t2
where rn = 1;
-- category|subclass|amount |
-- --------+--------+----------+
-- 办公 |器具 |1887456.34|
-- 家具 |椅子 | 2047331.7|
-- 技术 |复印机 |1847595.59|
7.窗口函数——聚合
-- 窗口函数() over(partition by 分组字段 order by 排序字段 rows|range between ... and ...)
-- 聚合函数:count、sum、max、min、avg
-- unbounded preceding,首行
-- unbounded following,尾行
-- current row,当前行
-- 3 preceding,前3行
-- 3 following,后3行
-- 分组字段:订单月份,排序字段:订单日期,指标:销售数量求和
-- 窗口范围:
-- 1.组内所有行
-- 2.组内首行至当前行
-- 3.组内当前行至尾行: rows between current row and unbounded following
-- 4.组内当前行的前3行到当前行: rows between 3 preceding and current row
-- 5.组内当前行到当前行的后3行: rows between current row and 3 following
-- 6.组内当前行的前3行到当前行的后3行: rows between 3 preceding and 3 following
select
t.order_month, t.order_date, t.quantity
, sum(t.quantity)over(partition by t.order_month) as `组内所有行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
) as `组内首行至当前行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
rows between current row and unbounded following
) as `组内当前行至尾行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
rows between 3 preceding and current row
) as `组内当前行的前3行到当前行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
rows between current row and 3 following
) as `组内当前行到当前行的后3行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
rows between 3 preceding and 3 following
) as `组内当前行的前3行到当前行的后3行`
from sm.sm_order_month t;
-- 创建表
create table sm.sm_order_month as
select
substring(order_date, 1, 7) as order_month
, order_date
, sum(quantity) as quantity
from sm.sm_order_total
group by substring(order_date, 1, 7), order_date
order by order_date asc;
-- 窗口函数的其他写法
-- 写在order by后面, limit前面
-- hive支持, mysql不支持
select
t.order_month, t.order_date, t.quantity
, sum(t.quantity)over w as `sum组内所有行`
, max(t.quantity)over w as `max组内所有行`
from sm.sm_order_month t
where order_month = '2017-01'
order by order_date
window w as (partition by t.order_month)
limit 10;
-- 窗口范围的range写法,主要针对时间序列/数值
-- 时间序列/数值连续:rows或range
-- 时间序列/数值不连续:range
select
t.order_month, t.order_date, t.quantity
, sum(t.quantity)over(
partition by order_month
order by order_date
rows between 2 preceding and current row) as `近3天销量rows`
, sum(t.quantity)over(
partition by order_month
order by order_date
range between 2 preceding and current row) as `近3天销量range`
from sm.sm_order_month t;
-- MySQL
-- range between interval 3 day preceding and interval 3 day following
-- 开始节点:当前行的日期-3天,结束节点:当前行的日期+3天,最多7行
8.窗口函数——排序
-- row_number、rank、dense_rank
-- 应用场景:分组排序,生成序号
-- row_number:生成唯一的序号,序号不会并列,不会出现空缺
-- rank:会出现并列名次,当出现并列名次时下位名次会进行空缺,例如:1 2 2 4
-- dense_rank:并列名次时下位是连续的,例如:1 2 2 3
select
t.region ,t.province ,t.quantity
, row_number() over(partition by t.region order by t.quantity desc) as `row_number`
, rank() over(partition by t.region order by t.quantity desc) as `rank`
, dense_rank() over(partition by t.region order by t.quantity desc) as `dense_rank`
from
(
select
region ,province ,round(sum(quantity), -2) as quantity
from sm.sm_order_total
group by region ,province
order by region ,province
) as t;
9.窗口函数——分布
-- 百分比排名
-- Hive不支持Window窗口范围子句
-- cume_dist: 小于等于当前行rank值的行数 / 总行数
-- percent_rank:(当前行rank值-1) / (总行数-1)
select
t.region ,t.province ,t.quantity
, rank() over(partition by t.region order by t.quantity desc) as `rank`
, count(province) over(partition by t.region) as `总数`
, cume_dist() over(partition by t.region order by t.quantity desc) as `cume_dist`
, percent_rank() over(partition by t.region order by t.quantity desc) as `percent_rank`
from
(
select
region ,province ,round(sum(quantity), -2) as quantity
from sm.sm_order_total
group by region ,province
order by region ,province
) as t;
10.窗口函数——偏移(相对位置)
-- 应用场景:同比、环比增长率
-- lead(),超前,向前(未来)偏移/向下偏移
-- lag(),滞后,向后(过去)偏移/向上偏移
-- 第一个参数:偏移的字段;第二个参数:偏移的行数;第三个参数(可选):如果返回值为空,以什么值来填充
select
t.order_month as `订单月份`
, t.quantity as `当月销量`
, lag(t.quantity,1, 0) over(order by t.order_month) as `上月销量`
, lag(t.quantity,12, 0) over(order by t.order_month) as `去年同期销量`
, (t.quantity / lag(t.quantity,1, 0) over(order by t.order_month) - 1) as `环比增长率`
, (t.quantity / lag(t.quantity,12, 0) over(order by t.order_month) - 1) as `同比增长率`
from
(
select
order_month, sum(quantity) as quantity
from sm.sm_order_month
group by order_month
) as t;
11.窗口函数——偏移(绝对位置)
-- first_value(expr), 返回首行的值
-- last_value(expr), 返回尾行的值, 默认窗口范围为首行至当前行
-- nth_value(expr, n), 返回指定行的值, 默认窗口范围为首行至当前行
-- Hive不支持nth_value
select
t.category as `产品大类`
, t.subclass as `产品小类`
, t.quantity as `销量`
-- 组内销量排名第一的产品子类
-- , first_value(t.subclass) over(partition by t.category order by t.quantity desc) as `组内销量排名第一的产品子类`
-- , last_value(t.subclass) over(partition by t.category order by t.quantity asc
-- rows between unbounded preceding and unbounded following) as `组内销量排名第一的产品子类`
-- 组内销量排名倒数第一的产品子类
-- , first_value(t.subclass) over(partition by t.category order by t.quantity asc) as `组内销量排名倒数第一的产品子类`
, last_value(t.subclass) over(partition by t.category order by t.quantity desc
rows between unbounded preceding and unbounded following) as `组内销量排名倒数第一的产品子类`
from
(
select
category ,subclass ,sum(quantity) as quantity
from sm.sm_order_total
group by category ,subclass
) as t;
12.窗口函数——分桶
-- 应用场景:数据分组抽样
-- ntile(n),将数据集分为n个桶,返回分桶的序号
select *
from
(
select
t.region, t.province, t.quantity
-- 排序分桶
-- , ntile(2) over(partition by t.region order by t.quantity desc) as ntile_desc_2
-- 随机分桶
, ntile(2) over(partition by t.region order by rand() desc) as ntile_desc_2
from
(
select
region, province, sum(quantity) as quantity
from sm.sm_order_total
group by region,province
) as t
) as t1
where t1.ntile_desc_2 = 1;
13.增强聚合
-- with rollup,多个分组字段,从左到右按层次进行维度组合,包含小计和合计
-- with cube,多个分组字段,所有维度组合,MySQL暂不支持
-- grouping sets(字段1, (字段1, 字段2)),自定义维度组合,MySQL暂不支持
-- grouping__id,维度组合的ID,MySQL暂不支持
-- group by + rollup
select
region, province ,sum(quantity) as quantity
from sm.sm_order_total
group by region,province with rollup;
-- group by + union
select
region, category ,sum(quantity) as quantity
from sm.sm_order_total
group by region, category
union
select
region, '' as category ,sum(quantity) as quantity
from sm.sm_order_total
group by region
union
select
'' as region, '' as category ,sum(quantity) as quantity
from sm.sm_order_total;
-- group by + cube
select
region, province ,sum(quantity) as quantity
from sm.sm_order_total
group by region,province with cube;
-- group by + grouping sets
select
region, province ,sum(quantity) as quantity
, grouping__id
from sm.sm_order_total
group by region,province grouping sets(region, (region,province));
14.用户连续下单天数
-- 统计每个用户最长的连续下单天数
-- Longest consecutive-order streak per customer ("gaps and islands").
-- Step 0: deduplicate to one row per (customer_id, order_date).
create table sm.sm_customer_id_order_date as
select
distinct customer_id ,order_date
from sm.sm_order_total;
-- Step 3: the longest streak is the largest group size per customer.
select
t2.customer_id, max(days) as max_days
from
(
select
t1.customer_id
-- Key step: order_date minus its row number is constant within a run of
-- consecutive dates (MySQL form: date_sub(order_date, interval rn day)).
, date_sub(order_date, rn) as date_group
, count(*) as days
from
(
-- Step 1: number each customer's dates in ascending order.
select
customer_id,order_date
, row_number() over(partition by customer_id order by order_date) as rn
from sm.sm_customer_id_order_date
) as t1
-- Step 2: rows sharing a date_group form one consecutive run; count them.
group by t1.customer_id, date_sub(order_date, rn)
) as t2
group by t2.customer_id;