类别 | 解释 |
DDL(数据定义语言,Data Definition Language) | 对数据库的操作:创建、修改数据库 |
对数据表的操作:内部表、外部表、分区表、分桶表 | |
DQL(数据查询语言,Data Query Language) | 单表查询、多表查询 |
常用函数:聚合函数、条件函数、日期函数、字符串函数等 | |
高级函数:多行合并为一行,一行拆分为多行 | |
窗口函数、增强聚合函数 | |
DML(数据操纵语言,Data Manipulation Language) | insert into |
insert overwrite | |
update,不支持 | |
delete,不支持 | |
DCL(数据控制语言,Data Control Language) | grant,授权 |
revoke,撤销 |
一、hive的DDL语法
1.数据库的操作
-
创建数据库
create database if not exists 库名 comment '数据库注释';
-
修改数据库
-- 切换数据库
use 库名;
-- 展示所有库
show databases;
-- 展示当前库下所有表
show tables;
-- 修改数据库
alter database 库名 set dbproperties('属性名称'='属性值'); -- 不支持修改库名
-
查看数据库的详细信息
-- 查看数据库的基本信息
desc database 库名;
-- 查看数据库的详细信息
desc database extended 库名;
-
删除数据库
drop database 库名; -- 只能删除空白库(数据库中没有表),库内存在数据表会报错
drop database 库名 cascade; -- 强制删除数据库(不管库内是否存在数据表)
2.数据表的操作
2.1内部表的操作
-
创建内部表
-- Create a managed (internal) table; dropping it deletes both metadata and data.
create table if not exists test.student(id int, name string);
-- Insert a single row (Hive requires the keyword VALUES; VALUE is not accepted).
insert into test.student values(1,'zhangsan');
-- Insert multiple rows in one statement.
insert into test.student values(2,'lisi'),(3,'wangwu');
-- Append rows to an existing table from a query.
insert into test.student_new select * from test.student;
-- Create a new table from a query result (CTAS).
create table test.student_select as select * from test.student;
-- Create an empty table with the same structure as an existing one.
create table test.student_like like test.student;
-- 查看表的基本信息
desc 表名;
-- 查看表的详细信息
desc formatted 表名;
-- 查看表的建表语句
show create table 表名;
2.2外部表的操作
-
创建外部表
-- Create an external table; dropping it removes only the metadata,
-- the underlying data files on HDFS are kept.
create external table if not exists test.student(
    id int,
    name string)
row format delimited
fields terminated by ',';
-
从本地文件系统加载数据
-- 从本地加载数据,追加操作
load data local inpath '服务器数据表路径'
into table 库名.表名;
-- 从本地加载数据,覆盖操作
load data local inpath '服务器数据表路径'
overwrite into table 库名.表名;
-
从hdfs文件系统加载数据
-- 将本地文件上传至hdfs
hadoop fs -put 数据文件路径 目标路径 -- 在FinalShell中输入运行
-- 修改文件夹的所有者
hadoop fs -chown -R root 目标路径 -- 在FinalShell中输入运行
-- 从hdfs加载数据,追加操作
load data inpath '文件路径'
into table 库名.表名
-- 从hdfs加载数据,覆盖操作
load data inpath '文件路径'
overwrite into table 库名.表名
-- 注意:从分布式文件系统中加数据会移动文件的位置
2.3分区表的操作
-
创建分区表,一个分区
create table 库名.表名(字段名 字段格式, ...)
partitioned by (分区名 string)
row format delimited
fields terminated by '分隔符';
-- 从本地加载数据,并指定分区(文件夹)
load data local inpath '数据文件地址'
into table 库名.表名 partition(分区名='生成的文件夹名');
-
创建分区表,多个分区
create table 库名.表名(字段名 字段格式, ...)
partitioned by (分区名1 string, 分区名2 string)
row format delimited
fields terminated by '分隔符';
-- 从本地加载数据,并指定分区(文件夹)
load data local inpath '数据文件地址'
into table 库名.表名 partition(分区名1='生成的文件夹名1', 分区名2='生成的文件夹名2'); -- 生成二级文件目录
-
查看分区/添加分区/删除分区
-- 查看分区
show partitions 库名.表名;
-- 添加一个分区(表下新增一个文件夹)
alter table 库名.表名 add partition(分区名1='生成的文件夹名');
-- 添加多个分区(表下新增多个文件夹)
alter table 库名.表名
add partition(分区名1='生成的文件夹名1')
add partition(分区名2='生成的文件夹名2');
-- 删除分区
alter table 库名.表名 drop partition(分区名='文件夹名')
2.4分桶表的操作
-- 开启 hive 的桶表功能
set hive.enforce.bucketing=true;
-- 创建分桶表
create table 库名.表名(字段名 字段格式, ...)
clustered by (分桶字段名) into 分为几个 buckets;
-- 分桶表的数据加载
-- 通过 hadoop fs -put 或 load data 加载,没有分桶效果
-- 所以只能通过 insert overwrite 加载
-- 必须先创建普通表,再通过 insert overwrite 将普通表的数据加载到分桶表
-- 创建普通表
create table 库名.表名(字段名 字段格式, ...)
row format delimited
fields terminated by '字段分隔符';
-- 将数据加载到普通表
load data local inpath '数据表地址'
into table 库名.表名;
-- 将数据从普通表加载到分桶表
insert overwrite table 库名.表名(分桶表)
select * from 库名.表名(普通表)
cluster by (分桶字段)
2.5修改表和删除表
-- 修改表名称
alter table 库名.表名 rename to 库名.新表名;
-- 新增表字段
alter table 库名.表名 add columns(新字段名 字段格式, ...);
-- 修改表的字段类型
alter table 库名.表名 change column 字段名 新字段名(可以不修改) 字段新类型;
-- 删除表的字段,通过替换来删除,不支持 drop column
alter table 库名.表名 replace columns(在此重新定义表的字段,不定义要删除的字段);
-- 删除表(外部表只会删除元数据不会删除数据文件)
drop table 库名.表名;
-- 清空表数据
truncate table 库名.表名;
2.6表数据加载
-
通过insert into方式,向分区表加载数据
-- 创建分区表
create table 库名.表名(字段名 字段类型, ...)
partitioned by (分区名 数据类型,如string)
row format delimited
fields terminated by ',';
-- 追加更新
insert into 库名.表名 partition(分区名='分区条件值') values(值1,值2, ...);
-- 覆盖更新
insert overwrite table 库名.表名 partition(分区名='分区条件值') values(值1,值2, ...);
-
通过load data方式,向分区表加载数据
-- 从本地加载数据,并指定分区(文件夹)
load data local inpath '数据文件地址'
into table 库名.表名 partition(分区名1='生成的文件夹名1', 分区名2='生成的文件夹名2'); -- 生成二级文件目录
-
通过查询方式,向分区表加载数据
-- 追加更新
insert into table 库名.表名 partition(分区名='分区条件值')
select 字段1,字段2,字段3, ... from 库名1.表名1;
-- 覆盖更新
insert overwrite table 库名.表名 partition(分区名='分区条件值')
select 字段1,字段2,字段3, ... from 库名1.表名1;
-
通过location方式,从hdfs加载数据(外部表)
-- 创建外部表,external 关键字
create external table if not exists 库名.表名(字段1 字段格式1, ...)
row format delimited
fields terminated by ','
-- 指定 hdfs 存储位置(location 是建表语句的子句,不能单独执行)
location '目标地址';
-
表的export导出与import导入(内部表)
-- 创建目录
hadoop fs -mkdir 创建地址
-- 更改文件的归属
hadoop fs -chown -R root 文件地址
-- export 导出表数据到hdfs
export table 库名.表名 to '目标地址';
-- import 导入数据
-- 必须是export导出的数据,目录下必须得有元数据文件和数据文件
import table 库名.表名 from '目标地址';
2.7表数据导出
-
insert导出
-- 将查询结果导出至本地
insert overwrite local directory '目标地址'
select * from 库名.表名;
-- 将查询结果格式化后导出至本地
insert overwrite local directory '目标地址'
row format delimited fields terminated by '分隔符,如\t'
select * from 库名.表名;
-- 将查询结果格式化后导出至 hdfs
insert overwrite directory '目标地址'
row format delimited fields terminated by '分隔符,如\t'
select * from 库名.表名;
-
Hadoop shell命令导出到本地
hadoop fs -get 目标源表路径 本地路径(导出的目标路径)
-
Hive shell命令导出到本地
hive -e "select * from 库名.表名;" > 导出目标文件路径
二、hive的DQL语法
1.单表查询
select all | distinct 字段1,字段2, ...
from 库名.表名
join 库名1.表名1 on 连接条件
where 分组前筛选
group by 分组字段
having 分组后筛选
cluster by 分区及局部排序字段 | distribute by 分区字段
sort by 局部排序字段 | order by 全局排序字段
limit 限制行数
-- cluster by 和 distribute by 二选一
-- sort by 和 order by 二选一
-- cluster by 具有 distribute by 分区功能,也具有sort by 局部排序的功能
-- cluster by 不能指定排序规则,只能升序
-- distribute by 和 sort by 为同一字段,那么 cluster by 等价于 distribute by + sort by
-
where 筛选
select *
from 库名.表名
where 筛选条件 ;
-
group by 分组
select *
from 库名.表名
group by 分组字段
having 筛选条件 ;
2.多表查询
-
join 连接
-- 内连接
select *
from 库名.表名 as a
inner join 库名1.表名1 as b on a.字段1 = b.字段2
-- 左连接
select *
from 库名.表名 as a
left join 库名1.表名1 as b on a.字段1 = b.字段2
-- 右连接
select *
from 库名.表名 as a
right join 库名1.表名1 as b on a.字段1 = b.字段2
-- 外连接(注意:Hive 不支持单独的 outer join,外连接需写作 full outer join)
select *
from 库名.表名 as a
full outer join 库名1.表名1 as b on a.字段1 = b.字段2
-- 全连接
select *
from 库名.表名 as a
full join 库名1.表名1 as b on a.字段1 = b.字段2
-
order by 全局排序
order by 排序字段 asc | desc
-- 全局排序,无论设置reduce个数为多少,都只有1个reduce执行排序
-
sort by 局部排序
sort by 排序字段 asc | desc
-- 设置reduce个数为3,3个reduce内部排序,对全局数据集来说是无序的
-- 设置reduce个数为1,局部排序和全局排序结果一致
-
distribute by + sort by 分区排序
-- 设置reduce个数为5
set mapreduce.job.reduces=5;
-- 查看reduce个数
set mapreduce.job.reduces;
select * from score distribute by c_id sort by s_score asc;
-- 设置reduce个数为5,将每个c_id划分到对应reduce中
-- 5个分区内部排序,对全局数据集来说是无序的
-
cluster by 分区排序
-- 设置reduce个数为1
set mapreduce.job.reduces=1;
-- 查看reduce个数
set mapreduce.job.reduces;
select * from score distribute by c_id sort by s_score;
select * from score order by s_score;
-- 设置reduce个数为1,分区排序和全局排序结果一致
三、窗口函数
1.多表关联
-- 题目:统计各产品子类的退货数量、退货率
-- Build a wide denormalized order table by left-joining order detail with
-- order/customer/product/region/return info.
-- Hive CTAS requires the AS keyword before the query (it was missing).
create table sm.sm_order_total as
-- NOTE(review): select * over these joins repeats the join-key columns
-- (order_id, product_id, ...), and Hive CTAS rejects duplicate column names —
-- an explicit column list is safer. TODO confirm against the actual schemas.
select *
from sm.sm_order_detail sod
left join sm.sm_order_info soi on soi.order_id = sod.order_id
left join sm.sm_customer_info sci on sci.customer_id = soi.customer_id
left join sm.sm_product_info spi on spi.product_id = sod.product_id
left join sm.sm_region_info sri on sri.region_id = soi.region_id
left join sm.sm_return_info sri2 on sri2.order_id = sod.order_id
    and sri2.product_id = sod.product_id;
-- Per product subclass: units sold, units returned, and return rate.
-- A non-null return_date marks a returned line item.
select
    t.subclass as `产品子类`
    , sum(t.quantity) as `销售数量`
    , sum(case when t.return_date is not null then t.quantity else 0 end) as `退货数量`
    , sum(case when t.return_date is not null then t.quantity else 0 end) / sum(t.quantity) as `退货率`
from sm.sm_order_total t
group by t.subclass;
2.不等值筛选
-- 题目:筛选年龄不等于30的订单
-- 比较运算符的筛选不包含 null
select *
from sm.sm_order_total
where age <> 30 or age is null;
select *
from sm.sm_order_total
where nvl(age, '') <> '30';
select *
from sm.sm_order_total
where COALESCE(age, gender, '') <> '30';
3.分组聚合_数值
-- 以区域、省份为分组,统计销售数量,销售金额,利润金额,并以销售金额降序
-- region|province|quantity|amount |profit |
-- ------+--------+--------+----------+---------+
-- 华东 |山东 | 3460|1380451.11| 260285.3|
-- 华东 |江苏 | 2070| 713635.19|-17808.62|
-- 华东 |浙江 | 1636| 632794.95| 1322.52|
-- Per region and province: total quantity, sales amount, and profit,
-- sorted by sales amount descending (matches the sample output above:
-- region | province | quantity | amount | profit).
select
    t.region
    , t.province
    , sum(t.quantity) as quantity
    , sum(t.quantity * t.price) as amount
    , sum(t.profit) as profit
    -- profit margin = profit / amount (the original alias 利用率
    -- "utilization rate" mislabeled this metric)
    , sum(t.profit) / sum(t.quantity * t.price) as `利润率`
from sm.sm_order_total t
group by t.region, t.province
-- Hive's order by can reference select-list aliases
order by amount desc;
4.分组聚合_文本
-- 以产品大类为分组,统计销售数量,同时聚合产品子类(去重,多行转一行)
-- collect_set,去重汇总,返回array数据类型
-- collect_list,不去重汇总,返回array数据类型
-- Hive不支持group_concat()函数
-- MySQL支持group_concat()函数
-- Hive支持concat_ws()函数
-- Hive的文本聚合
select
t.category
, concat_ws(',',collect_set(t.subclass)) as subclass
, sum(t.quantity) as quantity
from sm.sm_order_total t
group by t.category;
-- MySQL的文本聚合
select
sot.category as `产品大类`
, group_concat(
distinct sot.subclass
order by sot.subclass desc
separator '/') as `产品子类`
, sum(sot.quantity) as `销售数量`
from db.sm_order_total sot
group by sot.category
-- 产品大类 产品子类 销售数量
-- 办公 用品,收纳具,装订机,系固件,纸张,器具,标签,信封,美术 21690
-- 家具 椅子,用具,书架,桌子 8029
-- 技术 设备,配件,电话,复印机 7655
5.数据透视
-- 以区域为分组,统计各产品大类的销售数量
select
t.region
, sum(case when t.category = '办公' then t.quantity else 0 end) as `办公`
, sum(case when t.category = '技术' then t.quantity else 0 end) as `技术`
, sum(case when t.category = '家具' then t.quantity else 0 end) as `家具`
from sm.sm_order_total t
group by t.region;
-- 以区域为分组,统计各产品大类的订单编号数量(去重)
select
t.region
, count(distinct case when t.category = '办公' then t.order_id else null end) as `办公`
, count(distinct case when t.category = '技术' then t.order_id else null end) as `技术`
, count(distinct case when t.category = '家具' then t.order_id else null end) as `家具`
from sm.sm_order_total t
group by t.region;
6.分组排名
-- 统计各产品大类中销售金额最高的产品子类及对应销售金额
-- 方法一
select t1.*
from
(select
t.category
, t.subclass
, round(sum(t.quantity * t.price),2) as amount
from sm.sm_order_total t
group by t.category , t.subclass
) as t1
, (
select t1.category, max(t1.amount) as amount
from
(select
t.category
, t.subclass
, round(sum(t.quantity * t.price),2) as amount
from sm.sm_order_total t
group by t.category , t.subclass
) as t1
group by t1.category) as t2
where t1.category = t2.category and t1.amount = t2.amount;
-- 方法二
select category, subclass,amount
from
(select t1.category, t1.subclass, t1.amount
, row_number() over(partition by t1.category order by t1.amount desc) as rn
from
(select
t.category
, t.subclass
, round(sum(t.quantity * t.price),2) as amount
from sm.sm_order_total t
group by t.category , t.subclass
) as t1
) as t2
where rn = 1;
-- category|subclass|amount |
-- --------+--------+----------+
-- 办公 |器具 |1887456.34|
-- 家具 |椅子 | 2047331.7|
-- 技术 |复印机 |1847595.59|
7.窗口函数——聚合
-- 窗口函数() over(partition by 分组字段 order by 排序字段 rows|range between ... and ...)
-- 聚合函数:count、sum、max、min、avg
-- unbounded preceding,首行
-- unbounded following,尾行
-- current row,当前行
-- 3 preceding,前3行
-- 3 following,后3行
-- 分组字段:订单月份,排序字段:订单日期,指标:销售数量求和
-- 窗口范围:
-- 1.组内所有行
-- 2.组内首行至当前行
-- 3.组内当前行至尾行: rows between current row and unbounded following
-- 4.组内当前行的前3行到当前行: rows between 3 preceding and current row
-- 5.组内当前行到当前行的后3行: rows between current row and 3 following
-- 6.组内当前行的前3行到当前行的后3行: rows between 3 preceding and 3 following
select
t.order_month, t.order_date, t.quantity
, sum(t.quantity)over(partition by t.order_month) as `组内所有行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
) as `组内首行至当前行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
rows between current row and unbounded following
) as `组内当前行至尾行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
rows between 3 preceding and current row
) as `组内当前行的前3行到当前行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
rows between current row and 3 following
) as `组内当前行到当前行的后3行`
, sum(t.quantity)over(
partition by t.order_month
order by t.order_date
rows between 3 preceding and 3 following
) as `组内当前行的前3行到当前行的后3行`
from sm.sm_order_month t;
-- 创建表
create table sm.sm_order_month as
select
substring(order_date, 1, 7) as order_month
, order_date
, sum(quantity) as quantity
from sm.sm_order_total
group by substring(order_date, 1, 7), order_date
order by order_date asc;
-- 窗口函数的其他写法
-- 写在order by后面, limit前面
-- hive支持, mysql不支持
select
t.order_month, t.order_date, t.quantity
, sum(t.quantity)over w as `sum组内所有行`
, max(t.quantity)over w as `max组内所有行`
from sm.sm_order_month t
where order_month = '2017-01'
order by order_date
window w as (partition by t.order_month)
limit 10;
-- 窗口范围的range写法,主要针对时间序列/数值
-- 时间序列/数值连续:rows或range
-- 时间序列/数值不连续:range
select
t.order_month, t.order_date, t.quantity
, sum(t.quantity)over(
partition by order_month
order by order_date
rows between 2 preceding and current row) as `近3天销量rows`
, sum(t.quantity)over(
partition by order_month
order by order_date
range between 2 preceding and current row) as `近3天销量range`
from sm.sm_order_month t;
-- MySQL
-- range between interval 3 day preceding and interval 3 day following
-- 开始节点:当前行的日期-3天,结束节点:当前行的日期+3天,最多7行
8.窗口函数——排序
-- row_number、rank、dense_rank
-- 应用场景:分组排序,生成序号
-- row_number:生成唯一的序号,序号不会并列,不会出现空缺
-- rank:会出现并列名次,当出现并列名次时下位名次会进行空缺,例如:1 2 2 4
-- dense_rank:并列名次时下位是连续的,例如:1 2 2 3
select
t.region ,t.province ,t.quantity
, row_number() over(partition by t.region order by t.quantity desc) as `row_number`
, rank() over(partition by t.region order by t.quantity desc) as `rank`
, dense_rank() over(partition by t.region order by t.quantity desc) as `dense_rank`
from
(
select
region ,province ,round(sum(quantity), -2) as quantity
from sm.sm_order_total
group by region ,province
order by region ,province
) as t;
9.窗口函数——分布
-- 百分比排名
-- Hive不支持Window窗口范围子句
-- cume_dist: 小于等于当前行rank值的行数 / 总行数
-- percent_rank:(当前行rank值-1) / (总行数-1)
select
t.region ,t.province ,t.quantity
, rank() over(partition by t.region order by t.quantity desc) as `rank`
, count(province) over(partition by t.region) as `总数`
, cume_dist() over(partition by t.region order by t.quantity desc) as `cume_dist`
, percent_rank() over(partition by t.region order by t.quantity desc) as `percent_rank`
from
(
select
region ,province ,round(sum(quantity), -2) as quantity
from sm.sm_order_total
group by region ,province
order by region ,province
) as t;
10.窗口函数——偏移(相对位置)
-- 应用场景:同比、环比增长率
-- lead(),超前,向前(未来)偏移/向下偏移
-- lag(),滞后,向后(过去)偏移/向上偏移
-- 第一个参数:偏移的字段;第二个参数:偏移的行数;第三个参数(可选):如果返回值为空,以什么值来填充
select
t.order_month as `订单月份`
, t.quantity as `当月销量`
, lag(t.quantity,1, 0) over(order by t.order_month) as `上月销量`
, lag(t.quantity,12, 0) over(order by t.order_month) as `去年同期销量`
, (t.quantity / lag(t.quantity,1, 0) over(order by t.order_month) - 1) as `环比增长率`
, (t.quantity / lag(t.quantity,12, 0) over(order by t.order_month) - 1) as `同比增长率`
from
(
select
order_month, sum(quantity) as quantity
from sm.sm_order_month
group by order_month
) as t;
11.窗口函数——偏移(绝对位置)
-- first_value(expr), 返回首行的值
-- last_value(expr), 返回尾行的值, 默认窗口范围为首行至当前行
-- nth_value(expr, n), 返回指定行的值, 默认窗口范围为首行至当前行
-- Hive不支持nth_value
select
t.category as `产品大类`
, t.subclass as `产品小类`
, t.quantity as `销量`
-- 组内销量排名第一的产品子类
-- , first_value(t.subclass) over(partition by t.category order by t.quantity desc) as `组内销量排名第一的产品子类`
-- , last_value(t.subclass) over(partition by t.category order by t.quantity asc
-- rows between unbounded preceding and unbounded following) as `组内销量排名第一的产品子类`
-- 组内销量排名倒数第一的产品子类
-- , first_value(t.subclass) over(partition by t.category order by t.quantity asc) as `组内销量排名倒数第一的产品子类`
, last_value(t.subclass) over(partition by t.category order by t.quantity desc
rows between unbounded preceding and unbounded following) as `组内销量排名倒数第一的产品子类`
from
(
select
category ,subclass ,sum(quantity) as quantity
from sm.sm_order_total
group by category ,subclass
) as t;
12.窗口函数——分桶
-- 应用场景:数据分组抽样
-- ntile(n),将数据集分为n个桶,返回分桶的序号
select *
from
(
select
t.region, t.province, t.quantity
-- 排序分桶
-- , ntile(2) over(partition by t.region order by t.quantity desc) as ntile_desc_2
-- 随机分桶
, ntile(2) over(partition by t.region order by rand() desc) as ntile_desc_2
from
(
select
region, province, sum(quantity) as quantity
from sm.sm_order_total
group by region,province
) as t
) as t1
where t1.ntile_desc_2 = 1;
13.增强聚合
-- with rollup,多个分组字段,从左到右按层次进行维度组合,包含小计和合计
-- with cube,多个分组字段,所有维度组合,MySQL暂不支持
-- grouping sets(字段1, (字段1, 字段2)),自定义维度组合,MySQL暂不支持
-- grouping__id,维度组合的ID,MySQL暂不支持
-- group by + rollup
select
region, province ,sum(quantity) as quantity
from sm.sm_order_total
group by region,province with rollup;
-- group by + union
select
region, category ,sum(quantity) as quantity
from sm.sm_order_total
group by region, category
union
select
region, '' as category ,sum(quantity) as quantity
from sm.sm_order_total
group by region
union
select
'' as region, '' as category ,sum(quantity) as quantity
from sm.sm_order_total;
-- group by + cube
select
region, province ,sum(quantity) as quantity
from sm.sm_order_total
group by region,province with cube;
-- group by + grouping sets
select
region, province ,sum(quantity) as quantity
, grouping__id
from sm.sm_order_total
group by region,province grouping sets(region, (region,province));
14.用户连续下单天数
-- 统计每个用户最长的连续下单天数
-- Longest consecutive-order streak per customer ("gaps and islands").
-- Step 0: deduplicate to one row per (customer_id, order_date).
create table sm.sm_customer_id_order_date as
select
distinct customer_id ,order_date
from sm.sm_order_total;
-- Step 3: the longest streak is the largest group size per customer.
select
t2.customer_id, max(days) as max_days
from
(
select
t1.customer_id
-- Key step: order_date minus its row number is constant within a run of
-- consecutive dates (MySQL form: date_sub(order_date, interval rn day)).
, date_sub(order_date, rn) as date_group
, count(*) as days
from
(
-- Step 1: number each customer's dates in ascending order.
select
customer_id,order_date
, row_number() over(partition by customer_id order by order_date) as rn
from sm.sm_customer_id_order_date
) as t1
-- Step 2: rows sharing a date_group form one consecutive run; count them.
group by t1.customer_id, date_sub(order_date, rn)
) as t2
group by t2.customer_id;