SELECT *
FROM edu.score
distribute BY rand()
sort BY s_score;
# 全局排序
SELECT * FROM eedu.score
ORDER BY rand()
# 按比例抽样
SELECT * FROM edu.score tablesample(10 precent)
# 按大小抽样,单位为字节
SELECT * FROM edu,score tablesample(20 0B)
# ORDER BY 全局排序: 谨慎使用,占用内存空间比较大
SELECT * FROM edu.score
ORDER BY rand()
LIMIT 10
# 分桶抽样: 随机分成10组,抽取其中一组数据
SELECT * FROM edu.score tablesample()
CREATE TABLE test.oerson(
id int,
name string,
--array数组/列表
email arry <string>,
--map字段k-v
score MAP<string,int>
--struct结构体/表单
addr struct<province:string,city:string,city_type:int>
--设置分隔符的语句
ROW format delimited
--字段之间的分隔符为制表符
fields terminated BY '\t'
--符合数据类型: struct\map\array
--符合数据类型字段的各项的分隔符
collection items terminated BY ','
--map键值对key与value的分隔符
MAP keys terminated BY ':'
--行之间的分隔符为换行符
lines terminated BY '\n'
load DATA local inpath 'data/import/person.text'
INTO TABLE test.person;
SELECT
id,
name,
email[0] AS email_1,
email[1] AS email_2,
score['语文'] AS ch,
score['数学'] AS math,
coalesce['英语',0] AS en,
addr.province,
addr.city,
addr.city_type
FROM test.person;
--题目: 统计各产品子类的退货数量.退货率
SELECT * FROM sm.sm_order_detail sod
LEFT join sm.sm_order_info soi ON soi.order_id = sod.order_id
LEFT join sm.sm_customer_info sci ON sci.customer_id = soi.customer_id
LEFT join sm.sm_product_info spi ON spi.product_id = spd.product_id
LEFT JOIN sm.sm_region_info sri ON sri.region_id = soi.region_id
LEFT JOIN sm.sm_return_info sri2 ON sri2.order_id = sod.order_id
AND sri2.product_id = sod.product_id
SELECT
t.subclass,
,sum(t.quantity) AS '销售数量'
--条件求值
,sum(CASE WHEN t.return_data IS NOT THEN t.quantity ELSE 0 END) AS '退货数量'
,sum(CASE WHEN t.return_data IS NOT THEN t.quantity ELSE 0 END)/sum(t.quantity)
FROM sm.sm_order_total t
GROUP BY t.subclass;
--题目: 筛选年龄不等于30的订单
--比较运算符的筛选不含null
SELECT *
FROM sm.sm_order_total
WHERE age <> 30 OR age IS NULL;
--以区域.省份分组,统计销售数量,销售金额,利润金额,并以销售金额降序
SELECT
t.region
,t.province
,sum(t.quantity) AS quantity
,sum(t.profit) AS profit
,sum(t.quantity*t.price) AS amount
--错误代码
--,sum(t.quantity*sum(t.price) AS amount
,sum(t.profit)/sum(t.quantity*t.price) AS '利润率'
FROM sm.sm_order_total t
GROUP BY t.region,t.province
-- order by仅支持别名
ORDER BY amount desc
--Hive的文本聚合
SELECT
t.category
,concat_ws(',',collect_set(t.subclass)) AS subclass
,sum(t.quantity) AS quantity
FROM sm.sm_order_total t
GROUP BY t.category
)
--已区域分组,统计各产品大类的销售数量
SELECT *
t.region
,case(CASE WHEN t.category='办公' THEN t.quantity ELSE 0 end) AS '办公'
,case(CASE WHEN t.category='技术' THEN t.quantity ELSE 0 end) AS '技术'
,case(CASE WHEN t.category='家具' THEN t.quantity ELSE 0 end) AS '家具'
FROM sm.sm_order_total t
GROUP BY t.region
--以区域分组,统计各产品大类的订单编号数量(去重)
--distinct: 去重关键子段
SELECT *
t.region
,case(DISTINCT CASE WHEN t.category='办公' THEN t.order_id ELSE null end) AS '办公'
,case(DISTINCT CASE WHEN t.category='技术' THEN t.order_id ELSE null end) AS '技术'
,case(DISTINCT CASE WHEN t.category='家具' THEN t.order_id ELSE null end) AS '家具'
FROM sm.sm_order_total t
GROUP BY t.region
--统计个产品大类中销售金额最高的产品子类及对应销售金额
--group by: 分组
--order by: 排序
SELECT
category,subclass,amount
from(
SELECT
t1.category,t1.subclass,t1.amount
--生成唯一的排名序号
,ROW_number()over(PARTITION BY t1.category ORDER BY t1.amount desc) AS rn
from(
SELECT *
t.category
,t.subclass
,round(sum(t.quantity*t.price),2) AS amount
FROM sm.sm_order_total t
GROUP BY t.category,t.subclass
) AS t1
)WHEN rn=1
--组内所有行
--组内首行至当前行
SELECT
t.ORDER_month,t.order_date,t.quantity
,sum(t.quantity)over(PARTITION BY t.order_month) AS '组内所有行'
,sum(t.quantity)over(PARTITION BY t.order_month ORDER BY t.order_date) AS '组内首行至当前行'
FROM sm.sm_order_month t
--组内当前行到尾行: rows bettween current row and unbounded following
--组内当前行的前3行到当前行: rows between 3 preceding and current row
--组内当前行到当前行的后3行: rows between current row and 3 fellowing
SELECT
t.ORDER_month,t.order_date,t.quantity
,sum(t.quantity)OVER w AS 'sum组内所有行'
,sum(t.quantity)OVER w AS 'max组内所有行'
FROM sm.sm_order_month t
WHERE ORDER_month = '2017-01'
ORDER BY order_date
WINDOW w AS
--row_number rank dense_rank
--应用场景: 分组排序,生成序号
--row_number: 生成唯一的序号,不并列不空缺
--rank: 并列的名次时下位空缺
--dense_rank: 并位名词时下位连续
SELECT
t.region,t.province,t.quantity
,ROW_number()over(PARTITION BY t.region ORDER BY t.quantity desc) AS 'row_number'
,rank()over(PARTITION BY t.region)
SELECT
region,province,round(sum(quantity),-2) AS quantity
FROM sm.sm_order_total
GROUP BY region,province
ORDER BY region,province
--应用场景2: 同比.环比的增长率
--lead(), 超前,向前(未来)偏移/向下(表的下方)偏移
--lag(), 滞后,向后(过去)偏移/向上(表的上方)偏移
--第一个参数: 偏移的字段
--第二个参数: 偏移的行数
--第三个参数(可选): 如果返回值为空,以什么值填充
SELECT *
t.order_month AS '订单月份'
,t.quantity AS '当月销量'
,lag(t.quantity,1,0)over(ORDER BY t.order_month) AS '上月销量'
,lag(t.quantity,12,0)over(ORDER BY t.order_month) AS '去年同期销量'
,(t.quantitylag(t.quantity,1,0)over(ORDER BY t.order_month) - 1) AS '环比增长率'
,(t.quantitylag(t.quantity,12,0)over(ORDER BY t.order_month) - 1) AS '同比增长率'
from(
SELECT *
t.order_month,sum(t.quantity) AS quantity
FROM sm.sm_order_month t
GROUP BY
) AS t
--first_value(exper),返回首行的值
--last_value(exper),返回尾行的值,默认窗口范围首行至当前行
--nth_value(expr,n),返回指定行的值,默认窗口范围为首行至当前行
--Hive不支持nth_value
SELECT
category AS '大类'
,subclass AS '产品小类'
,quantity AS '销量'
--组内销量排名第一名的好产品
,first_value(subclass)over(paetition BY category ORDER BY quantity desc) AS '第一的产品子类'
--组内销量排名倒数第一的产品子类
,first_value(subclass)over(paetition BY category ORDER BY quantity asc) AS '倒数第一的产品子类'
--组内销量排名倒数第一的产品子类
,last_value(subclass)over(paetition BY category
ORDER BY quantity DESC
--rows省略首行至当前行
--实际必须首行至尾行
ROW BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED following) AS '倒数第一的产品子类'
FROM(
SELECT
category,subclass,sum(quantity) AS quantity
FROM (sm.sm_order_total
GROUP BY category,subclass
) AS t
)
--应用场景: 数据分组抽样
--ntile(n), 将数据集分为n个桶,返回分桶的序号
SELECT *
from(
SELECT
t.region,t.province,t.quantity
--排序分桶
,ntile(2)over(PARTITION BY region ORDER BY quantity desc) AS 'ntile_desc_2'
,ntile(2)over(PARTITION BY region ORDER BY rand() desc) AS 'ntile_desc_2'
from(
SELECT
region,provinve,sum(quantity) AS quantity
FROM sm.sm_order_total
GROUP BY region,province
) AS t
) AS t
WHERE t.ntile_desc_2=1
SELECT *
t.region,t.province,sum(t.quantition) AS quantity
FROM sm.sm_order_total t
GROUP BY t.region,t.province WITH ROLLUP;
SELECT *
t.region,t.province,sum(t.quantition) AS quantity
FROM sm.sm_order_total t
GROUP BY t.region,t.province WITH cube;
SELECT *
t.region,t.province,sum(t.quantition) AS quantity
,grouping__id
FROM sm.sm_order_total t
GROUP BY t.region,t.province GROUPING SETS(t.region,(t.region,t.province));
Hive的基本语法
于 2022-10-12 19:36:06 首次发布