Hive的基本语法

SELECT * 
FROM edu.score
distribute BY rand()
sort BY s_score;

# 全局排序
SELECT * FROM eedu.score
ORDER BY rand()

# 按比例抽样
SELECT * FROM edu.score tablesample(10 precent)

# 按大小抽样,单位为字节
SELECT * FROM edu,score tablesample(20 0B)

# ORDER BY 全局排序: 谨慎使用,占用内存空间比较大
SELECT * FROM edu.score
ORDER BY rand()
LIMIT 10

# 分桶抽样: 随机分成10组,抽取其中一组数据
SELECT * FROM edu.score tablesample()




CREATE TABLE test.oerson(
	id int,
	name string,
	--array数组/列表
	email arry <string>,
	--map字段k-v
	score MAP<string,int>
	--struct结构体/表单
	addr struct<province:string,city:string,city_type:int>
--设置分隔符的语句
ROW format delimited

--字段之间的分隔符为制表符
fields terminated BY '\t'

--符合数据类型: struct\map\array
--符合数据类型字段的各项的分隔符
collection items terminated BY ','

--map键值对key与value的分隔符
MAP keys terminated BY ':'

--行之间的分隔符为换行符
lines terminated BY '\n'

load DATA local inpath 'data/import/person.text'
INTO TABLE test.person;

SELECT 
	id,
	name,
	email[0] AS email_1,
	email[1] AS email_2,
	score['语文'] AS ch,
	score['数学'] AS math,
	coalesce['英语',0] AS en,
	addr.province,
	addr.city,
	addr.city_type
FROM test.person;

--题目: 统计各产品子类的退货数量.退货率
SELECT * FROM sm.sm_order_detail sod
LEFT join sm.sm_order_info soi ON soi.order_id = sod.order_id
LEFT join sm.sm_customer_info sci ON sci.customer_id = soi.customer_id
LEFT join sm.sm_product_info spi ON spi.product_id = spd.product_id
LEFT JOIN sm.sm_region_info sri ON sri.region_id = soi.region_id
LEFT JOIN sm.sm_return_info sri2 ON sri2.order_id = sod.order_id
	AND sri2.product_id = sod.product_id


SELECT 
	t.subclass,
	,sum(t.quantity) AS '销售数量'
	--条件求值
	,sum(CASE WHEN t.return_data IS NOT THEN t.quantity ELSE 0 END) AS '退货数量'
	,sum(CASE WHEN t.return_data IS NOT THEN t.quantity ELSE 0 END)/sum(t.quantity)
FROM sm.sm_order_total t
GROUP BY t.subclass;

--题目: 筛选年龄不等于30的订单
--比较运算符的筛选不含null
SELECT *
FROM sm.sm_order_total
WHERE age <> 30 OR age IS NULL;

--以区域.省份分组,统计销售数量,销售金额,利润金额,并以销售金额降序
SELECT 
	t.region
	,t.province
	,sum(t.quantity) AS quantity
	,sum(t.profit) AS profit
	,sum(t.quantity*t.price) AS amount
	--错误代码
	--,sum(t.quantity*sum(t.price) AS amount
	,sum(t.profit)/sum(t.quantity*t.price) AS '利润率'
FROM sm.sm_order_total t
GROUP BY t.region,t.province
-- order by仅支持别名
ORDER BY amount desc

--Hive的文本聚合
SELECT
	t.category
	,concat_ws(',',collect_set(t.subclass)) AS subclass
	,sum(t.quantity) AS quantity
FROM sm.sm_order_total t
GROUP BY t.category
)


--已区域分组,统计各产品大类的销售数量
SELECT * 
	t.region
	,case(CASE WHEN t.category='办公' THEN t.quantity ELSE 0 end) AS '办公'
	,case(CASE WHEN t.category='技术' THEN t.quantity ELSE 0 end) AS '技术'
	,case(CASE WHEN t.category='家具' THEN t.quantity ELSE 0 end) AS '家具'
FROM sm.sm_order_total t
GROUP BY t.region

--以区域分组,统计各产品大类的订单编号数量(去重)
--distinct: 去重关键子段
SELECT * 
	t.region
	,case(DISTINCT CASE WHEN t.category='办公' THEN t.order_id ELSE null end) AS '办公'
	,case(DISTINCT CASE WHEN t.category='技术' THEN t.order_id ELSE null end) AS '技术'
	,case(DISTINCT CASE WHEN t.category='家具' THEN t.order_id ELSE null end) AS '家具'
FROM sm.sm_order_total t
GROUP BY t.region

--统计个产品大类中销售金额最高的产品子类及对应销售金额
--group by: 分组
--order by: 排序
SELECT 
	category,subclass,amount
from(
		SELECT 
	t1.category,t1.subclass,t1.amount
	--生成唯一的排名序号
	,ROW_number()over(PARTITION BY t1.category ORDER BY t1.amount desc) AS rn
from(
	SELECT *
	t.category
	,t.subclass
	,round(sum(t.quantity*t.price),2) AS amount
FROM sm.sm_order_total t
GROUP BY t.category,t.subclass
) AS t1
)WHEN rn=1


--组内所有行
--组内首行至当前行
SELECT
	t.ORDER_month,t.order_date,t.quantity
	,sum(t.quantity)over(PARTITION BY t.order_month) AS '组内所有行'
	,sum(t.quantity)over(PARTITION BY t.order_month ORDER BY t.order_date) AS '组内首行至当前行'
FROM sm.sm_order_month t


--组内当前行到尾行: rows bettween current row and unbounded following
--组内当前行的前3行到当前行: rows between 3 preceding and current row
--组内当前行到当前行的后3行: rows between current row and 3 fellowing


SELECT
	t.ORDER_month,t.order_date,t.quantity
	,sum(t.quantity)OVER w AS 'sum组内所有行'
	,sum(t.quantity)OVER w AS 'max组内所有行'
FROM sm.sm_order_month t
WHERE ORDER_month = '2017-01'
ORDER BY order_date
WINDOW w AS   


--row_number rank dense_rank
--应用场景: 分组排序,生成序号
--row_number: 生成唯一的序号,不并列不空缺
--rank: 并列的名次时下位空缺
--dense_rank: 并位名词时下位连续

SELECT 
	t.region,t.province,t.quantity
	,ROW_number()over(PARTITION BY t.region ORDER BY t.quantity desc) AS 'row_number'
	,rank()over(PARTITION BY t.region)

SELECT
	region,province,round(sum(quantity),-2) AS quantity
FROM sm.sm_order_total
GROUP BY region,province
ORDER BY region,province


--应用场景2: 同比.环比的增长率
--lead(), 超前,向前(未来)偏移/向下(表的下方)偏移
--lag(), 滞后,向后(过去)偏移/向上(表的上方)偏移
--第一个参数: 偏移的字段
--第二个参数: 偏移的行数
--第三个参数(可选): 如果返回值为空,以什么值填充
SELECT * 
	t.order_month AS '订单月份'
	,t.quantity AS '当月销量'
	,lag(t.quantity,1,0)over(ORDER BY t.order_month) AS '上月销量'
	,lag(t.quantity,12,0)over(ORDER BY t.order_month) AS '去年同期销量'
	,(t.quantitylag(t.quantity,1,0)over(ORDER BY t.order_month) - 1) AS '环比增长率'
	,(t.quantitylag(t.quantity,12,0)over(ORDER BY t.order_month) - 1) AS '同比增长率'
	from(
	SELECT *
		t.order_month,sum(t.quantity) AS quantity
	FROM sm.sm_order_month t
	GROUP BY 
	) AS t

	
	
--first_value(exper),返回首行的值
--last_value(exper),返回尾行的值,默认窗口范围首行至当前行
--nth_value(expr,n),返回指定行的值,默认窗口范围为首行至当前行
--Hive不支持nth_value
SELECT
	category AS '大类'
	,subclass AS '产品小类'
	,quantity AS '销量'
	--组内销量排名第一名的好产品
	,first_value(subclass)over(paetition BY category ORDER BY quantity desc) AS '第一的产品子类'
	--组内销量排名倒数第一的产品子类
	,first_value(subclass)over(paetition BY category ORDER BY quantity asc) AS '倒数第一的产品子类'
	--组内销量排名倒数第一的产品子类
	,last_value(subclass)over(paetition BY category 
	ORDER BY quantity DESC 
	--rows省略首行至当前行
	--实际必须首行至尾行
	ROW BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED following) AS '倒数第一的产品子类'

FROM(
	SELECT
		category,subclass,sum(quantity) AS quantity
	FROM (sm.sm_order_total
	GROUP BY category,subclass
	) AS t
)


--应用场景: 数据分组抽样
--ntile(n), 将数据集分为n个桶,返回分桶的序号
SELECT * 
from(
	SELECT 
		t.region,t.province,t.quantity
		--排序分桶
		,ntile(2)over(PARTITION BY region ORDER BY quantity desc) AS 'ntile_desc_2'
		,ntile(2)over(PARTITION BY region ORDER BY rand() desc) AS 'ntile_desc_2'

	from(
		SELECT
			region,provinve,sum(quantity) AS quantity
		FROM sm.sm_order_total
		GROUP BY region,province
	) AS t
) AS t
WHERE t.ntile_desc_2=1


SELECT *
	t.region,t.province,sum(t.quantition) AS quantity
FROM sm.sm_order_total t
GROUP BY t.region,t.province WITH ROLLUP;

SELECT *
	t.region,t.province,sum(t.quantition) AS quantity
FROM sm.sm_order_total t
GROUP BY t.region,t.province WITH cube;

SELECT *
	t.region,t.province,sum(t.quantition) AS quantity
	,grouping__id  
FROM sm.sm_order_total t
GROUP BY t.region,t.province GROUPING SETS(t.region,(t.region,t.province));

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值