Hive行列转换、开窗、自定义函数
行列转换、开窗、自定义函数
1、数据准备
数据1
+---------------+------------------+--------------+
| emp_sex.name | emp_sex.dept_id | emp_sex.sex |
+---------------+------------------+--------------+
| 悟空 | A | 男 |
| 大海 | A | 男 |
| 宋宋 | B | 男 |
| 凤姐 | A | 女 |
| 婷姐 | B | 女 |
| 婷婷 | B | 女 |
+---------------+------------------+--------------+
悟空 A 男
大海 A 男
宋宋 B 男
凤姐 A 女
婷姐 B 女
婷婷 B 女
create table emp_sex(
name string,
dept_id string,
sex string)
row format delimited fields terminated by "\t";
load data local inpath '/opt/module/hive/datas/emp_sex.txt' into table emp_sex;
数据2
+-------------------+----------------------------+-------------------------+
| person_info.name | person_info.constellation | person_info.blood_type |
+-------------------+----------------------------+-------------------------+
| 孙悟空 | 白羊座 | A |
| 大海 | 射手座 | A |
| 宋宋 | 白羊座 | B |
| 猪八戒 | 白羊座 | A |
| 凤姐 | 射手座 | A |
| 苍老师 | 白羊座 | B |
+-------------------+----------------------------+-------------------------+
孙悟空 白羊座 A
大海 射手座 A
宋宋 白羊座 B
猪八戒 白羊座 A
凤姐 射手座 A
苍老师 白羊座 B
create table person_info(name string,constellation string,blood_type string )
row format delimited fields terminated by '\t';
load data local inpath '/home/fang/data/constellation.txt' into table person_info ;
数据3
+-------------------+----------------------+
| movie_info.movie | movie_info.category |
+-------------------+----------------------+
| 《疑犯追踪》 | 悬疑,动作,科幻,剧情 |
| 《Lie to me》 | 悬疑,警匪,动作,心理,剧情 |
| 《战狼2》 | 战争,动作,灾难 |
+-------------------+----------------------+
《疑犯追踪》 悬疑,动作,科幻,剧情
《Lie to me》 悬疑,警匪,动作,心理,剧情
《战狼2》 战争,动作,灾难
create table movie_info (movie string,category string)
row format delimited fields terminated by '\t';
load data local inpath '/home/fang/data/movie.txt' into table movie_info;
数据4
+----------------+---------------------+----------------+
| business.name | business.orderdate | business.cost |
+----------------+---------------------+----------------+
| jack | 2017-01-01 | 10 |
| tony | 2017-01-02 | 15 |
| jack | 2017-02-03 | 23 |
| tony | 2017-01-04 | 29 |
| jack | 2017-01-05 | 46 |
| jack | 2017-04-06 | 42 |
| tony | 2017-01-07 | 50 |
| jack | 2017-01-08 | 55 |
| mart | 2017-04-08 | 62 |
| mart | 2017-04-09 | 68 |
| neil | 2017-05-10 | 12 |
| mart | 2017-04-11 | 75 |
| neil | 2017-06-12 | 80 |
| mart | 2017-04-13 | 94 |
+----------------+---------------------+----------------+
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94
create table business (name string,orderdate string,cost int)
row format delimited fields terminated by ',';
load data local inpath '/home/fang/data/business.txt' into table business;
数据5
+-------------+----------------+--------------+
| score.name | score.subject | score.score |
+-------------+----------------+--------------+
| 孙悟空 | 语文 | 87 |
| 孙悟空 | 数学 | 95 |
| 孙悟空 | 英语 | 68 |
| 大海 | 语文 | 94 |
| 大海 | 数学 | 56 |
| 大海 | 英语 | 84 |
| 宋宋 | 语文 | 64 |
| 宋宋 | 数学 | 86 |
| 宋宋 | 英语 | 84 |
| 婷婷 | 语文 | 65 |
| 婷婷 | 数学 | 85 |
| 婷婷 | 英语 | 78 |
+-------------+----------------+--------------+
孙悟空 数学 95
宋宋 数学 86
婷婷 数学 85
大海 数学 56
宋宋 英语 84
大海 英语 84
婷婷 英语 78
孙悟空 英语 68
大海 语文 94
孙悟空 语文 87
婷婷 语文 65
宋宋 语文 64
create table score(
name string,
subject string,
score int)
row format delimited fields terminated by "\t";
load data local inpath '/home/fang/data/score.txt' into table score;
2、case when then else end
查询每个部门男和女各多少人(emp_sex表)
结果:
+----------+------+--------+
| dept_id | man | woman |
+----------+------+--------+
| A | 2 | 1 |
| B | 1 | 2 |
+----------+------+--------+
select dept_id,
count(case sex when '男' then 1 else NULL end),
count(case sex when '女' then 1 else NULL end)
from emp_sex group by dept_id;
select dept_id,
sum(case sex when '男' then 1 else 0 end) man,
sum(case sex when '女' then 1 else 0 end) woman
from emp_sex group by dept_id;
3、行转列
把星座和血型一样的人归类到一起(person_info表)。结果如下:
+----------+--------+
| _c0 | cb |
+----------+--------+
| 大海|凤姐 | 射手座,A |
| 孙悟空|猪八戒 | 白羊座,A |
| 宋宋|苍老师 | 白羊座,B |
+----------+--------+
--第一步
select concat_WS(',',constellation,blood_type) a,name
from person_info t;
--最终答案
select t.a,concat_ws("-",collect_set(name)) from
(select concat_WS('|',constellation,blood_type) a,name
from person_info ) as t group by t.a;
4、列转行
将电影分类中的数组数据展开。结果如下:
+--------------+----------------+
| movie | category_name |
+--------------+----------------+
| 《疑犯追踪》 | 悬疑 |
| 《疑犯追踪》 | 动作 |
| 《疑犯追踪》 | 科幻 |
| 《疑犯追踪》 | 剧情 |
| 《Lie to me》 | 悬疑 |
| 《Lie to me》 | 警匪 |
| 《Lie to me》 | 动作 |
| 《Lie to me》 | 心理 |
| 《Lie to me》 | 剧情 |
| 《战狼2》 | 战争 |
| 《战狼2》 | 动作 |
| 《战狼2》 | 灾难 |
+--------------+----------------+
--第一步
select explode(split(category,',')) from movie_info;
--答案
select movie,name
from movie_info
lateral view
explode(split(category,',')) lve as name;
5、窗口函数
查询在2017年4月份购买过的顾客及总人数
--方式1
select name,count(*) over()
from business
where month(orderdate) =4
group by name;
--方式2
select name,count(name) over()
from business
where date_format(orderdate,'yyyy-MM')='2017-04'
group by name
查询顾客的购买明细及月购买总额
select name,orderdate,cost,sum(cost) over (partition by name,date_format(orderdate,'yyyy-MM') order by date_format(orderdate,'yyyy-MM'))
from business
上述的场景, 将每个顾客的cost按照日期进行累加
--方式1
select name,orderdate,cost,
sum(cost) over(partition by name,month(orderdate) order by orderdate) simple,
sum(cost) over(partition by name,month(orderdate) order by orderdate rows between unbounded preceding and current row) notsimple
from business;
--方式2
select name,orderdate,cost,
sum(cost) over (partition by name,date_format(orderdate,'yyyy-MM') order by date_format(orderdate,'yyyy-MM')) s1,
sum(cost) over (partition by name order by date_format(orderdate,'yyyy-MM-dd')) s2
from business
查询顾客购买明细以及上次的购买时间和下次购买时间
select name,orderdate,cost,
lag(orderdate,1,"空") over (partition by name order by orderdate ) pt,
lead(orderdate,1,"空") over (partition by name order by orderdate ) nt
from business;
查询顾客每个月第一次的购买时间 和 每个月的最后一次购买时间
select name,orderdate,cost,
first_value(orderdate,false) over (partition by name,month(orderdate) order by orderdate rows between unbounded preceding and unbounded following) ft,
last_value(orderdate,false) over (partition by name,month(orderdate) order by orderdate rows between unbounded preceding and unbounded following) lt
from business;
查询前20%时间的订单信息
select name,orderdate,cost from (
select name,orderdate,cost,
ntile(5) over (order by date_format(orderdate,'yyyy-MM-dd')) ntil
from business
) t1
where t1.ntil = 1
6、rank
计算每门学科成绩排名。
select name,subject,score,
rank() over(partition by subject order by score) rank,
dense_rank() over(partition by subject order by score) dense_rank,
row_number() over(partition by subject order by score) row_number
from score;
7、添加自定义函数
--添加自己写的jar包
add jar /home/data/xx.jar;
--创建函数
create temporary function myLen as 'com.yire.java.UDFTest';
--删除函数
drop temporary function myLen;
java代码:
package com.yire.java;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
public class UDFTest extends GenericUDF {
/**
* 校验
*
* @param objectInspectors
* @return
* @throws UDFArgumentException
*/
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
if (objectInspectors == null || objectInspectors.length != 1) {
throw new UDFArgumentLengthException("参数只能是一个");
}
ObjectInspector objectInspector = objectInspectors[0];
if (!objectInspector.getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
throw new UDFArgumentTypeException(0, "参数类型错误,只能是基本类型");
}
return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
}
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
DeferredObject deferredObject = deferredObjects[0];
//只是简单的将单词的长度返回
return deferredObject.get().toString().length();
}
public String getDisplayString(String[] strings) {
return "";
}
}