目录
表生成函数
集合函数
--集合函数
select sort_array(array('y','z','q'));
size(Map<K,V>)--返回一个值
map_keys(Map<K,V>)--返回一个数组
map_values(Map<K,V>)--返回一个数组
条件控制函数
--条件控制函数(case when)
--查询用户id,name,age(如果年龄30以下,显示青年人,30-40,显示中年人,40以上,老年人)
select id,name,info.age,
case
when info.age<30 then '青年'
when info.age<40 and info.age>30 then '中年人'
else '老年人'
end
from t_user;
+-----+-----------+------+------+--+
| id | name | age | _c3 |
+-----+-----------+------+------+--+
| 1 | zhangsan | 18 | 青年 |
| 2 | lisi | 28 | 青年 |
| 3 | wangwu | 38 | 中年人 |
| 4 | 赵六 | 26 | 青年 |
| 5 | 钱琪 | 35 | 中年人 |
| 6 | 王八 | 48 | 老年人 |
+-----+-----------+------+------+--+
--IF
--需求,如果主演中有徐峥的就是好片,否则不是
select movie_name,
if(array_contains(actors,'徐峥'),'好片','不是好片'),
first_show
from t_movie;
+-------------+-------+-------------+--+
| movie_name | _c1 | first_show |
+-------------+-------+-------------+--+
| 无名之辈 | 不是好片 | 2018-11-16 |
| 我不是药神 | 好片 | 2018-07-05 |
| 一出好戏 | 不是好片 | 2018-08-10 |
| 中国机长 | 不是好片 | 2018-05-18 |
| 囧妈 | 好片 | 2020-01-25 |
+-------------+-------+-------------+--+
分析函数
--分析函数
--row_number() over()函数:分组TOPN
--有如下数据 vi row_number.dat
1,18,a,male
2,19,b,male
3,22,c,female
4,16,d,female
5,30,e,male
6,26,f,female
--建表
create table t_rn(id int,age int,name string,sex string)
row format delimited fields terminated by ',';
--导数据
load data local inpath '/root/hivetest/row_number.dat' into table t_rn;
--查询出每种性别中年龄最大的两条数据(思路:分组——排序——标记序号)
select id,name,age,sex,
row_number() over(partition by sex order by age desc) as rn
from t_rn;
+-----+-------+------+---------+-----+--+
| id | name | age | sex | rn |
+-----+-------+------+---------+-----+--+
| 6 | f | 26 | female | 1 |
| 3 | c | 22 | female | 2 |
| 4 | d | 16 | female | 3 |
| 5 | e | 30 | male | 1 |
| 2 | b | 19 | male | 2 |
| 1 | a | 18 | male | 3 |
+-----+-------+------+---------+-----+--+
select id,name,age,sex
from
(select id,name,age,sex,
row_number() over(partition by sex order by age desc) as rn
from t_rn) tmp
where rn<3;
+-----+-------+------+---------+--+
| id | name | age | sex |
+-----+-------+------+---------+--+
| 6 | f | 26 | female |
| 3 | c | 22 | female |
| 5 | e | 30 | male |
| 2 | b | 19 | male |
+-----+-------+------+---------+--+
级联报表查询
--级联报表查询
--数据 vi accu.dat
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
C,2015-01,10
...
--建表
create table t_access_times(name string,month string,times int)
row format delimited fields terminated by ',';
--导数据
load data local inpath '/root/hivetest/accu.dat' into table t_access_times;
--查询每个人当月累计次数
--1.先查出每个人每个月的次数并存成一张新表(总额表)
create table t_access_number
as
select name,month,sum(times) as amount
from t_access_times
group by name,month;
+-----------------------+------------------------+-------------------------+--+
| t_access_number.name | t_access_number.month | t_access_number.amount |
+-----------------------+------------------------+-------------------------+--+
| A | 2015-01 | 33 |
| A | 2015-02 | 10 |
| A | 2015-03 | 20 |
| B | 2015-01 | 30 |
| B | 2015-02 | 15 |
| B | 2015-03 | 45 |
| C | 2015-01 | 30 |
| C | 2015-02 | 40 |
| C | 2015-03 | 30 |
+-----------------------+------------------------+-------------------------+--+
--2.将新表(总额表)自连接
select name,month,sum(b_amount) as accumulate
from
(select a.name as name,a.month as month,b.amount as b_amount
from t_access_number a left join t_access_number b
on a.name=b.name
where b.month<=a.month) tmp
group by name,month;
+-------+----------+-------------+--+
| name | month | accumulate |
+-------+----------+-------------+--+
| A | 2015-01 | 33 |
| A | 2015-02 | 43 |
| A | 2015-03 | 63 |
| B | 2015-01 | 30 |
| B | 2015-02 | 45 |
| B | 2015-03 | 90 |
| C | 2015-01 | 30 |
| C | 2015-02 | 70 |
| C | 2015-03 | 100 |
+-------+----------+-------------+--+
窗口分析函数
--窗口分析函数
--求出每个人截至到每个月的总额
--sum()over()函数:可以实现窗口中进行逐行累加
select name,month,amount,
sum(amount)over(partition by name order by month rows between unbounded preceding and current row) as accumlate
from t_access_number;
+-------+----------+---------+------------+--+
| name | month | amount | accumlate |
+-------+----------+---------+------------+--+
| A | 2015-01 | 33 | 33 |
| A | 2015-02 | 10 | 43 |
| A | 2015-03 | 20 | 63 |
| B | 2015-01 | 30 | 30 |
| B | 2015-02 | 15 | 45 |
| B | 2015-03 | 45 | 90 |
| C | 2015-01 | 30 | 30 |
| C | 2015-02 | 40 | 70 |
| C | 2015-03 | 30 | 100 |
+-------+----------+---------+------------+--+
hive 自定义函数
--hive 自定义函数
/*
有如下json数据:rating.json
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
需要导入hive中进行数据分析
*/
--建表映射上述数据
create table t_ratingjson(json string);
--导数据
load data local inpath '/root/hivetest/rating.json' into table t_ratingjson;
--想把上面的原始数据变成如下形式:
1193,5,978300760,1
661,3,978302109,1
914,3,978301968,1
3408,4,978300275,1
--思路:如果能够定义一个json解析函数
create table t_rate
as
select myjson(json,1) as movie,cast(myjson(json,2) as int) as rate,myjson(json,3) as ts,myjson(json,4) as uid from t_ratingjson;
select * from t_rate;
+---------------+--------------+------------+-------------+--+
| t_rate.movie | t_rate.rate | t_rate.ts | t_rate.uid |
+---------------+--------------+------------+-------------+--+
| 1193 | 5 | 978300760 | 1 |
| 661 | 3 | 978302109 | 1 |
| 914 | 3 | 978301968 | 1 |
| 3408 | 4 | 978300275 | 1 |
| 2355 | 5 | 978824291 | 1 |
| 1197 | 3 | 978302268 | 1 |
| 1287 | 5 | 978302039 | 1 |
| 2804 | 5 | 978300719 | 1 |
| 594 | 4 | 978302268 | 1 |
| 919 | 4 | 978301368 | 1 |
--解决:
--hive中如何定义自己的函数:
--1、先写一个java类(extends UDF,重载方法public C evaluate(A a,B b)),实现你所想要的函数的功能(传入一个json字符串和一个脚标,返回一个值)
--2、将java程序打成jar包,上传到hive所在的机器
--3、在hive命令行中将jar包添加到classpath :
hive>add jar /root/hivetest/myjson.jar;
--4、在hive命令中用命令创建一个函数叫做myjson,关联你所写的这个java类
hive> create temporary function myjson as 'cn.hive.json.MyJsonParser';
--查询每个人评论过几部电影
select uid,count(1)
from t_rate
group by uid limit 20;
+-------+-------+--+
| uid | _c1 |
+-------+-------+--+
| 1 | 53 |
| 10 | 401 |
| 100 | 76 |
| 1000 | 84 |
| 1001 | 377 |
| 1002 | 66 |
| 1003 | 29 |
| 1004 | 481 |
| 1005 | 92 |
| 1006 | 44 |
| 1007 | 32 |
| 1008 | 50 |
| 1009 | 52 |
| 101 | 106 |
| 1010 | 1004 |
json解析函数
--json解析函数
select json_tuple(json,'movie','rate','timeStamp','uid')
as (movie,rate,time,uid)
from t_ratingjson limit 20;
+--------+-------+------------+------+--+
| movie | rate | time | uid |
+--------+-------+------------+------+--+
| 1193 | 5 | 978300760 | 1 |
| 661 | 3 | 978302109 | 1 |
| 914 | 3 | 978301968 | 1 |
| 3408 | 4 | 978300275 | 1 |
| 2355 | 5 | 978824291 | 1 |
| 1197 | 3 | 978302268 | 1 |
| 1287 | 5 | 978302039 | 1 |