hive基础语法示例

create table t_a(name string,numb int)
row format delimited
fields terminated by ‘,’;

create table t_b(name string,nick string)
row format delimited
fields terminated by ‘,’;

load data local inpath ‘/root/hivetest/a.txt’ into table t_a;
load data local inpath ‘/root/hivetest/b.txt’ into table t_b;

– 各类join
–1/ 内连接
– 笛卡尔积
select a.,b.
from t_a a inner join t_b b;

– 指定join条件
select a.,b.
from
t_a a join t_b b on a.name=b.name;

– 2/ 左外连接(左连接)
select a.,b.
from
t_a a left outer join t_b b on a.name=b.name;

– 3/ 右外连接(右连接)
select a.,b.
from
t_a a right outer join t_b b on a.name=b.name;

– 4/ 全外连接
select a.,b.
from
t_a a full outer join t_b b on a.name=b.name;

– 5/ 左半连接
select a.*
from
t_a a left semi join t_b b on a.name=b.name;

– 分组聚合查询

– 针对每一行进行运算
select ip,upper(url),access_time – 该表达式是对数据中的每一行进行逐行运算
from t_pv_log;

– 求每条URL的访问总次数

select url,count(1) as cnts – 该表达式是对分好组的数据进行逐组运算
from t_pv_log
group by url;

– 求每个URL的访问者中ip地址最大的

select url,max(ip)
from t_pv_log
group by url;

– 求每个用户访问同一个页面的所有记录中,时间最晚的一条

select ip,url,max(access_time)
from t_pv_log
group by ip,url;

– 分组聚合综合示例
– 有如下数据
/*
192.168.33.3,http://www.edu360.cn/stu,2017-08-04 15:30:20
192.168.33.3,http://www.edu360.cn/teach,2017-08-04 15:35:20
192.168.33.4,http://www.edu360.cn/stu,2017-08-04 15:30:20
192.168.33.4,http://www.edu360.cn/job,2017-08-04 16:30:20
192.168.33.5,http://www.edu360.cn/job,2017-08-04 15:40:20

192.168.33.3,http://www.edu360.cn/stu,2017-08-05 15:30:20
192.168.44.3,http://www.edu360.cn/teach,2017-08-05 15:35:20
192.168.33.44,http://www.edu360.cn/stu,2017-08-05 15:30:20
192.168.33.46,http://www.edu360.cn/job,2017-08-05 16:30:20
192.168.33.55,http://www.edu360.cn/job,2017-08-05 15:40:20

192.168.133.3,http://www.edu360.cn/register,2017-08-06 15:30:20
192.168.111.3,http://www.edu360.cn/register,2017-08-06 15:35:20
192.168.34.44,http://www.edu360.cn/pay,2017-08-06 15:30:20
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20
192.168.33.25,http://www.edu360.cn/job,2017-08-06 15:40:20
192.168.33.36,http://www.edu360.cn/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20

*/
– 建表映射上述数据
create table t_access(ip string,url string,access_time string)
partitioned by (dt string)
row format delimited fields terminated by ‘,’;

– 导入数据
load data local inpath ‘/root/hivetest/access.log.0804’ into table t_access partition(dt=‘2017-08-04’);
load data local inpath ‘/root/hivetest/access.log.0805’ into table t_access partition(dt=‘2017-08-05’);
load data local inpath ‘/root/hivetest/access.log.0806’ into table t_access partition(dt=‘2017-08-06’);

– 查看表的分区
show partitions t_access;

– 求8月4号以后,每天http://www.edu360.cn/job的总访问次数,及访问者中ip地址中最大的
select dt,‘http://www.edu360.cn/job’,count(1),max(ip)
from t_access
where url=‘http://www.edu360.cn/job
group by dt having dt>‘2017-08-04’;

select dt,max(url),count(1),max(ip)
from t_access
where url=‘http://www.edu360.cn/job
group by dt having dt>‘2017-08-04’;

select dt,url,count(1),max(ip)
from t_access
where url=‘http://www.edu360.cn/job
group by dt,url having dt>‘2017-08-04’;

select dt,url,count(1),max(ip)
from t_access
where url=‘http://www.edu360.cn/job’ and dt>‘2017-08-04’
group by dt,url;

– 求8月4号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的

select dt,url,count(1),max(ip)
from t_access
where dt>‘2017-08-04’
group by dt,url;

– 求8月4号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的,且,只查询出总访问次数>2 的记录
– 方式1:
select dt,url,count(1) as cnts,max(ip)
from t_access
where dt>‘2017-08-04’
group by dt,url having cnts>2;

– 方式2:用子查询
select dt,url,cnts,max_ip
from
(select dt,url,count(1) as cnts,max(ip) as max_ip
from t_access
where dt>‘2017-08-04’
group by dt,url) tmp
where cnts>2;

±---------------±--------------------------------±----------------------±-------------±-+
| t_access.ip | t_access.url | t_access.access_time | t_access.dt |
±---------------±--------------------------------±----------------------±-------------±-+

| 192.168.33.46 | http://www.edu360.cn/job | 2017-08-05 16:30:20 | 2017-08-05 |
| 192.168.33.55 | http://www.edu360.cn/job | 2017-08-05 15:40:20 | 2017-08-05 |

| 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 2017-08-06 |
| 192.168.33.25 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 2017-08-06 |
| 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 2017-08-06 |
±---------------±--------------------------------±----------------------±-------------+

/*
HIVE 中的复合数据类型

*/
– 数组
– 有如下数据:
战狼2,吴京:吴刚:龙母,2017-08-16
三生三世十里桃花,刘亦菲:痒痒,2017-08-20
普罗米修斯,苍老师:小泽老师:波多老师,2017-09-17
美女与野兽,吴刚:加藤鹰,2017-09-17

– 建表映射:
create table t_movie(movie_name string,actors array,first_show date)
row format delimited fields terminated by ‘,’
collection items terminated by ‘:’;

– 导入数据
load data local inpath ‘/root/hivetest/actor.dat’ into table t_movie;
load data local inpath ‘/root/hivetest/actor.dat.2’ into table t_movie;

– 查询
select movie_name,actors[0],first_show from t_movie;

select movie_name,actors,first_show
from t_movie where array_contains(actors,‘吴刚’);

select movie_name
,size(actors) as actor_number
,first_show
from t_movie;

– 有如下数据:
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26

– 建表映射上述数据
create table t_family(id int,name string,family_members map<string,string>,age int)
row format delimited fields terminated by ‘,’
collection items terminated by ‘#’
map keys terminated by ‘:’;

– 导入数据
load data local inpath ‘/root/hivetest/fm.dat’ into table t_family;

±-------------±---------------±---------------------------------------------------------------±--------------±-+
| t_family.id | t_family.name | t_family.family_members | t_family.age |
±-------------±---------------±---------------------------------------------------------------±--------------±-+
| 1 | zhangsan | {“father”:“xiaoming”,“mother”:“xiaohuang”,“brother”:“xiaoxu”} | 28 |
| 2 | lisi | {“father”:“mayun”,“mother”:“huangyi”,“brother”:“guanyu”} | 22 |
| 3 | wangwu | {“father”:“wangjianlin”,“mother”:“ruhua”,“sister”:“jingtian”} | 29 |
| 4 | mayun | {“father”:“mayongzhen”,“mother”:“angelababy”} | 26 |
±-------------±---------------±---------------------------------------------------------------±--------------±-+

– 查出每个人的 爸爸、姐妹
select id,name,family_members[“father”] as father,family_members[“sister”] as sister,age
from t_family;

– 查出每个人有哪些亲属关系
select id,name,map_keys(family_members) as relations,age
from t_family;

– 查出每个人的亲人名字
select id,name,map_values(family_members) as relations,age
from t_family;

– 查出每个人的亲人数量
select id,name,size(family_members) as relations,age
from t_family;

– 查出所有拥有兄弟的人及他的兄弟是谁
– 方案1:一句话写完
select id,name,age,family_members[‘brother’]
from t_family where array_contains(map_keys(family_members),‘brother’);

– 方案2:子查询
select id,name,age,family_members[‘brother’]
from
(select id,name,age,map_keys(family_members) as relations,family_members
from t_family) tmp
where array_contains(relations,‘brother’);

/* hive数据类型struct

假如有以下数据:
1,zhangsan,18:male:深圳
2,lisi,28:female:北京
3,wangwu,38:male:广州
4,赵六,26:female:上海
5,钱琪,35:male:杭州
6,王八,48:female:南京
*/

– 建表映射上述数据

drop table if exists t_user;
create table t_user(id int,name string,info structage:int,sex:string,addr:string)
row format delimited fields terminated by ‘,’
collection items terminated by ‘:’;

– 导入数据
load data local inpath ‘/root/hivetest/user.dat’ into table t_user;

– 查询每个人的id name和地址
select id,name,info.addr
from t_user;

/*
条件控制函数:case when
*/

0: jdbc:hive2://localhost:10000> select * from t_user;
±-----------±-------------±---------------------------------------±-+
| t_user.id | t_user.name | t_user.info |
±-----------±-------------±---------------------------------------±-+
| 1 | zhangsan | {“age”:18,“sex”:“male”,“addr”:“深圳”} |
| 2 | lisi | {“age”:28,“sex”:“female”,“addr”:“北京”} |
| 3 | wangwu | {“age”:38,“sex”:“male”,“addr”:“广州”} |
| 4 | 赵六 | {“age”:26,“sex”:“female”,“addr”:“上海”} |
| 5 | 钱琪 | {“age”:35,“sex”:“male”,“addr”:“杭州”} |
| 6 | 王八 | {“age”:48,“sex”:“female”,“addr”:“南京”} |
±-----------±-------------±---------------------------------------±-+

需求:查询出用户的id、name、年龄(如果年龄在30岁以下,显示年轻人,30-40之间,显示中年人,40以上老年人)
select id,name,
case
when info.age<30 then ‘青年’
when info.age>=30 and info.age<40 then ‘中年’
else ‘老年’
end
from t_user;

— IF
0: jdbc:hive2://localhost:10000> select * from t_movie;
±--------------------±-----------------------±--------------------±-+
| t_movie.movie_name | t_movie.actors | t_movie.first_show |
±--------------------±-----------------------±--------------------±-+
| 战狼2 | [“吴京”,“吴刚”,“龙母”] | 2017-08-16 |
| 三生三世十里桃花 | [“刘亦菲”,“痒痒”] | 2017-08-20 |
| 普罗米修斯 | [“苍老师”,“小泽老师”,“波多老师”] | 2017-09-17 |
| 美女与野兽 | [“吴刚”,“加藤鹰”] | 2017-09-17 |
±--------------------±-----------------------±--------------------±-+

– 需求: 查询电影信息,并且如果主演中有吴刚的,显示好电影,否则烂片

select movie_name,actors,first_show,
if(array_contains(actors,‘吴刚’),‘好片儿’,‘烂片儿’)
from t_movie;

– row_number() over() 函数
– 造数据:

1,18,a,male
2,19,b,male
3,22,c,female
4,16,d,female
5,30,e,male
6,26,f,female

create table t_rn(id int,age int,name string,sex string)
row format delimited fields terminated by ‘,’;

load data local inpath ‘/root/hivetest/rn.dat’ into table t_rn;

– 分组标记序号

select *
from
(select id,age,name,sex,
row_number() over(partition by sex order by age desc) as rn
from t_rn) tmp
where rn<3
;

– 窗口分析函数 sum() over() :可以实现在窗口中进行逐行累加

0: jdbc:hive2://localhost:10000> select * from t_access_amount;
±---------------------±-----------------------±------------------------±-+
| t_access_amount.uid | t_access_amount.month | t_access_amount.amount |
±---------------------±-----------------------±------------------------±-+
| A | 2015-01 | 33 |
| A | 2015-02 | 10 |
| A | 2015-03 | 20 |
| B | 2015-01 | 30 |
| B | 2015-02 | 15 |
| B | 2015-03 | 45 |
| C | 2015-01 | 30 |
| C | 2015-02 | 40 |
| C | 2015-03 | 30 |
±---------------------±-----------------------±------------------------±-+

– 需求:求出每个人截止到每个月的总额

select uid,month,amount,
sum(amount) over(partition by uid order by month rows between unbounded preceding and current row) as accumulate
from t_access_amount;

– 自定义函数
/*
有如下json数据:rating.json
{“movie”:“1193”,“rate”:“5”,“timeStamp”:“978300760”,“uid”:“1”}
{“movie”:“661”,“rate”:“3”,“timeStamp”:“978302109”,“uid”:“1”}
{“movie”:“914”,“rate”:“3”,“timeStamp”:“978301968”,“uid”:“1”}
{“movie”:“3408”,“rate”:“4”,“timeStamp”:“978300275”,“uid”:“1”}

需要导入hive中进行数据分析
*/

– 建表映射上述数据
create table t_ratingjson(json string);

load data local inpath ‘/root/hivetest/rating.json’ into table t_ratingjson;

想把上面的原始数据变成如下形式:
1193,5,978300760,1
661,3,978302109,1
914,3,978301968,1
3408,4,978300275,1

思路:如果能够定义一个json解析函数,则很方便了
create table t_rate
as
select myjson(json,1) as movie,cast(myjson(json,2) as int) as rate,myjson(json,3) as ts,myjson(json,4) as uid from t_ratingjson;

解决:
hive中如何定义自己的函数:
1、先写一个java类(extends UDF,重载方法public C evaluate(A a,B b)),实现你所想要的函数的功能(传入一个json字符串和一个脚标,返回一个值)
2、将java程序打成jar包,上传到hive所在的机器
3、在hive命令行中将jar包添加到classpath :
hive>add jar /root/hivetest/myjson.jar;
4、在hive命令中用命令创建一个函数叫做myjson,关联你所写的这个java类
hive> create temporary function myjson as ‘cn.edu360.hive.udf.MyJsonParser’;

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值