目录
hive基本查询
--hive基本查询
select * from a;
select max(ip) from a;
select count(*) from a;
hive 条件查询
--hive 条件查询
select *
from t_pv_log
where access_time>'2020-11-26 15:31:33';
关联查询
1.内连接
--1.内连接(笛卡尔积),右边字段依次和左边的连一遍
select t_a.*,t_b.*
from a t_a inner join b t_b;
+---------+-----------+---------+-----------+--+
| t_a.id | t_a.name | t_b.id | t_b.name |
+---------+-----------+---------+-----------+--+
| 1 | a | 2 | bb |
| 1 | a | 3 | cc |
| 1 | a | 7 | yy |
| 1 | a | 9 | pp |
| 2 | b | 2 | bb |
| 2 | b | 3 | cc |
| 2 | b | 7 | yy |
| 2 | b | 9 | pp |
| 3 | c | 2 | bb |
| 3 | c | 3 | cc |
| 3 | c | 7 | yy |
| 3 | c | 9 | pp |
| 4 | d | 2 | bb |
| 4 | d | 3 | cc |
| 4 | d | 7 | yy |
| 4 | d | 9 | pp |
| 7 | y | 2 | bb |
| 7 | y | 3 | cc |
| 7 | y | 7 | yy |
| 7 | y | 9 | pp |
| 8 | u | 2 | bb |
| 8 | u | 3 | cc |
| 8 | u | 7 | yy |
| 8 | u | 9 | pp |
+---------+-----------+---------+-----------+--+
--指定条件
select t_a.*,t_b.*
from a t_a inner join b t_b
on t_a.id=t_b.id;
+---------+-----------+---------+-----------+--+
| t_a.id | t_a.name | t_b.id | t_b.name |
+---------+-----------+---------+-----------+--+
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 7 | y | 7 | yy |
+---------+-----------+---------+-----------+--+
2.左连接
--2.左外连接(左连接),左边字段依次和右边的连一遍
select a.*,b.*
from a left join b
on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | a | NULL | NULL |
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 4 | d | NULL | NULL |
| 7 | y | 7 | yy |
| 8 | u | NULL | NULL |
+-------+---------+-------+---------+--+
--加条件:保留左边所有字段,右边没有符合左边字段的就为null
3.右连接
--3.右外连接(右连接),不加条件和内联接查询结果相同
select a.*,b.*
from a right join b
on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 7 | y | 7 | yy |
| NULL | NULL | 9 | pp |
+-------+---------+-------+---------+--+
--加条件:保留右边所有字段,左边没有符合右边字段的就为null
4.全外连接
--4.全外连接
select a.*,b.*
from a full outer join b
on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | a | NULL | NULL |
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 4 | d | NULL | NULL |
| 7 | y | 7 | yy |
| 8 | u | NULL | NULL |
| NULL | NULL | 9 | pp |
+-------+---------+-------+---------+--+
--加条件:左边右边字段都会保存
5.左半连接(hive独有)
--5.左半连接(hive独有)
select a.*
from a left semi join b
on a.id=b.id;
+-------+---------+--+
| a.id | a.name |
+-------+---------+--+
| 2 | b |
| 3 | c |
| 7 | y |
+-------+---------+--+
--查询得到与右表相同字段数据的左表数据(故select子句中不能有右表字段)
group by 分组聚合查询
select upper("abcde");--转大写函数
-- 针对每一行计算,指定字段的每行数据都会变
select ip,upper(url),access_time
from t_pv_log;
--求每条url的访问总次数
+---------------+--------------------+-----------------------+---------------+--+
| t_pv_log.ip | t_pv_log.url | t_pv_log.access_time | t_pv_log.day |
+---------------+--------------------+-----------------------+---------------+--+
| 192.168.22.1 | http://sina.com/a | 2020-11-26 15:31:23 | 20201126 |
| 192.168.22.4 | http://sina.com/a | 2020-11-26 16:21:33 | 20201126 |
| 192.168.33.1 | http://sina.com/a | 2020-11-26 17:11:43 | 20201126 |
| 192.168.22.1 | http://sina.com/a | 2020-11-27 15:31:23 | 20201127 |
| 192.168.22.4 | http://sina.com/a | 2020-11-27 16:21:33 | 20201127 |
| 192.168.33.1 | http://sina.com/a | 2020-11-27 17:11:43 | 20201127 |
| 192.168.12.6 | http://sina.com/v | 2020-11-26 15:31:33 | 20201126 |
| 192.168.12.6 | http://sina.com/v | 2020-11-27 15:31:33 | 20201127 |
| 192.168.22.7 | http://sina.com/f | 2020-11-26 15:31:36 | 20201126 |
| 192.168.22.7 | http://sina.com/f | 2020-11-27 15:31:36 | 20201127 |
| 192.168.22.1 | http://sina.com/u | 2020-11-26 15:31:33 | 20201126 |
| 192.168.22.1 | http://sina.com/u | 2020-11-27 15:31:33 | 20201127 |
+---------------+--------------------+-----------------------+---------------+--+
select url,count(1) as cnts
from t_pv_log
group by url;
+--------------------+-------+--+
| url | cnts |
+--------------------+-------+--+
| http://sina.com/a | 6 |
| http://sina.com/f | 2 |
| http://sina.com/u | 2 |
| http://sina.com/v | 2 |
+--------------------+-------+--+
--语法:group by 字段(字段中相同的数据划分为一组)
--注:只能按组查询字段,故select 被分组的那个字段,对分好组的数据进行逐组运算。
--求每个url访问者中ip地址最大的
select url,max(ip)
from t_pv_log
group by url;
--max():分组聚合函数(对一组数据即多行进行运算)
+--------------------+---------------+--+
| url | _c1 |
+--------------------+---------------+--+
| http://sina.com/a | 192.168.33.1 |
| http://sina.com/f | 192.168.22.7 |
| http://sina.com/u | 192.168.22.1 |
| http://sina.com/v | 192.168.12.6 |
+--------------------+---------------+--+
--求每个用户访问同一页面的所有记录中时间最晚的一条
--表分组的样子
+---------------+--------------------+-----------------------+---------------+--+
| t_pv_log.ip | t_pv_log.url | t_pv_log.access_time | t_pv_log.day |
+---------------+--------------------+-----------------------+---------------+--+
| 192.168.22.1 | http://sina.com/a | 2020-11-26 15:31:23 | 20201126 |
| 192.168.22.1 | http://sina.com/a | 2020-11-27 15:31:23 | 20201127
| 192.168.22.4 | http://sina.com/a | 2020-11-26 16:21:33 | 20201126 |
| 192.168.22.4 | http://sina.com/a | 2020-11-27 16:21:33 | 20201127 |
| 192.168.33.1 | http://sina.com/a | 2020-11-26 17:11:43 | 20201126 |
| 192.168.33.1 | http://sina.com/a | 2020-11-27 17:11:43 | 20201127 |
| 192.168.12.6 | http://sina.com/v | 2020-11-26 15:31:33 | 20201126 |
| 192.168.12.6 | http://sina.com/v | 2020-11-27 15:31:33 | 20201127 |
| 192.168.22.7 | http://sina.com/f | 2020-11-26 15:31:36 | 20201126 |
| 192.168.22.7 | http://sina.com/f | 2020-11-27 15:31:36 | 20201127 |
| 192.168.22.1 | http://sina.com/u | 2020-11-26 15:31:33 | 20201126 |
| 192.168.22.1 | http://sina.com/u | 2020-11-27 15:31:33 | 20201127 |
+---------------+--------------------+-----------------------+---------------+--+
select ip,url,max(access_time) as time --查询的都是分组字段和聚合函数
from t_pv_log
group by ip,url;
+---------------+--------------------+----------------------+--+
| ip | url | time |
+---------------+--------------------+----------------------+--+
| 192.168.12.6 | http://sina.com/v | 2020-11-27 15:31:33 |
| 192.168.22.1 | http://sina.com/a | 2020-11-27 15:31:23 |
| 192.168.22.1 | http://sina.com/u | 2020-11-27 15:31:33 |
| 192.168.22.4 | http://sina.com/a | 2020-11-27 16:21:33 |
| 192.168.22.7 | http://sina.com/f | 2020-11-27 15:31:36 |
| 192.168.33.1 | http://sina.com/a | 2020-11-27 17:11:43 |
+---------------+--------------------+----------------------+--+
--注:结果有多少行就被分了多少组
分组聚合综合示例
--分组聚合综合示例
--1.建表(分区表)
create table t_access(ip string,url string,access_time date)
partitioned by (dt string)
row format delimited
fields terminated by ',';
--2.编辑分区文件
vi access.log.20201211
vi access.log.20201212
vi access.log.20201213
--3.导入数据
load data local inpath '/root/hivetest/access.log.20201211' into table t_access partition(dt='20201211');
load data local inpath '/root/hivetest/access.log.20201212' into table t_access partition(dt='20201212');
load data local inpath '/root/hivetest/access.log.20201213' into table t_access partition(dt='20201213');
--4.查询
--求12月11日以后每天http://www.edu360.cn/job总访问次数及访问者中ip地址最大的,显示url
--方法一:
select dt,'http://www.edu360.cn/job',count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt having dt>'20201211';
--方法二:
select dt,max(url),count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt having dt>'20201211';
--方法三:
select dt,url,count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt,url having dt>'20201211';
+-----------+---------------------------+------+----------------+--+
| dt | url | _c2 | _c3 |
+-----------+---------------------------+------+----------------+--+
| 20201212 | http://www.edu360.cn/job | 2 | 192.168.33.55 |
| 20201213 | http://www.edu360.cn/job | 3 | 192.168.33.55 |
+-----------+---------------------------+------+----------------+--+
-- 求12月11号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的
select dt,url,count(1),max(ip)
from t_access
where dt>'20201211'
group by dt,url;
+-----------+---------------------------------+------+----------------+--+
| dt | url | _c2 | _c3 |
+-----------+---------------------------------+------+----------------+--+
| 20201212 | http://www.edu360.cn/job | 2 | 192.168.33.55 |
| 20201212 | http://www.edu360.cn/stu | 2 | 192.168.33.44 |
| 20201212 | http://www.edu360.cn/teach | 1 | 192.168.44.3 |
| 20201213 | http://www.edu360.cn/excersize | 3 | 192.168.33.46 |
| 20201213 | http://www.edu360.cn/job | 3 | 192.168.33.55 |
| 20201213 | http://www.edu360.cn/pay | 1 | 192.168.34.44 |
| 20201213 | http://www.edu360.cn/register | 2 | 192.168.133.3 |
+-----------+---------------------------------+------+----------------+--+
hive 子查询
--hive 子查询
-- 求12月11号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的,且只查询出总访问次数>2 的记录
-- 方式1:
select dt,url,count(1) as cnts,max(ip)
from t_access
where dt>'20201211'
group by dt,url having cnts>'2';
-- 方式2:用子查询
select dt,url,cnts,ip
from
(select dt,url,count(1) as cnts,max(ip) as ip
from t_access
where dt>'20201211'
group by dt,url) tmp
where cnts>'2';
+-----------+---------------------------------+-------+----------------+--+
| dt | url | cnts | ip |
+-----------+---------------------------------+-------+----------------+--+
| 20201213 | http://www.edu360.cn/excersize | 3 | 192.168.33.46 |
| 20201213 | http://www.edu360.cn/job | 3 | 192.168.33.55 |
+-----------+---------------------------------+-------+----------------+--+
hive 数据类型
数字类型、日期时间类型、字符串类型、混杂类型、复合类型(hive独有)
1.array数组类型
--1.array数组类型
--建表
create table t_movie(movie_name string,actors array<string>,first_show date)
row format delimited fields terminated by ','
collection items terminated by ':';
--vi movie.data
无名之辈,章宇:陈建斌:任素汐,2018-11-16
我不是药神,徐峥:王传君:章宇,2018-07-05
一出好戏,黄渤:王宝强:舒淇,2018-08-10
中国机长,欧豪:杜江:袁泉,2018-05-18
囧妈,徐峥:黄梅莹:袁泉,2020-01-25
--导数据
load data local inpath '/root/hivetest/movie.dat' into table t_movie ;
--查询
select movie_name,actors[0],first_show from t_movie;
+-------------+------+-------------+--+
| movie_name | _c1 | first_show |
+-------------+------+-------------+--+
| 无名之辈 | 章宇 | 2018-11-16 |
| 我不是药神 | 徐峥 | 2018-07-05 |
| 一出好戏 | 黄渤 | 2018-08-10 |
| 中国机长 | 欧豪 | 2018-05-18 |
| 囧妈 | 徐峥 | 2020-01-25 |
+-------------+------+-------------+--+
--查询徐峥参演的电影
select movie_name,actors,first_show
from t_movie
where array_contains(actors,'徐峥') ;
+-------------+--------------------+-------------+--+
| movie_name | actors | first_show |
+-------------+--------------------+-------------+--+
| 我不是药神 | ["徐峥","王传君","章宇"] | 2018-07-05 |
| 囧妈 | ["徐峥","黄梅莹","袁泉"] | 2020-01-25 |
+-------------+--------------------+-------------+--+
--查询电影参演人数
select movie_name
,size(actors) as act_num
,first_show
from t_movie;
+-------------+----------+-------------+--+
| movie_name | act_num | first_show |
+-------------+----------+-------------+--+
| 无名之辈 | 3 | 2018-11-16 |
| 我不是药神 | 3 | 2018-07-05 |
| 一出好戏 | 3 | 2018-08-10 |
| 中国机长 | 3 | 2018-05-18 |
| 囧妈 | 3 | 2020-01-25 |
+-------------+----------+-------------+--+
2.map类型
--2.map类型
--建表
create table t_family(id int,name string,familys map<string,string>,age int)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
--vi family.dat
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26
--导数据
load data local inpath '/root/hivetest/family.dat' into table t_family;
--查出每个人的爸爸
select id,name,familys['father'],age
from t_family;
+-----+-----------+--------------+------+--+
| id | name | _c2 | age |
+-----+-----------+--------------+------+--+
| 1 | zhangsan | xiaoming | 28 |
| 2 | lisi | mayun | 22 |
| 3 | wangwu | wangjianlin | 29 |
| 4 | mayun | mayongzhen | 26 |
+-----+-----------+--------------+------+--+
--查询出每个人有哪些亲属关系
select id,name,map_keys(familys) as relations,age
from t_family;
+-----+-----------+--------------------------------+------+--+
| id | name | relations | age |
+-----+-----------+--------------------------------+------+--+
| 1 | zhangsan | ["father","mother","brother"] | 28 |
| 2 | lisi | ["father","mother","brother"] | 22 |
| 3 | wangwu | ["father","mother","sister"] | 29 |
| 4 | mayun | ["father","mother"] | 26 |
+-----+-----------+--------------------------------+------+--+
-- 查出每个人的亲人名字
select id,name,map_values(familys) as f_name,age
from t_family;
+-----+-----------+-------------------------------------+------+--+
| id | name | f_name | age |
+-----+-----------+-------------------------------------+------+--+
| 1 | zhangsan | ["xiaoming","xiaohuang","xiaoxu"] | 28 |
| 2 | lisi | ["mayun","huangyi","guanyu"] | 22 |
| 3 | wangwu | ["wangjianlin","ruhua","jingtian"] | 29 |
| 4 | mayun | ["mayongzhen","angelababy"] | 26 |
+-----+-----------+-------------------------------------+------+--+
-- 查出每个人的亲人数量
select id,name,size(familys),age
from t_family;
+-----+-----------+------+------+--+
| id | name | _c2 | age |
+-----+-----------+------+------+--+
| 1 | zhangsan | 3 | 28 |
| 2 | lisi | 3 | 22 |
| 3 | wangwu | 3 | 29 |
| 4 | mayun | 2 | 26 |
+-----+-----------+------+------+--+
-- 查出所有拥有兄弟的人及他的兄弟是谁
--方法1
select id,name,familys['brother'],age
from t_family
where array_contains(map_keys(familys),'brother');
--方法2
select id,name,age,familys['brother']
from
(select id,name,age,map_keys(familys) as relations,familys
from t_family ) tmp
where array_contains(relations,'brother');
+-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+
| id | name | age | relations | familys |
+-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+
| 1 | zhangsan | 28 | ["father","mother","brother"] | {"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"} |
| 2 | lisi | 22 | ["father","mother","brother"] | {"father":"mayun","mother":"huangyi","brother":"guanyu"} |
| 3 | wangwu | 29 | ["father","mother","sister"] | {"father":"wangjianlin","mother":"ruhua","sister":"jingtian"} |
| 4 | mayun | 26 | ["father","mother"] | {"father":"mayongzhen","mother":"angelababy"} |
+-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+
+-----+-----------+------+---------+--+
| id | name | age | _c3 |
+-----+-----------+------+---------+--+
| 1 | zhangsan | 28 | xiaoxu |
| 2 | lisi | 22 | guanyu |
+-----+-----------+------+---------+--+
3.struct类型
--3.struct类型
--建表
create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';
--vi user.dat
1,zhangsan,18:male:深圳
2,lisi,28:female:北京
3,wangwu,38:male:广州
4,赵六,26:female:上海
5,钱琪,35:male:杭州
6,王八,48:female:南京
--导数据
load data local inpath '/root/hivetest/user.dat' into table t_user;
--查询每个人的id,name,和地址
select id,name,info.addr
from t_user;
+-----+-----------+-------+--+
| id | name | addr |
+-----+-----------+-------+--+
| 1 | zhangsan | 深圳 |
| 2 | lisi | 北京 |
| 3 | wangwu | 广州 |
| 4 | 赵六 | 上海 |
| 5 | 钱琪 | 杭州 |
| 6 | 王八 | 南京 |
+-----+-----------+-------+--+
hive 的函数
字符串函数、类型转换函数、数学运算函数、时间函数、表生成函数
--字符串函数
select substr("abcgfik",1,3); --用常量测试函数
select substr("abcgfik",0,3);
+------+--+
| _c0 |
+------+--+
| abc |
+------+--+
--结果一致
表生成函数
--表生成函数(explode() lateral view的使用)
--wordcount示例
--建表
create table t_wc(sentence string);
--vi word.dat
select name string name
name select from id info
select select id id name info
string from id
id string name
--导数据
load data local inpath '/root/hivetest/word.dat' into table t_wc;
--查询
select explode(split(sentence,' ')) as word --按空格切割,形成数组,利用行转列函数explode()炸开
from t_wc;
+---------+--+
| word |
+---------+--+
| select |
| name |
| string |
| name |
| name |
| select |
| from |
| id |
| info |
| select |
| select |
| id |
| id |
| name |
| info |
| string |
| from |
| id |
| id |
| string |
| name |
+---------+--+
select word,count(1) as conts
from
(
select explode(split(sentence,' ')) as word
from t_wc
) tmp
group by word;
+---------+--------+--+
| word | conts |
+---------+--------+--+
| from | 2 |
| id | 5 |
| info | 2 |
| name | 5 |
| select | 4 |
| string | 3 |
+---------+--------+--+
lateral view
--lateral view
--建表
create table t_subject(id int,name string,subjects array<string>)
row format delimited fields terminated by ' '
collection items terminated by ',';
--vi subjects.txt
001 zhangsan 物理,化学,数学,英语,生物
002 lisi 生物,历史,数学,语文,化学
003 wangwu 体育,美术,化学,语文,英语,数学
--导数据
load data local inpath '/root/hivetest/subject.txt' into table t_subject;
--查询有哪些课程
select distinct tmp.sub
from(
select explode(subjects) as sub
from t_subject
) tmp;
+----------+--+
| tmp.sub |
+----------+--+
| 体育 |
| 化学 |
| 历史 |
| 数学 |
| 物理 |
| 生物 |
| 美术 |
| 英语 |
| 语文 |
+----------+--+
--lateral view 示例(lateral view将explode函数的结果生成新表)
select id,name,tmp.sub
from t_subject
lateral view
explode(subjects) tmp as sub ;
+-----+-----------+----------+--+
| id | name | tmp.sub |
+-----+-----------+----------+--+
| 1 | zhangsan | 物理 |
| 1 | zhangsan | 化学 |
| 1 | zhangsan | 数学 |
| 1 | zhangsan | 英语 |
| 1 | zhangsan | 生物 |
| 2 | lisi | 生物 |
| 2 | lisi | 历史 |
| 2 | lisi | 数学 |
| 2 | lisi | 语文 |
| 2 | lisi | 化学 |
| 3 | wangwu | 体育 |
| 3 | wangwu | 美术 |
| 3 | wangwu | 化学 |
| 3 | wangwu | 语文 |
| 3 | wangwu | 英语 |
| 3 | wangwu | 数学 |
+-----+-----------+----------+--+