hive查询的相关示例

最新推荐文章于 2022-11-02 21:58:49 发布

醪糟小丸子

最新推荐文章于 2022-11-02 21:58:49 发布

阅读量484

点赞数 2

分类专栏： hive 文章标签： hive 大数据

本文链接：https://blog.csdn.net/laozaoxiaowanzi/article/details/111180084

版权

hive 专栏收录该内容

10 篇文章 1 订阅

订阅专栏

hive基本查询

--hive基本查询
select * from a;
select max(ip) from a;
select count(*) from a;

hive 条件查询

--hive 条件查询
select *
from t_pv_log
where access_time>'2020-11-26 15:31:33';

关联查询

1.内连接

--1.内连接（笛卡尔积），右边字段依次和左边的连一遍
select t_a.*,t_b.*
from a t_a inner join b t_b;

+---------+-----------+---------+-----------+--+
| t_a.id  | t_a.name  | t_b.id  | t_b.name  |
+---------+-----------+---------+-----------+--+
| 1       | a         | 2       | bb        |
| 1       | a         | 3       | cc        |
| 1       | a         | 7       | yy        |
| 1       | a         | 9       | pp        |
| 2       | b         | 2       | bb        |
| 2       | b         | 3       | cc        |
| 2       | b         | 7       | yy        |
| 2       | b         | 9       | pp        |
| 3       | c         | 2       | bb        |
| 3       | c         | 3       | cc        |
| 3       | c         | 7       | yy        |
| 3       | c         | 9       | pp        |
| 4       | d         | 2       | bb        |
| 4       | d         | 3       | cc        |
| 4       | d         | 7       | yy        |
| 4       | d         | 9       | pp        |
| 7       | y         | 2       | bb        |
| 7       | y         | 3       | cc        |
| 7       | y         | 7       | yy        |
| 7       | y         | 9       | pp        |
| 8       | u         | 2       | bb        |
| 8       | u         | 3       | cc        |
| 8       | u         | 7       | yy        |
| 8       | u         | 9       | pp        |
+---------+-----------+---------+-----------+--+

--指定条件 
select t_a.*,t_b.*
from a t_a inner join b t_b
on t_a.id=t_b.id;

+---------+-----------+---------+-----------+--+
| t_a.id  | t_a.name  | t_b.id  | t_b.name  |
+---------+-----------+---------+-----------+--+
| 2       | b         | 2       | bb        |
| 3       | c         | 3       | cc        |
| 7       | y         | 7       | yy        |
+---------+-----------+---------+-----------+--+

2.左连接

--2.左外连接（左连接），左边字段依次和右边的连一遍
select a.*,b.*
from a left join b
on a.id=b.id;

+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
+-------+---------+-------+---------+--+
--加条件：保留左边所有字段，右边没有符合左边字段的就为null

3.右连接

--3.右外连接（右连接），不加条件和内联接查询结果相同
select a.*,b.*
from a right join b
on a.id=b.id;

+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 7     | y       | 7     | yy      |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+--+
--加条件：保留右边所有字段，左边没有符合右边字段的就为null

4.全外连接

--4.全外连接
select a.*,b.*
from a full outer join b
on a.id=b.id;

+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+--+

--加条件：左边右边字段都会保存

5.左半连接（hive独有）

--5.左半连接（hive独有）
select a.*
from a left semi join b
on a.id=b.id;

+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 2     | b       |
| 3     | c       |
| 7     | y       |
+-------+---------+--+

--查询得到与右表相同字段数据的左表数据（故select子句中不能有右表字段）

group by 分组聚合查询

select upper("abcde");--转大写函数

-- 针对每一行计算，指定字段的每行数据都会变
select ip,upper(url),access_time
from t_pv_log;

--求每条url的访问总次数
+---------------+--------------------+-----------------------+---------------+--+
|  t_pv_log.ip  |    t_pv_log.url    | t_pv_log.access_time  | t_pv_log.day  |
+---------------+--------------------+-----------------------+---------------+--+
| 192.168.22.1  | http://sina.com/a  | 2020-11-26 15:31:23   | 20201126      |
| 192.168.22.4  | http://sina.com/a  | 2020-11-26 16:21:33   | 20201126      |
| 192.168.33.1  | http://sina.com/a  | 2020-11-26 17:11:43   | 20201126      |
| 192.168.22.1  | http://sina.com/a  | 2020-11-27 15:31:23   | 20201127      |
| 192.168.22.4  | http://sina.com/a  | 2020-11-27 16:21:33   | 20201127      |
| 192.168.33.1  | http://sina.com/a  | 2020-11-27 17:11:43   | 20201127      |

| 192.168.12.6  | http://sina.com/v  | 2020-11-26 15:31:33   | 20201126      |
| 192.168.12.6  | http://sina.com/v  | 2020-11-27 15:31:33   | 20201127      |

| 192.168.22.7  | http://sina.com/f  | 2020-11-26 15:31:36   | 20201126      |
| 192.168.22.7  | http://sina.com/f  | 2020-11-27 15:31:36   | 20201127      |

| 192.168.22.1  | http://sina.com/u  | 2020-11-26 15:31:33   | 20201126      |
| 192.168.22.1  | http://sina.com/u  | 2020-11-27 15:31:33   | 20201127      |
+---------------+--------------------+-----------------------+---------------+--+

select url,count(1) as cnts
from t_pv_log
group by url;

+--------------------+-------+--+
|        url         | cnts  |
+--------------------+-------+--+
| http://sina.com/a  | 6     |
| http://sina.com/f  | 2     |
| http://sina.com/u  | 2     |
| http://sina.com/v  | 2     |
+--------------------+-------+--+
--语法：group  by 字段（字段中相同的数据划分为一组）
--注：只能按组查询字段，故select 被分组的那个字段，对分好组的数据进行逐组运算。

--求每个url访问者中ip地址最大的
select url,max(ip)
from t_pv_log
group by url;

--max():分组聚合函数（对一组数据即多行进行运算）

+--------------------+---------------+--+
|        url         |      _c1      |
+--------------------+---------------+--+
| http://sina.com/a  | 192.168.33.1  |
| http://sina.com/f  | 192.168.22.7  |
| http://sina.com/u  | 192.168.22.1  |
| http://sina.com/v  | 192.168.12.6  |
+--------------------+---------------+--+


--求每个用户访问同一页面的所有记录中时间最晚的一条
--表分组的样子
+---------------+--------------------+-----------------------+---------------+--+
|  t_pv_log.ip  |    t_pv_log.url    | t_pv_log.access_time  | t_pv_log.day  |
+---------------+--------------------+-----------------------+---------------+--+
| 192.168.22.1  | http://sina.com/a  | 2020-11-26 15:31:23   | 20201126      |
| 192.168.22.1  | http://sina.com/a  | 2020-11-27 15:31:23   | 20201127      

| 192.168.22.4  | http://sina.com/a  | 2020-11-26 16:21:33   | 20201126      |
| 192.168.22.4  | http://sina.com/a  | 2020-11-27 16:21:33   | 20201127      |

| 192.168.33.1  | http://sina.com/a  | 2020-11-26 17:11:43   | 20201126      |
| 192.168.33.1  | http://sina.com/a  | 2020-11-27 17:11:43   | 20201127      |

| 192.168.12.6  | http://sina.com/v  | 2020-11-26 15:31:33   | 20201126      |
| 192.168.12.6  | http://sina.com/v  | 2020-11-27 15:31:33   | 20201127      |

| 192.168.22.7  | http://sina.com/f  | 2020-11-26 15:31:36   | 20201126      |
| 192.168.22.7  | http://sina.com/f  | 2020-11-27 15:31:36   | 20201127      |

| 192.168.22.1  | http://sina.com/u  | 2020-11-26 15:31:33   | 20201126      |
| 192.168.22.1  | http://sina.com/u  | 2020-11-27 15:31:33   | 20201127      |
+---------------+--------------------+-----------------------+---------------+--+

select ip,url,max(access_time) as time --查询的都是分组字段和聚合函数
from t_pv_log
group by ip,url;

+---------------+--------------------+----------------------+--+
|      ip       |        url         |         time         |
+---------------+--------------------+----------------------+--+
| 192.168.12.6  | http://sina.com/v  | 2020-11-27 15:31:33  |
| 192.168.22.1  | http://sina.com/a  | 2020-11-27 15:31:23  |
| 192.168.22.1  | http://sina.com/u  | 2020-11-27 15:31:33  |
| 192.168.22.4  | http://sina.com/a  | 2020-11-27 16:21:33  |
| 192.168.22.7  | http://sina.com/f  | 2020-11-27 15:31:36  |
| 192.168.33.1  | http://sina.com/a  | 2020-11-27 17:11:43  |
+---------------+--------------------+----------------------+--+
--注：结果有多少行就被分了多少组

分组聚合综合示例

--分组聚合综合示例
--1.建表(分区表）
create table t_access(ip string,url string,access_time date)
partitioned by (dt string)
row format delimited 
fields terminated by ',';
--2.编辑分区文件
vi access.log.20201211
vi access.log.20201212
vi access.log.20201213
--3.导入数据
load data local inpath '/root/hivetest/access.log.20201211' into table t_access partition(dt='20201211');
load data local inpath '/root/hivetest/access.log.20201212' into table t_access partition(dt='20201212');
load data local inpath '/root/hivetest/access.log.20201213' into table t_access partition(dt='20201213');
--4.查询
--求12月11日以后每天http://www.edu360.cn/job总访问次数及访问者中ip地址最大的，显示url
--方法一：

select dt,'http://www.edu360.cn/job',count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt having dt>'20201211';

--方法二：
select dt,max(url),count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt having dt>'20201211';

--方法三：
select dt,url,count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt,url having dt>'20201211';

+-----------+---------------------------+------+----------------+--+
|    dt     |            url            | _c2  |      _c3       |
+-----------+---------------------------+------+----------------+--+
| 20201212  | http://www.edu360.cn/job  | 2    | 192.168.33.55  |
| 20201213  | http://www.edu360.cn/job  | 3    | 192.168.33.55  |
+-----------+---------------------------+------+----------------+--+

-- 求12月11号以后，每天每个页面的总访问次数，及访问者中ip地址中最大的

select dt,url,count(1),max(ip)
from t_access
where dt>'20201211'
group by dt,url;

+-----------+---------------------------------+------+----------------+--+
|    dt     |               url               | _c2  |      _c3       |
+-----------+---------------------------------+------+----------------+--+
| 20201212  | http://www.edu360.cn/job        | 2    | 192.168.33.55  |
| 20201212  | http://www.edu360.cn/stu        | 2    | 192.168.33.44  |
| 20201212  | http://www.edu360.cn/teach      | 1    | 192.168.44.3   |
| 20201213  | http://www.edu360.cn/excersize  | 3    | 192.168.33.46  |
| 20201213  | http://www.edu360.cn/job        | 3    | 192.168.33.55  |
| 20201213  | http://www.edu360.cn/pay        | 1    | 192.168.34.44  |
| 20201213  | http://www.edu360.cn/register   | 2    | 192.168.133.3  |
+-----------+---------------------------------+------+----------------+--+

hive 子查询

--hive 子查询

-- 求12月11号以后，每天每个页面的总访问次数，及访问者中ip地址中最大的，且只查询出总访问次数>2 的记录
-- 方式1：
select dt,url,count(1) as cnts,max(ip)
from t_access
where dt>'20201211'
group by dt,url having cnts>'2';

-- 方式2：用子查询
select dt,url,cnts,ip
from 
(select dt,url,count(1) as cnts,max(ip) as ip
from t_access
where dt>'20201211'
group by dt,url) tmp
where cnts>'2';

+-----------+---------------------------------+-------+----------------+--+
|    dt     |               url               | cnts  |       ip       |
+-----------+---------------------------------+-------+----------------+--+
| 20201213  | http://www.edu360.cn/excersize  | 3     | 192.168.33.46  |
| 20201213  | http://www.edu360.cn/job        | 3     | 192.168.33.55  |
+-----------+---------------------------------+-------+----------------+--+

hive 数据类型

数字类型、日期时间类型、字符串类型、混杂类型、复合类型（hive独有）

1.array数组类型

--1.array数组类型

--建表
create table t_movie(movie_name string,actors array<string>,first_show date)
row  format  delimited fields terminated by ','
collection items terminated by ':';

--vi movie.data
无名之辈,章宇:陈建斌:任素汐,2018-11-16
我不是药神,徐峥:王传君:章宇,2018-07-05
一出好戏,黄渤:王宝强:舒淇,2018-08-10
中国机长,欧豪:杜江:袁泉,2018-05-18
囧妈,徐峥:黄梅莹:袁泉,2020-01-25

--导数据
load data local inpath '/root/hivetest/movie.dat' into table t_movie ;

--查询
select movie_name,actors[0],first_show from t_movie;
+-------------+------+-------------+--+
| movie_name  | _c1  | first_show  |
+-------------+------+-------------+--+
| 无名之辈        | 章宇   | 2018-11-16  |
| 我不是药神       | 徐峥   | 2018-07-05  |
| 一出好戏        | 黄渤   | 2018-08-10  |
| 中国机长        | 欧豪   | 2018-05-18  |
| 囧妈          | 徐峥   | 2020-01-25  |
+-------------+------+-------------+--+

--查询徐峥参演的电影
select movie_name,actors,first_show
from t_movie
where array_contains(actors,'徐峥') ;

+-------------+--------------------+-------------+--+
| movie_name  |       actors       | first_show  |
+-------------+--------------------+-------------+--+
| 我不是药神       | ["徐峥","王传君","章宇"]  | 2018-07-05  |
| 囧妈          | ["徐峥","黄梅莹","袁泉"]  | 2020-01-25  |
+-------------+--------------------+-------------+--+

--查询电影参演人数
select movie_name
,size(actors) as act_num
,first_show
from t_movie;

+-------------+----------+-------------+--+
| movie_name  | act_num  | first_show  |
+-------------+----------+-------------+--+
| 无名之辈        | 3        | 2018-11-16  |
| 我不是药神       | 3        | 2018-07-05  |
| 一出好戏        | 3        | 2018-08-10  |
| 中国机长        | 3        | 2018-05-18  |
| 囧妈          | 3        | 2020-01-25  |
+-------------+----------+-------------+--+

2.map类型

--2.map类型

--建表
create table t_family(id int,name string,familys map<string,string>,age int)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

--vi family.dat
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26

--导数据
load data local inpath '/root/hivetest/family.dat' into table t_family;

--查出每个人的爸爸
select id,name,familys['father'],age
from t_family;

+-----+-----------+--------------+------+--+
| id  |   name    |     _c2      | age  |
+-----+-----------+--------------+------+--+
| 1   | zhangsan  | xiaoming     | 28   |
| 2   | lisi      | mayun        | 22   |
| 3   | wangwu    | wangjianlin  | 29   |
| 4   | mayun     | mayongzhen   | 26   |
+-----+-----------+--------------+------+--+

--查询出每个人有哪些亲属关系
select id,name,map_keys(familys) as relations,age
from t_family;
+-----+-----------+--------------------------------+------+--+
| id  |   name    |           relations            | age  |
+-----+-----------+--------------------------------+------+--+
| 1   | zhangsan  | ["father","mother","brother"]  | 28   |
| 2   | lisi      | ["father","mother","brother"]  | 22   |
| 3   | wangwu    | ["father","mother","sister"]   | 29   |
| 4   | mayun     | ["father","mother"]            | 26   |
+-----+-----------+--------------------------------+------+--+

-- 查出每个人的亲人名字
select id,name,map_values(familys) as f_name,age
from t_family;
+-----+-----------+-------------------------------------+------+--+
| id  |   name    |               f_name                | age  |
+-----+-----------+-------------------------------------+------+--+
| 1   | zhangsan  | ["xiaoming","xiaohuang","xiaoxu"]   | 28   |
| 2   | lisi      | ["mayun","huangyi","guanyu"]        | 22   |
| 3   | wangwu    | ["wangjianlin","ruhua","jingtian"]  | 29   |
| 4   | mayun     | ["mayongzhen","angelababy"]         | 26   |
+-----+-----------+-------------------------------------+------+--+

-- 查出每个人的亲人数量
select id,name,size(familys),age
from t_family;
+-----+-----------+------+------+--+
| id  |   name    | _c2  | age  |
+-----+-----------+------+------+--+
| 1   | zhangsan  | 3    | 28   |
| 2   | lisi      | 3    | 22   |
| 3   | wangwu    | 3    | 29   |
| 4   | mayun     | 2    | 26   |
+-----+-----------+------+------+--+

-- 查出所有拥有兄弟的人及他的兄弟是谁
--方法1
select id,name,familys['brother'],age
from t_family
where array_contains(map_keys(familys),'brother');

--方法2
select id,name,age,familys['brother']
from
(select id,name,age,map_keys(familys) as relations,familys
from t_family ) tmp
where array_contains(relations,'brother');

+-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+
| id  |   name    | age  |           relations            |                            familys                             |
+-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+
| 1   | zhangsan  | 28   | ["father","mother","brother"]  | {"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"}  |
| 2   | lisi      | 22   | ["father","mother","brother"]  | {"father":"mayun","mother":"huangyi","brother":"guanyu"}       |
| 3   | wangwu    | 29   | ["father","mother","sister"]   | {"father":"wangjianlin","mother":"ruhua","sister":"jingtian"}  |
| 4   | mayun     | 26   | ["father","mother"]            | {"father":"mayongzhen","mother":"angelababy"}                  |
+-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+

+-----+-----------+------+---------+--+
| id  |   name    | age  |   _c3   |
+-----+-----------+------+---------+--+
| 1   | zhangsan  | 28   | xiaoxu  |
| 2   | lisi      | 22   | guanyu  |
+-----+-----------+------+---------+--+

3.struct类型

--3.struct类型

--建表
create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';

--vi user.dat
1,zhangsan,18:male:深圳
2,lisi,28:female:北京
3,wangwu,38:male:广州
4,赵六,26:female:上海
5,钱琪,35:male:杭州
6,王八,48:female:南京

--导数据
load data local inpath '/root/hivetest/user.dat' into table t_user;

--查询每个人的id,name,和地址
select id,name,info.addr
from t_user;
+-----+-----------+-------+--+
| id  |   name    | addr  |
+-----+-----------+-------+--+
| 1   | zhangsan  | 深圳    |
| 2   | lisi      | 北京    |
| 3   | wangwu    | 广州    |
| 4   | 赵六        | 上海    |
| 5   | 钱琪        | 杭州    |
| 6   | 王八        | 南京    |
+-----+-----------+-------+--+

hive 的函数

字符串函数、类型转换函数、数学运算函数、时间函数、表生成函数

--字符串函数
select substr("abcgfik",1,3); --用常量测试函数
select substr("abcgfik",0,3);
+------+--+
| _c0  |
+------+--+
| abc  |
+------+--+
--结果一致

表生成函数

--表生成函数（explode()    lateral view的使用）
--wordcount示例
--建表
create table t_wc(sentence string);


--vi word.dat
select name string name
name select from id info
select select id id name info
string from id
id string name

--导数据
load data local inpath '/root/hivetest/word.dat' into table t_wc;

--查询
select  explode(split(sentence,' ')) as word --按空格切割，形成数组，利用行转列函数explode()炸开
from t_wc;

+---------+--+
|  word   |
+---------+--+
| select  |
| name    |
| string  |
| name    |
| name    |
| select  |
| from    |
| id      |
| info    |
| select  |
| select  |
| id      |
| id      |
| name    |
| info    |
| string  |
| from    |
| id      |
| id      |
| string  |
| name    |
+---------+--+

select word,count(1) as conts
from
(
select  explode(split(sentence,' ')) as word
from t_wc
) tmp
group by word;

+---------+--------+--+
|  word   | conts  |
+---------+--------+--+
| from    | 2      |
| id      | 5      |
| info    | 2      |
| name    | 5      |
| select  | 4      |
| string  | 3      |
+---------+--------+--+

lateral view

--lateral view
--建表
create table t_subject(id int,name string,subjects array<string>)
row format delimited fields terminated by ' '
collection items terminated by ',';

--vi subjects.txt
001 zhangsan 物理,化学,数学,英语,生物
002 lisi 生物,历史,数学,语文,化学
003 wangwu 体育,美术,化学,语文,英语,数学

--导数据
load data local inpath '/root/hivetest/subject.txt' into table t_subject;


--查询有哪些课程
select distinct tmp.sub
from(
select explode(subjects) as sub 
from t_subject
) tmp;
+----------+--+
| tmp.sub  |
+----------+--+
| 体育       |
| 化学       |
| 历史       |
| 数学       |
| 物理       |
| 生物       |
| 美术       |
| 英语       |
| 语文       |
+----------+--+

--lateral view 示例(lateral view将explode函数的结果生成新表)
select id,name,tmp.sub
from t_subject
lateral view
explode(subjects) tmp as sub ;
+-----+-----------+----------+--+
| id  |   name    | tmp.sub  |
+-----+-----------+----------+--+
| 1   | zhangsan  | 物理       |
| 1   | zhangsan  | 化学       |
| 1   | zhangsan  | 数学       |
| 1   | zhangsan  | 英语       |
| 1   | zhangsan  | 生物       |
| 2   | lisi      | 生物       |
| 2   | lisi      | 历史       |
| 2   | lisi      | 数学       |
| 2   | lisi      | 语文       |
| 2   | lisi      | 化学       |
| 3   | wangwu    | 体育       |
| 3   | wangwu    | 美术       |
| 3   | wangwu    | 化学       |
| 3   | wangwu    | 语文       |
| 3   | wangwu    | 英语       |
| 3   | wangwu    | 数学       |
+-----+-----------+----------+--+