<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="configuration.xsl"?><configuration><property><name>javax.jdo.option.ConnectionURL</name><value>jdbc:mysql://hadoop001:3306/myhive?createDatabaseIfNotExist=true&useSSL=false</value><description>the URL of the MySQL database</description></property><property><name>javax.jdo.option.ConnectionDriverName</name><value>com.mysql.jdbc.Driver</value></property><property><name>javax.jdo.option.ConnectionUserName</name><value>hive</value></property><property><name>javax.jdo.option.ConnectionPassword</name><value>123456</value></property></configuration>
建表
create table emp
(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
row format delimited
fields terminated by ',';
create table dept
(
deptno int,
dname string,
loc string
)
row format delimited
fields terminated by ',';
load data local inpath 'dept.csv' overwrite into table dept;
load data local inpath 'emp.csv' overwrite into table emp;
常用命令
SHOW FUNCTIONS;--查看所有内置函数
DESCRIBE FUNCTION <function_name>;--查看某个函数的描述
DESCRIBE FUNCTION EXTENDED <function_name>;--查看某个函数的
desc user_base;--查看表结构
desc formatted user_base;--查看详细表结构
具体使用方法
show tables;
show tables like '*tab*';--模糊查询
show functions like 'a*';--模糊查询
set hive.cli.print.header=true;--参数设置可以参考官网的 Hive wiki=》 Hive Configuration Properties
数组
hb,wh#xy#yc
gd,gz#sz#dg
sc,cd#pzh
create table t_array(
province string,
city array<string>)
comment 'this is table'
row format delimited
fields terminated by ','
collection items terminated by '#'--定义数组的每个数据项的分割符
使用array中元素,访问数组中的某一个元素
hive中数组元素的使用,同样是采用下标的方式,下标从0开始
select province,city[0] from t_array;
数组长度
size()函数
select province,size(city) from t_array where size(city)>2;
确定数组中是否含有某一个元素
arr_contains(数组名,值)!arr_contains(数组名,值)
## 查看数组中的每一个元素explode()explode()会将数组元素展开展示
select explode(city) from t_array;
扩展: 统计词数,可以将整个文件导入到一个列表中中,然后用split函数将其转成数组,再用explode()将其展开,最后在分组统计
比如有如下文件
hello,world,welcome
hello,welcome
you,are,welcome
how,are,you
hive> select split(w,',') from hive_wc;
OK
["hello","world","welcome"]["hello","welcome"]["you","are","welcome"]["how","are","you"]
hive> select wd,count(0) from (select explode(split(w,',')) wd from hive_Wc) t group by wd;
are 2
hello 2
how 1
welcome 3
world 1
you 2
虚拟表 lateral view
lateral view 会将explode生成的结果放到一个虚拟表中,然后这个虚拟表会和输入行即每个province进行join,来达到数据聚合的目的
select province, scity # 注意:这里用的是scity,是和lateral view()保持一致的
from t_array
lateral view explode(city) city as scity;
#要进行聚合的虚拟表,lateral view explode(字段) 虚拟表名 as 虚拟表字段
hive> select province, scity from t_array lateral view explode(city) city as scity;
OK
hb wh
hb xy
hb yc
gd gz
gd sz
gd dg
sc cd
sc pzh
Time taken:0.225 seconds, Fetched:8row(s)
hive> select * from t_array where city is not null;
OK
hb ["wh","xy","yc "]
gd ["gz","sz","dg"]
sc ["cd","pzh "]
Time taken:0.072 seconds, Fetched:3row(s)
hive> select explode(city) from t_array;
OK
wh
xy
yc
gz
sz
dg
cd
pzh
Time taken:0.049 seconds, Fetched:8row(s)
hive> select province, scity from t_array lateral view explode(city) city as scity;
OK
hb wh
hb xy
hb yc
gd gz
gd sz
gd dg
sc cd
sc pzh
Time taken:0.054 seconds, Fetched:8row(s)
hive>
读取json数据
get_json_object :用来解析json字符串的一个字段
json_tuple: 用来解析json字符串中的多个字段
select json_tuple('{"name":"jack","server":"www.qq.com"}','server','name') from src
分区表动态分区,打开开关,将分区字符写在select的最后一列
分析函数
<窗口函数>()
OVER
([PARTITION BY <列清单>][ORDER BY <排序用清单列>][ASC/DESC](ROWS | RANGE)<范围条件>)
函数子句:指明具体操作,如sum-求和,first_value-取第一个值;
partition by子句:指明分区字段,如果没有,则将所有数据作为一个分区;
order by子句:指明了每个分区排序的字段和方式,也是可选的,没有就是按照表中的顺序;
窗口子句:指明相对当前记录的计算范围,可以向上(preceding),可以向下(following),也可以使用between指明,上下边界的值,没有的话默认为当前分区。有些场景比较特殊,后文会讲到这种场景。
--查询部门工资最高的那个人,下面两个语句效果是一样的
select * from (select ename,sal,first_value(sal)over(partition by deptno order by sal desc) as maxsal from emp) t where sal=maxsal;
select * from (select ename,sal,max(sal)over(partition by deptno) as maxsal from emp) t where sal=maxsal;
KING 5000.05000.0
SCOTT 3000.03000.0
FORD 3000.03000.0
BLAKE 2850.02850.0--累计计算 需要配合order by 使用
select deptno,ename,sal,sum(sal)over(partition by deptno order by sal) from emp order by deptno
10 MILLER 1300.01300.010 CLARK 2450.03750.010 KING 5000.08750.020 SCOTT 3000.010875.020 FORD 3000.010875.020 JONES 2975.04875.020 ADAMS 1100.01900.020 SMITH 800.0800.030 TURNER 1500.04950.030 ALLEN 1600.06550.030 MARTIN 1250.03450.030 JAMES 950.0950.030 BLAKE 2850.09400.030 WARD 1250.03450.0
可以实现累计是因为标准聚合函数作为窗口函数使用的时候,在指明order by的情况下,如果没有Window子句,则Window子句默认为:RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW(上边界不限制,下边界到当前行)。
--移动计算
select deptno,ename,sal,sum(sal)over(partition by deptno order by sal rows between 2 preceding and2 following) from emp order by deptno
select deptno,ename,sal,sum(sal)over(partition by deptno order by sal rows 2 preceding) from emp order by deptno
--排序
排序对应的四个窗口函数为:rank、dense_rank、row_number、ntitle
rank:计算排序时,如果存在相同位次的记录,则会跳过之后的位次。
e.g. 有三条记录排在第1位时:1位、1位、1位、4位......
dense_rank:计算排序时,即使存在相同位次的记录,也不会跳过之后的位次。
e.g. 有三条记录排在第1位时:1位、1位、1位、2位......
row_number:赋予唯一的连续位次。
e.g. 有三条记录排在第1位时:1位、2位、3位、4位
数组湖北,武汉#襄阳#宜昌 广东,广州#深圳#东莞四川,成都#攀枝花create table t_array(province string,city array<string>)comment 'this is table'row format delimited fields terminated by '|'collection items terminated by '#' --定义数组的每个数据项的分割符使用array中元素,访问数组中的某一个元素hive