HiveQL数据操作
1.向表中装载数据
-- Load a local file into the employees table, overwriting the US/CA partition.
LOAD DATA LOCAL INPATH '/data/employees'
OVERWRITE INTO TABLE employees
PARTITION (country = 'US', state = 'CA')
如果分区目录不存在,此命令会自动创建分区目录,再将文件拷贝到分区目录下
如果目标表是非分区表,则省略partition子句
Local关键字表示本地目录,省略则表示HDFS中的目录
overwrite关键字表示覆盖性的对表数据进行装载
2.通过查询语句向表中插入数据
hive> insert overwrite table employees05 select * from employees; #将查询结果插入
hive> create table employees05 as select * from employees; #创建表的同时插入数据
#从现有表向分区表插入数据
-- Staging table that feeds the partitioned demo tables below.
CREATE TABLE staged_employees (name STRING, country STRING, state STRING);
-- Seed the staging table with a small sample data set.
INSERT INTO staged_employees VALUES
  ('zhao', 'China', 'OR'),
  ('qian', 'US', 'OR'),
  ('sun', 'US', 'CA'),
  ('li', 'US', 'IL'),
  ('zhou', 'US', 'OR'),
  ('wu', 'US', 'CA'),
  ('zheng', 'US', 'IL');
-- Partitioned target table: country/state are partition columns, not data columns.
CREATE TABLE employees05 (name STRING)
PARTITIONED BY (country STRING, state STRING)
ROW FORMAT DELIMITED LINES TERMINATED BY '\n';
-- Fill one static partition from the staging table.
INSERT OVERWRITE TABLE employees05
PARTITION (country = 'US', state = 'OR')
SELECT name
FROM staged_employees se
WHERE se.country = 'US' AND se.state = 'OR';
#一次扫描数据,同时使用多种方式处理数据(目标表是分区表或非分区表均可)
-- Single scan of staged_employees feeding three static partitions at once.
FROM staged_employees se
INSERT OVERWRITE TABLE employees05 PARTITION (country = 'US', state = 'OR')
  SELECT name WHERE se.country = 'US' AND se.state = 'OR'
INSERT OVERWRITE TABLE employees05 PARTITION (country = 'US', state = 'CA')
  SELECT name WHERE se.country = 'US' AND se.state = 'CA'
INSERT OVERWRITE TABLE employees05 PARTITION (country = 'US', state = 'IL')
  SELECT name WHERE se.country = 'US' AND se.state = 'IL';
执行结果如下
hive (onhive)> select * from employees05;
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
li US IL
zheng US IL
qian US OR
zhou US OR
hive (onhive)> show partitions employees05;
OK
partition
country=US/state=CA
country=US/state=IL
country=US/state=OR
#动态分区插入
假设staged_employees表中存在100个国家和州的组合的话,执行下边的查询成功后,employees05表中将会有100个分区。
hive (onhive)> insert overwrite table employees05
> partition(country,state)
> select name,country,state from staged_employees
> where country='US';
FAILED: SemanticException [Error 10096]: Dynamic partition strict mode requires at least one static partition column. To turn this off set hive.exec.dynamic.partition.mode=nonstrict
此处报错:原因是此版本(hive2.3.6)动态分区功能默认情况下是启用的,以“严格”模式执行,在这种模式下要求至少有一列分区字段是静态的。这种“严格”模式有助于阻止因设计错误导致查询产生大量的分区。
#默认的动态分区属性
[root@bigdata hive_query]# hive -S -e "set" |grep 'hive.exec.*dynamic.*'
hive.exec.dynamic.partition=true #设置成true,表示开启动态分区功能
hive.exec.dynamic.partition.mode=strict #设置成nonstrict,表示允许所有分区都是动态的
hive.exec.max.dynamic.partitions=1000 #一个动态分区创建语句可以创建的最大动态分区个数
hive.exec.max.dynamic.partitions.pernode=100 #每个mapper或reducer可以创建的最大动态分区个数
[root@bigdata hive_query]# hive -S -e "set" |grep 'hive.exec.max.created.files'
hive.exec.max.created.files=100000 #全局可以创建的最大文件个数,有一个Hadoop计数器会跟踪记录创建了多少个文件。
#动、静态分区混用插入
静态分区键必须出现在动态分区键之前
-- Hybrid insert: country is a static partition key, state is resolved
-- dynamically per row (static keys must precede dynamic ones).
INSERT OVERWRITE TABLE employees05
PARTITION (country = 'US', state)
SELECT name, state
FROM staged_employees
WHERE country = 'US';
hive (onhive)> select * from employees05;
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
li US IL
zheng US IL
qian US OR
zhou US OR
hive (onhive)> show partitions employees05;
OK
partition
country=US/state=CA
country=US/state=IL
country=US/state=OR
3.导出数据
目的:从表中导出数据
#1)数据文件是用户需要的格式:拷贝文件夹或文件
hadoop fs -cp source_path target_path
#2)输出自定义数据格式
hive (onhive)> insert overwrite local directory '/tmp/ca_employees'
> select * from employees05 where state='CA';
[root@bigdata hive_query]# cd /tmp/ca_employees/
[root@bigdata ca_employees]# ls
000000_0
hive (onhive)> !ls /tmp/ca_employees;
000000_0
hive (onhive)> !cat /tmp/ca_employees/000000_0;
sunUSCA
wuUSCA
和向表中插入数据一样,用户也可以通过如下方式指定多个输出文件夹目录:
--输出到HDFS
hive (onhive)> from staged_employees se
> insert overwrite directory '/tmp/or_emp'
> select * where se.country='US' and se.state='OR'
> insert overwrite directory '/tmp/ca_emp'
> select * where se.country='US' and se.state='CA'
> insert overwrite directory '/tmp/IL_emp'
> select * where se.country='US' and se.state='IL';
hive (onhive)> dfs -ls /tmp/*emp;
Found 1 items
-rwx-wx-wx 3 root supergroup 21 2019-09-17 14:16 /tmp/IL_emp/000000_0
Found 1 items
-rwx-wx-wx 3 root supergroup 19 2019-09-17 14:16 /tmp/ca_emp/000000_0
Found 1 items
-rwx-wx-wx 3 root supergroup 22 2019-09-17 14:16 /tmp/or_emp/000000_0
--输出到本地
hive (onhive)> from staged_employees se
> insert overwrite local directory '/tmp/or_emp'
> select * where se.country='US' and se.state='OR'
> insert overwrite local directory '/tmp/ca_emp'
> select * where se.country='US' and se.state='CA'
> insert overwrite local directory '/tmp/IL_emp'
> select * where se.country='US' and se.state='IL';
[root@bigdata ca_employees]# ls -l /tmp |grep emp
drwxr-xr-x 2 root root 43 Sep 17 14:29 ca_emp
drwxr-xr-x 2 root root 43 Sep 17 13:52 ca_employees
drwxr-xr-x 2 root root 43 Sep 17 14:29 IL_emp
drwxr-xr-x 2 root root 43 Sep 17 14:29 or_emp
HiveQL查询
1.SELECT …FROM
hive (onhive)>
> desc employees05;
OK
col_name data_type comment
name string
country string
state string
# Partition Information
# col_name data_type comment
country string
state string
Time taken: 3.062 seconds, Fetched: 9 row(s)
--查询过程中表,字段别名的使用与Oracle类似,AS关键字可用或不用
hive (onhive)> select em.name n,em.country c,em.state AS s from employees05 em;
OK
n c s
sun US CA
wu US CA
li US IL
zheng US IL
qian US OR
zhou US OR
2.JSON类型的查询(MAP,ARRAY,STRUCT)
此处可以回顾【hive编程-琐碎02】中的内容
-- Demo table exercising Hive complex column types (ARRAY / MAP / STRUCT).
-- NOTE(review): 'sa1ary' (with digit 1) looks like a typo for 'salary',
-- but it is kept as-is because later queries in these notes reference it.
CREATE TABLE onhive.employees(
name STRING,
sa1ary FLOAT,
subordinates ARRAY<STRING>,
deductions MAP<STRING, FLOAT>,
address STRUCT<street:STRING, city:STRING, state:STRING, zip:INT>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' -- field (column) separator
COLLECTION ITEMS TERMINATED BY '_' -- separator for MAP/STRUCT/ARRAY elements
MAP KEYS TERMINATED BY ':' -- separator between a MAP key and its value
LINES TERMINATED BY '\n'; -- row separator
测试数据:
vi test.txt
John Doe,100000.0,Mary Smith_Todd Jones,Federal Taxes:0.2_State Taxes:0.05_Insurance:0.1,1 Michigan Ave._Chicago_1L_60600
Tom Smith,90000.0,Jan_Hello Ketty,Federal Taxes:0.2_State Taxes:0.05_Insurance:0.1,Guang dong._China_0.5L_60661
ps:注意,MAP,STRUCT和ARRAY里的元素间关系都可以用同一个字符表示,这里用“_”。
导入数据
load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/test.txt' into table onhive.employees;
#local表示本地路径中寻找目标文件,不加local则到HDFS目录寻找
hive> select * from employees;
OK
employees.name employees.sa1ary employees.subordinates employees.deductions employees.address
John Doe 100000.0 ["Mary Smith","Todd Jones"] {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1} {"street":"1 Michigan Ave.","city":"Chicago","state":"1L","zip":60600}
Tom Smith 90000.0 ["Jan","Hello Ketty"] {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1} {"street":"Guang dong.","city":"China","state":"0.5L","zip":60661}
Time taken: 3.839 seconds, Fetched: 2 row(s)
#Array,Map,Struct的数据取用方式:
hive> select subordinates[0],deductions['Federal Taxes'],address.city from employees;
OK
_c0 _c1 city
Mary Smith 0.2 Chicago
Jan 0.2 China
Time taken: 0.841 seconds, Fetched: 2 row(s)
3.Limit子句
LIMIT子句用于限制返回行数,oracle中可使用where rownum<3控制
hive (onhive)> select em.name n,em.country c, em.state as s from employees05 em limit 3;
OK
n c s
sun US CA
wu US CA
li US IL
4.Like and RLike
1)Like 用于模糊匹配类似的值
2)%表示0或多个任意字符
_代表一个字符
3)RLike子句是Hive中的扩展功能,可以通过Java正则表达式这个更强大的语言来指定匹配条件
hive (onhive)> select * from employees05 where state rlike '[C]';
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
hive (onhive)> select * from employees05 where state like 'C%';
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
5.GROUP BY
使用方法与Oracle一致,出现在select子句中而不在group by中的字段必须放在分组函数中,否则报错
hive (onhive)> select country,state,count(*) from staged_employees group by country,state;
country state _c2
China OR 1
US CA 2
US IL 2
US OR 2
6.Having
having 与where的不同点:
1)where针对表中列筛选数据;having针对查询结果的列筛选数据
2)where子句不能有分组函数;having可以
3)使用having必须存在group by
hive (onhive)> select country,state,count(*) from staged_employees group by country,state having count(*) >1;
country state _c2
US CA 2
US IL 2
US OR 2
7.CASE … WHEN … THEN
hive (onhive)>
> select name,sa1ary,
> case
> when sa1ary <5000.0 then 'low'
> when sa1ary >=5000.0 and sa1ary<7000.0 then 'mid'
> when sa1ary>=7000.0 then 'high'
> end as bracket from employees;
OK
name sa1ary bracket
John Doe 100000.0 high
Tom Smith 90000.0 high
8.避免使用MapReduce的情况(本地模式)
1)对表的简单读取
select * from tab_name;
2)where 语句过滤条件中只是分区字段(无论是否使用limit限制输出记录条数)
select * from employees05 where country ='US' and state='CA' limit 100;
3)属性hive.exec.mode.local.auto值设置成true,Hive还会尝试使用本地模式执行其他的操作。
hive (onhive)> set hive.exec.mode.local.auto;
hive.exec.mode.local.auto=false
9.浮点数的比较
对于一个float类型,用户输入0.2,实际hive存储的值为0.2000001
对于一个double类型,用户输入0.2,实际hive存储的值为0.200000000000001
在查询时过滤条件为 where float_col>0.2 ,此处的0.2对于hive是存储成double类型的
总之在浮点数中0.2的实际值>0.2,而且,cast(0.2 as float)> cast(0.2 as double)
所以在浮点数尤其是float和double类型进行比较时容易出现错误
解决:
1)显式转换成统一类型的浮点数进行比较 cast(0.2 as float)
2)在金钱相关的列避免使用浮点数
Join
1.等值Join
Hive支持通常的SQL JOIN语句,但是只支持等值连接,不支持非等值连接,连接谓词中不支持or
hive (onhive)> select se.name from staged_employees se,employees05 e where se.name=e.name;
--等价于
hive (onhive)> select se.name from staged_employees se join employees05 e on se.name=e.name;
2.left join
A left join B :返回的集合行数以连接左边的表A为准,集合中B表的部分,保留A与B的交集,与A不匹配的行以NULL填充
#为测试实验效果重新装载staged_employees
[root@bigdata hive_query]# vi test01.txt
zhang,China,CA
wang,US,CA
zhao,UK,BE
lin,China,CA
lang,US,CA
wu,US,CA
li,US,IL
zheng,US,IL
qian,US,OR
zhou,US,OR
[root@bigdata hive_query]# hive -S -e "load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/test01.txt' overwrite into table staged_employees";
hive (onhive)> select * from employees05;
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
li US IL
zheng US IL
qian US OR
zhou US OR
hive (onhive)> select se.name,se.country,se.state,e.name,e.country,e.state from staged_employees se left join employees05 e on se.name = e.name;
se.name se.country se.state e.name e.country e.state
zhang China CA NULL NULL NULL
wang US CA NULL NULL NULL
zhao UK BE NULL NULL NULL
lin China CA NULL NULL NULL
lang US CA NULL NULL NULL
wu US CA wu US CA
li US IL li US IL
zheng US IL zheng US IL
qian US OR qian US OR
zhou US OR zhou US OR
3.right join
A right join B :返回的集合行数以连接右边的表B为准,集合中A表的部分,保留A与B的交集,与B不匹配的行以NULL填充
hive (onhive)> select se.name,se.country,se.state,e.name,e.country,e.state from staged_employees se right join employees05 e on se.name = e.name;
se.name se.country se.state e.name e.country e.state
NULL NULL NULL sun US CA
wu US CA wu US CA
li US IL li US IL
zheng US IL zheng US IL
qian US OR qian US OR
zhou US OR zhou US OR
4.full join
A full join B :返回A与B的并集,A与B中不与彼此匹配的行以NULL填充
hive (onhive)> select se.name,se.country,se.state,e.name,e.country,e.state from staged_employees se full join employees05 e on se.name = e.name;
se.name se.country se.state e.name e.country e.state
lang US CA NULL NULL NULL
li US IL li US IL
lin China CA NULL NULL NULL
qian US OR qian US OR
NULL NULL NULL sun US CA
wang US CA NULL NULL NULL
wu US CA wu US CA
zhang China CA NULL NULL NULL
zhao UK BE NULL NULL NULL
zheng US IL zheng US IL
zhou US OR zhou US OR
5.每个MapReduce内部排序(Sort By)
Sort By:每个MapReduce内部进行排序,(区内排序)对全局结果集来说不是排序。
1)设置reduce个数
hive (default)> set mapreduce.job.reduces=3;
2)查看设置reduce个数
hive (default)> set mapreduce.job.reduces;
hive (onhive)> select * from staged_employees sort by name;
6.分区排序(Distribute By)
Distribute By:类似MR中partition,进行分区,结合sort by使用。
注意,Hive要求DISTRIBUTE BY语句要写在SORT BY语句之前。
对于distribute by进行测试,一定要分配多reduce进行处理,否则无法看到distribute by的效果。
hive (onhive)> select * from staged_employees distribute by country sort by state;
7.Cluster By
当distribute by和sorts by字段相同时,可以使用cluster by方式。
cluster by除了具有distribute by的功能外还兼具sort by的功能。但是排序只能是升序排序,不能指定排序规则为ASC或者DESC。
1)以下两种写法等价
hive (default)> select * from emp cluster by deptno;
hive (default)> select * from emp distribute by deptno sort by deptno;
注意:按照部门编号分区,不一定就是固定死的数值,可以是20号和30号部门分到一个分区里面去。
hive (onhive)> select * from staged_employees distribute by country sort by country;
staged_employees.name staged_employees.country staged_employees.state
lin China CA
zhang China CA
zhou US OR
qian US OR
zheng US IL
li US IL
wu US CA
lang US CA
wang US CA
zhao UK BE
分桶及抽样查询
1.分桶表数据存储##
分区针对的是数据的存储路径;分桶针对的是数据文件。
分区提供一个隔离数据和优化查询的便利方式。不过,并非所有的数据集都可形成合理的分区,特别是之前所提到过的要确定合适的划分大小这个疑虑。
分桶是将数据集分解成更容易管理的若干部分的另一个技术。
(1)创建分桶表
-- Bucketed table: rows are hashed on id into 4 bucket files.
CREATE TABLE stu_buck (id INT, name STRING)
CLUSTERED BY (id) INTO 4 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
(2)查看表结构
hive (onhive)> desc formatted stu_buck;
Num Buckets: 4
(3)准备数据
[root@bigdata hive_query]# vi stu_buck.txt
100 s100
101 s101
102 s102
103 s103
104 s104
105 s105
106 s106
110 s110
112 s112
120 s120
(4)导入数据-失败
hive (onhive)> load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/stu_buck.txt' into table stu_buck;
FAILED: SemanticException Please load into an intermediate table and use 'insert... select' to allow Hive to enforce bucketing. Load into bucketed tables are disabled for safety reasons. If you know what you are doing, please sethive.strict.checks.bucketing to false and that hive.mapred.mode is not set to 'strict' to proceed. Note that if you may get errors or incorrect results if you make a mistake while using some of the unsafe features.
--此处提示:出于安全考虑,直接向分桶表导入数据默认是不允许的,可以使用insert...select方式导入数据;如果坚持直接导入,需要设置参数hive.strict.checks.bucketing=false,并保证hive.mapred.mode未设置为'strict'
换个策略:创建普通表,并用子查询方式导入数据
(1)先建一个普通的stu表
-- Plain staging table used to feed the bucketed table via INSERT ... SELECT.
CREATE TABLE stu (id INT, name STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
(2)向普通的stu表中导入数据
load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/stu_buck.txt' into table stu;
(3)清空stu_buck表中数据
truncate table stu_buck;
select * from stu_buck;
(4)导入数据到分桶表,通过子查询的方式
-- Route the data through a query so Hive can enforce bucketing on write.
INSERT INTO TABLE stu_buck
SELECT id, name
FROM stu;
(5)需要设置一个属性,hive2.3.6已经没有这个参数了,而且默认已经开启了分桶
set hive.enforce.bucketing=true;
(6)查询分桶表的数据
通过直接查看表数据能看出来,这个版本的hive分桶表每桶的数据已经按照倒序排列
hive (onhive)> select * from stu_buck;
OK
stu_buck.id stu_buck.name
120 s120
112 s112
104 s104
100 s100
105 s105
101 s101
110 s110
106 s106
102 s102
103 s103
2.分桶抽样查询
查询第一桶数据
hive (onhive)> select * from stu_buck tablesample(bucket 1 out of 4 on id);
OK
stu_buck.id stu_buck.name
120 s120
112 s112
104 s104
100 s100
tablesample(bucket x out of y)
x:表示从第几桶开始抽数据(1,2,3,4)
y:表示抽数据的比例,是抽数据的分母
比如: 有4个分桶
tablesample(bucket 1 out of 16) 表示从第一桶开始抽数据,抽取第一桶数据的比例为(4(桶数)/16(分母))=1/4,抽取第一桶四分之一的数据
tablesample(bucket 2 out of 32) 表示从第二桶开始抽数据,抽取第二桶数据的比例为(4(桶数)/32(分母))=1/8,抽取第二桶八分之一的数据
注意:x的值必须小于等于y的值,否则
FAILED: SemanticException [Error 10061]: Numerator should not be bigger than denominator in sample clause for table stu_buck
3.数据块抽样
Hive提供了另外一种按照百分比进行抽样的方式,这种是基于行数的,按照输入路径下的数据块百分比进行的抽样。
hive (default)> select * from stu tablesample(0.1 percent) ;
提示:这种抽样方式不一定适用于所有的文件格式。另外,这种抽样的最小抽样单元是一个HDFS数据块。因此,如果表的数据大小小于普通的块大小128M的话,那么将会返回所有行。