HiveQL数据操作
1.向表中装载数据
-- Load a local file into the employees table, overwriting the US/CA partition.
LOAD DATA LOCAL INPATH '/data/employees'
OVERWRITE INTO TABLE employees
PARTITION (country = 'US', state = 'CA')
如果分区目录不存在,此命令会自动创建分区目录,再将文件拷贝到分区目录下
如果目标表是非分区表,则省略partition子句
Local关键字表示本地目录,省略则表示HDFS中的目录
overwrite关键字表示覆盖性的对表数据进行装载
2.通过查询语句向表中插入数据
hive> insert overwrite table employees05 select * from employees; #将查询结果插入
hive> create table employees05 as select * from employees; #创建表的同时插入数据
#从现有表向分区表插入数据
-- Staging table that feeds the partitioned demo tables below.
CREATE TABLE staged_employees (name STRING, country STRING, state STRING);
-- Seed the staging table with a small sample data set.
INSERT INTO staged_employees VALUES
  ('zhao', 'China', 'OR'),
  ('qian', 'US', 'OR'),
  ('sun', 'US', 'CA'),
  ('li', 'US', 'IL'),
  ('zhou', 'US', 'OR'),
  ('wu', 'US', 'CA'),
  ('zheng', 'US', 'IL');
-- Partitioned target table: country/state are partition columns, not data columns.
CREATE TABLE employees05 (name STRING)
PARTITIONED BY (country STRING, state STRING)
ROW FORMAT DELIMITED LINES TERMINATED BY '\n';
-- Fill one static partition from the staging table.
INSERT OVERWRITE TABLE employees05
PARTITION (country = 'US', state = 'OR')
SELECT name
FROM staged_employees se
WHERE se.country = 'US' AND se.state = 'OR';
#一次扫描数据,同时使用多种方式处理数据(目标表是分区表或非分区表均可)
-- Single scan of staged_employees feeding three static partitions at once.
FROM staged_employees se
INSERT OVERWRITE TABLE employees05 PARTITION (country = 'US', state = 'OR')
  SELECT name WHERE se.country = 'US' AND se.state = 'OR'
INSERT OVERWRITE TABLE employees05 PARTITION (country = 'US', state = 'CA')
  SELECT name WHERE se.country = 'US' AND se.state = 'CA'
INSERT OVERWRITE TABLE employees05 PARTITION (country = 'US', state = 'IL')
  SELECT name WHERE se.country = 'US' AND se.state = 'IL';
执行结果如下
hive (onhive)> select * from employees05;
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
li US IL
zheng US IL
qian US OR
zhou US OR
hive (onhive)> show partitions employees05;
OK
partition
country=US/state=CA
country=US/state=IL
country=US/state=OR
#动态分区插入
假设staged_employees表中存在100个国家和州的组合的话,执行下边的查询成功后,employees05表中将会有100个分区。
hive (onhive)> insert overwrite table employees05
> partition(country,state)
> select name,country,state from staged_employees
> where country='US';
FAILED: SemanticException [Error 10096]: Dynamic partition strict mode requires at least one static partition column. To turn this off set hive.exec.dynamic.partition.mode=nonstrict
此处报错:原因是此版本(hive2.3.6)动态分区功能默认情况下是启用的,以“严格”模式执行,在这种模式下要求至少有一列分区字段是静态的。这种“严格”模式有助于阻止因设计错误导致查询产生大量的分区。
#默认的动态分区属性
[root@bigdata hive_query]# hive -S -e "set" |grep 'hive.exec.*dynamic.*'
hive.exec.dynamic.partition=true #设置成true,表示开启动态分区功能
hive.exec.dynamic.partition.mode=strict #设置成nonstrict,表示允许所有分区都是动态的
hive.exec.max.dynamic.partitions=1000 #一个动态分区创建语句可以创建的最大动态分区个数
hive.exec.max.dynamic.partitions.pernode=100 #每个mapper或reducer可以创建的最大动态分区个数
[root@bigdata hive_query]# hive -S -e "set" |grep 'hive.exec.max.created.files'
hive.exec.max.created.files=100000 #全局可以创建的最大文件个数,有一个Hadoop计数器会跟踪记录创建了多少个文件。
#动、静态分区混用插入
静态分区键必须出现在动态分区键之前
-- Hybrid insert: country is a static partition key, state is resolved
-- dynamically per row (static keys must precede dynamic ones).
INSERT OVERWRITE TABLE employees05
PARTITION (country = 'US', state)
SELECT name, state
FROM staged_employees
WHERE country = 'US';
hive (onhive)> select * from employees05;
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
li US IL
zheng US IL
qian US OR
zhou US OR
hive (onhive)> show partitions employees05;
OK
partition
country=US/state=CA
country=US/state=IL
country=US/state=OR
3.导出数据
目的:从表中导出数据
#1)数据文件是用户需要的格式:拷贝文件夹或文件
hadoop fs -cp source_path target_path
#2)输出自定义数据格式
hive (onhive)> insert overwrite local directory '/tmp/ca_employees'
> select * from employees05 where state='CA';
[root@bigdata hive_query]# cd /tmp/ca_employees/
[root@bigdata ca_employees]# ls
000000_0
hive (onhive)> !ls /tmp/ca_employees;
000000_0
hive (onhive)> !cat /tmp/ca_employees/000000_0;
sunUSCA
wuUSCA
和向表中插入数据一样,用户也可以通过如下方式指定多个输出文件夹目录:
--输出到HDFS
hive (onhive)> from staged_employees se
> insert overwrite directory '/tmp/or_emp'
> select * where se.country='US' and se.state='OR'
> insert overwrite directory '/tmp/ca_emp'
> select * where se.country='US' and se.state='CA'
> insert overwrite directory '/tmp/IL_emp'
> select * where se.country='US' and se.state='IL';
hive (onhive)> dfs -ls /tmp/*emp;
Found 1 items
-rwx-wx-wx 3 root supergroup 21 2019-09-17 14:16 /tmp/IL_emp/000000_0
Found 1 items
-rwx-wx-wx 3 root supergroup 19 2019-09-17 14:16 /tmp/ca_emp/000000_0
Found 1 items
-rwx-wx-wx 3 root supergroup 22 2019-09-17 14:16 /tmp/or_emp/000000_0
--输出到本地
hive (onhive)> from staged_employees se
> insert overwrite local directory '/tmp/or_emp'
> select * where se.country='US' and se.state='OR'
> insert overwrite local directory '/tmp/ca_emp'
> select * where se.country='US' and se.state='CA'
> insert overwrite local directory '/tmp/IL_emp'
> select * where se.country='US' and se.state='IL';
[root@bigdata ca_employees]# ls -l /tmp |grep emp
drwxr-xr-x 2 root root 43 Sep 17 14:29 ca_emp
drwxr-xr-x 2 root root 43 Sep 17 13:52 ca_employees
drwxr-xr-x 2 root root 43 Sep 17 14:29 IL_emp
drwxr-xr-x 2 root root 43 Sep 17 14:29 or_emp
HiveQL查询
1.SELECT …FROM
hive (onhive)>
> desc employees05;
OK
col_name data_type comment
name string
country string
state string
# Partition Information
# col_name data_type comment
country string
state string
Time taken: 3.062 seconds, Fetched: 9 row(s)
--查询过程中表,字段别名的使用与Oracle类似,AS关键字可用或不用
hive (onhive)> select em.name n,em.country c,em.state AS s from employees05 em;
OK
n c s
sun US CA
wu US CA
li US IL
zheng US IL
qian US OR
zhou US OR
2.JSON类型的查询(MAP,ARRAY,STRUCT)
此处可以回顾【hive编程-琐碎02】中的内容
-- Demo table exercising Hive complex column types (ARRAY / MAP / STRUCT).
-- NOTE(review): 'sa1ary' (with digit 1) looks like a typo for 'salary',
-- but it is kept as-is because later queries in these notes reference it.
CREATE TABLE onhive.employees(
name STRING,
sa1ary FLOAT,
subordinates ARRAY<STRING>,
deductions MAP<STRING, FLOAT>,
address STRUCT<street:STRING, city:STRING, state:STRING, zip:INT>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' -- field (column) separator
COLLECTION ITEMS TERMINATED BY '_' -- separator for MAP/STRUCT/ARRAY elements
MAP KEYS TERMINATED BY ':' -- separator between a MAP key and its value
LINES TERMINATED BY '\n'; -- row separator
测试数据:
vi test.txt
John Doe,100000.0,Mary Smith_Todd Jones,Federal Taxes:0.2_State Taxes:0.05_Insurance:0.1,1 Michigan Ave._Chicago_1L_60600
Tom Smith,90000.0,Jan_Hello Ketty,Federal Taxes:0.2_State Taxes:0.05_Insurance:0.1,Guang dong._China_0.5L_60661
ps:注意,MAP,STRUCT和ARRAY里的元素间关系都可以用同一个字符表示,这里用“_”。
导入数据
load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/test.txt' into table onhive.employees;
#local表示本地路径中寻找目标文件,不加local则到HDFS目录寻找
hive> select * from employees;
OK
employees.name employees.sa1ary employees.subordinates employees.deductions employees.address
John Doe 100000.0 ["Mary Smith","Todd Jones"] {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1} {"street":"1 Michigan Ave.","city":"Chicago","state":"1L","zip":60600}
Tom Smith 90000.0 ["Jan","Hello Ketty"] {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1} {"street":"Guang dong.","city":"China","state":"0.5L","zip":60661}
Time taken: 3.839 seconds, Fetched: 2 row(s)
#Array,Map,Struct的数据取用方式:
hive> select subordinates[0],deductions['Federal Taxes'],address.city from employees;
OK
_c0 _c1 city
Mary Smith 0.2 Chicago
Jan 0.2 China
Time taken: 0.841 seconds, Fetched: 2 row(s)
3.Limit子句
LIMIT子句用于限制返回行数,oracle中可使用where rownum<3控制
hive (onhive)> select em.name n,em.country c, em.state as s from employees05 em limit 3;
OK
n c s
sun US CA
wu US CA
li US IL
4.Like and RLike
1)Like 用于模糊匹配类似的值
2)%表示0或多个任意字符
_代表一个字符
3)RLike子句是Hive中的扩展功能,可以通过Java正则表达式这个更强大的语言来指定匹配条件
hive (onhive)> select * from employees05 where state rlike '[C]';
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
hive (onhive)> select * from employees05 where state like 'C%';
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
5.GROUP BY
使用方法与Oracle一致,出现在select子句中而不在group by中的字段必须放在分组函数中,否则报错
hive (onhive)> select country,state,count(*) from staged_employees group by country,state;
country state _c2
China OR 1
US CA 2
US IL 2
US OR 2
6.Having
having 与where的不同点:
1)where针对表中列筛选数据;having针对查询结果的列筛选数据
2)where子句不能有分组函数;having可以
3)使用having必须存在group by
hive (onhive)> select country,state,count(*) from staged_employees group by country,state having count(*) >1;
country state _c2
US CA 2
US IL 2
US OR 2
7.CASE … WHEN … THEN
hive (onhive)>
> select name,sa1ary,
> case
> when sa1ary <5000.0 then 'low'
> when sa1ary >=5000.0 and sa1ary<7000.0 then 'mid'
> when sa1ary>=7000.0 then 'high'
> end as bracket from employees;
OK
name sa1ary bracket
John Doe 100000.0 high
Tom Smith 90000.0 high
8.避免使用MapReduce的情况(本地模式)
1)对表的简单读取
select * from tab_name;
2)where 语句过滤条件中只是分区字段(无论是否使用limit限制输出记录条数)
select * from employees05 where country ='US' and state='CA' limit 100;
3)属性hive.exec.mode.local.auto值设置成true,Hive还会尝试使用本地模式执行其他的操作。
hive (onhive)> set hive.exec.mode.local.auto;
hive.exec.mode.local.auto=false
9.浮点数的比较
对于一个float类型,用户输入0.2,实际hive存储的值为0.2000001
对于一个double类型,用户输入0.2,实际hive存储的值为0.200000000000001
在查询时过滤条件为 where float_col>0.2 ,此处的0.2对于hive是存储成double类型的
总之在浮点数中0.2的实际值>0.2,而且,cast(0.2 as float)> cast(0.2 as double)
所以在浮点数尤其是float和double类型进行比较时容易出现错误
解决:
1)显式转换成统一类型的浮点数进行比较 cast(0.2 as float)
2)在金钱相关的列避免使用浮点数
Join
1.等值Join
Hive支持通常的SQL JOIN语句,但是只支持等值连接,不支持非等值连接,连接谓词中不支持or
hive (onhive)> select se.name from staged_employees se,employees05 e where se.name=e.name;
--等价于
hive (onhive)> select se.name from staged_employees se join employees05 e on se.name=e.name;
2.left join
A left join B :返回的集合行数以连接左边的表A为准,集合中B表的部分,保留A与B的交集,与A不匹配的行以NULL填充
#为测试实验效果重新装载staged_employees
[root@bigdata hive_query]# vi test01.txt
zhang,China,CA
wang,US,CA
zhao,UK,BE
lin,China,CA
lang,US,CA
wu,US,CA
li,US,IL
zheng,US,IL
qian,US,OR
zhou,US,OR
[root@bigdata hive_query]# hive -S -e "load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/test01.txt' overwrite into table staged_employees";
hive (onhive)> select * from employees05;
OK
employees05.name employees05.country employees05.state
sun US CA
wu US CA
li US IL
zheng US IL
qian US OR
zhou US OR
hive (onhive)> select se.name,se.country,se.state,e.name,e.country,e.state from staged_employees se left join employees05 e on se.name = e.name;
se.name se.country se.state e.name e.country e.state
zhang China CA NULL NULL NULL
wang US CA NULL NULL NULL
zhao UK BE NULL NULL NULL
lin China CA NULL NULL NULL
lang US CA NULL NULL NULL
wu US CA wu US CA
li US IL li US IL
zheng US IL zheng US IL
qian US OR qian US OR
zhou US OR zhou US OR
3.right join
A right join B :返回的集合行数以连接右边的表B为准,集合中A表的部分,保留A与B的交集,与B不匹配的行以NULL填充
hive (onhive)> select se.name,se.country,se.state,e.name,e.country,e.state from staged_employees se right join employees05 e on se.name = e.name;
se.name se.country se.state e.name e.country e.state
NULL NULL NULL sun US CA
wu US CA wu US CA
li US IL li US IL
zheng US IL zheng US IL
qian US OR qian US OR
zhou US OR zhou US OR
4.full join
A full join B :返回A与B的并集,A与B中不与彼此匹配的行以NULL填充
hive (onhive)> select se.name,se.country,se.state,e.name,e.country,e.state from staged_employees se full join employees05 e on se.name = e.name;
se.name se.country se.state e.name e.country e.state
lang US CA NULL NULL NULL
li US IL li US IL
lin China CA NULL NULL NULL
qian US OR qian US OR
NULL NULL NULL sun US CA
wang US CA NULL NULL NULL
wu US CA wu US CA
zhang China CA NULL NULL NULL
zhao UK BE NULL NULL NULL
zheng US IL zheng US IL
zhou US OR zhou US OR
5.每个MapReduce内部排序(Sort By)
Sort By:每个MapReduce内部进行排序,(区内排序)对全局结果集来说不是排序。
1)设置reduce个数
hive (default)> set mapreduce.job.reduces=3;
2)查看设置reduce个数
hive (default)> set mapreduce.job.reduces;
hive (onhive)> select * from staged_employees sort by name;
6.分区排序(Distribute By)
Distribute By:类似MR中partition,进行分区,结合sort by使用。
注意,Hive要求DISTRIBUTE BY语句要写在SORT BY语句之前。
对于distribute by进行测试,一定要分配多reduce进行处理,否则无法看到distribute by的效果。
hive (onhive)> select * from staged_employees distribute by country sort by state;
7.Cluster By
当distribute by和sorts by字段相同时,可以使用cluster by方式。
cluster by除了具有distribute by的功能外还兼具sort by的功能。但是排序只能是升序排序,不能指定排序规则为ASC或者DESC。
1)以下两种写法等价
hive (default)> select * from emp cluster by deptno;
hive (default)> select * from emp distribute by deptno sort by deptno;
注意:按照部门编号分区,不一定就是固定死的数值,可以是20号和30号部门分到一个分区里面去。
hive (onhive)> select * from staged_employees distribute by country sort by country;
staged_employees.name staged_employees.country staged_employees.state
lin China CA
zhang China CA
zhou US OR
qian US OR
zheng US IL
li US IL
wu US CA
lang US CA
wang US CA
zhao UK BE
分桶及抽样查询
1.分桶表数据存储##
分区针对的是数据的存储路径;分桶针对的是数据文件。
分区提供一个隔离数据和优化查询的便利方式。不过,并非所有的数据集都可形成合理的分区,特别是之前所提到过的要确定合适的划分大小这个疑虑。
分桶是将数据集分解成更容易管理的若干部分的另一个技术。
(1)创建分桶表
-- Bucketed table: rows are hashed on id into 4 bucket files.
CREATE TABLE stu_buck (id INT, name STRING)
CLUSTERED BY (id) INTO 4 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
(2)查看表结构
hive (onhive)> desc formatted stu_buck;
Num Buckets: 4
(3)准备数据
[root@bigdata hive_query]# vi stu_buck.txt
100 s100
101 s101
102 s102
103 s103
104 s104
105 s105
106 s106
110 s110
112 s112
120 s120
(4)导入数据-失败
hive (onhive)> load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/stu_buck.txt' into table stu_buck;
FAILED: SemanticException Please load into an intermediate table and use 'insert... select' to allow Hive to enforce bucketing. Load into bucketed tables are disabled for safety reasons. If you know what you are doing, please sethive.strict.checks.bucketing to false and that hive.mapred.mode is not set to 'strict' to proceed. Note that if you may get errors or incorrect results if you make a mistake while using some of the unsafe features.
--此处提示:出于安全考虑,直接向分桶表导入数据默认是不允许的,可以使用insert...select方式导入数据;如果坚持直接导入,需要设置参数hive.strict.checks.bucketing=false,并保证hive.mapred.mode未设置为'strict'
换个策略:创建普通表,并用子查询方式导入数据
(1)先建一个普通的stu表
-- Plain staging table used to feed the bucketed table via INSERT ... SELECT.
CREATE TABLE stu (id INT, name STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
(2)向普通的stu表中导入数据
load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/stu_buck.txt' into table stu;
(3)清空stu_buck表中数据
truncate table stu_buck;
select * from stu_buck;
(4)导入数据到分桶表,通过子查询的方式
-- Route the data through a query so Hive can enforce bucketing on write.
INSERT INTO TABLE stu_buck
SELECT id, name
FROM stu;
(5)需要设置一个属性,hive2.3.6已经没有这个参数了,而且默认已经开启了分桶
set hive.enforce.bucketing=true;
(6)查询分桶表的数据
通过直接查看表数据能看出来,这个版本的hive分桶表每桶的数据已经按照倒序排列
hive (onhive)> select * from stu_buck;
OK
stu_buck.id stu_buck.name
120 s120
112 s112
104 s104
100 s100
105 s105
101 s101
110 s110
106 s106
102 s102
103 s103
2.分桶抽样查询
查询第一桶数据
hive (onhive)> select * from stu_buck tablesample(bucket 1 out of 4 on id);
OK
stu_buck.id stu_buck.name
120 s120
112 s112
104 s104
100 s100
tablesample(bucket x out of y)
x:表示从第几桶开始抽数据(1,2,3,4)
y:表示抽数据的比例,是抽数据的分母
比如: 有4个分桶
tablesample(bucket 1 out of 16) 表示从第一桶开始抽数据,抽取第一桶数据的比例为(4(桶数)/16(分母))=1/4,抽取第一桶四分之一的数据
tablesample(bucket 2 out of 32) 表示从第二桶开始抽数据,抽取第二桶数据的比例为(4(桶数)/32(分母))=1/8,抽取第二桶八分之一的数据
注意:x的值必须小于等于y的值,否则
FAILED: SemanticException [Error 10061]: Numerator should not be bigger than denominator in sample clause for table stu_buck
3.数据块抽样
Hive提供了另外一种按照百分比进行抽样的方式,这种是基于行数的,按照输入路径下的数据块百分比进行的抽样。
hive (default)> select * from stu tablesample(0.1 percent) ;
提示:这种抽样方式不一定适用于所有的文件格式。另外,这种抽样的最小抽样单元是一个HDFS数据块。因此,如果表的数据大小小于普通的块大小128M的话,那么将会返回所有行。