DDL操作
Database
-
查看数据库
0: jdbc:hive2://CentOS:10000> show databases; +----------------+--+ | database_name | +----------------+--+ | default | | test | +----------------+--+ 2 rows selected (0.073 seconds)
-
使用数据库
0: jdbc:hive2://CentOS:10000> use test;
No rows affected (0.12 seconds)
-
新建数据库
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name --DATABASE|SCHEMA 是等价的 [COMMENT database_comment] --数据库注释 [LOCATION hdfs_path] --存储在 HDFS 上的位置 [WITH DBPROPERTIES (property_name=property_value, ...)]; --指定额外属性
0: jdbc:hive2://CentOS:10000> create schema if not exists baizhi; No rows affected (0.094 seconds) 0: jdbc:hive2://CentOS:10000> create schema if not exists zpark location 'hdfs:///hive_db/zpark.db'; No rows affected (0.156 seconds)
-
查看数据库信息
0: jdbc:hive2://CentOS:10000> desc database zpark; +----------+----------+--------------------------------------+-------------+-------------+-------------+--+ | db_name | comment | location | owner_name | owner_type | parameters | +----------+----------+--------------------------------------+-------------+-------------+-------------+--+ | zpark | | hdfs://CentOS:9000/hive_db/zpark.db | root | USER | | +----------+----------+--------------------------------------+-------------+-------------+-------------+--+
-
删除数据库
DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];
0: jdbc:hive2://CentOS:10000> drop database zpark RESTRICT; No rows affected (0.296 seconds) 0: jdbc:hive2://CentOS:10000> show databases; +----------------+--+ | database_name | +----------------+--+ | baizhi | | default | | test | +----------------+--+ 3 rows selected (0.091 seconds)
0: jdbc:hive2://CentOS:10000> drop database test RESTRICT; Error: Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. InvalidOperationException(message:Database test is not empty. One or more tables exist.) (state=08S01,code=1) 0: jdbc:hive2://CentOS:10000> drop database test cascade;--级联删除表数据 No rows affected (1.821 seconds) 0: jdbc:hive2://CentOS:10000>
-
查看当前库
0: jdbc:hive2://CentOS:10000> select current_database(); +-------+--+ | _c0 | +-------+--+ | test | +-------+--+ 1 row selected (0.637 seconds)
Table(重点)
创建表
- 管理表
内部表也称之为MANAGED_TABLE;默认存储在/user/hive/warehouse下,也可以通过location指定;删除表时,会删除表数据以及元数据。
-- Managed (internal) table: Hive owns both the metadata and the data files.
-- Data lives under the warehouse dir by default; DROP TABLE removes the data too.
-- The delimiters below must match the layout of the text files loaded into it:
--   fields ',' / collection items '|' / map key-value separator '>'.
create table if not exists t_user(
id int,
name string,
sex boolean,
age int,
salary double,
hobbies array<string>,  -- elements separated by '|'
card map<string,string>,  -- entries separated by '|', key from value by '>'
address struct<country:string,city:string>  -- struct members separated by '|'
)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '>'
lines terminated by '\n'
stored as textfile;
- 外部表
外部表称之为EXTERNAL_TABLE;在创建表时可以自己指定目录位置(LOCATION);删除表时,只会删除元数据不会删除表数据;
-- External table: Hive manages only the metadata; DROP TABLE keeps the files
-- at the LOCATION below. Same schema and delimiters as the managed t_user.
create external table if not exists t_user1(
id int,
name string,
sex boolean,
age int,
salary double,
hobbies array<string>,  -- elements separated by '|'
card map<string,string>,  -- entries separated by '|', key from value by '>'
address struct<country:string,city:string>
)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '>'
lines terminated by '\n'
stored as textfile
location 'hdfs:///hive_db/t_user';  -- data kept here even after DROP TABLE
- 分区表
Hive中的表对应为HDFS上的指定目录,在查询数据时候,默认会对全表进行扫描,这样时间和性能的消耗都非常大。分区为HDFS上表目录的子目录,数据按照分区存储在子目录中。如果查询的where子句的中包含分区条件,则直接从该分区去查找,而不是扫描整个表目录,合理的分区设计可以极大提高查询速度和性能。这里说明一下分区表并非Hive独有的概念,实际上这个概念非常常见。比如在我们常用的Oracle数据库中,当表中的数据量不断增大,查询数据的速度就会下降,这时也可以对表进行分区。表进行分区后,逻辑上表仍然是一张完整的表,只是将表中的数据存放到多个表空间(物理文件上),这样查询数据时,就不必要每次都扫描整张表,从而提升查询性能。在Hive中可以使用PARTITIONED BY子句创建分区表。表可以包含一个或多个分区列,程序会为分区列中的每个不同值组合创建单独的数据目录。
-- External partitioned table: each distinct deptno value becomes a
-- subdirectory (deptno=NN) under /hive/t_employee.
-- NOTE: the partition column deptno is declared only in PARTITIONED BY,
-- never in the regular column list.
CREATE EXTERNAL TABLE t_employee(
id INT,
name STRING,
job STRING,
manager INT,
hiredate TIMESTAMP,
salary DECIMAL(7,2)
)
PARTITIONED BY (deptno INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
LOCATION '/hive/t_employee';
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.00
7499 ALLEN SALESMAN 7698 1981-02-20 00:00:00 1600.00 300.00
7521 WARD SALESMAN 7698 1981-02-22 00:00:00 1250.00 500.00
7566 JONES MANAGER 7839 1981-04-02 00:00:00 2975.00
7654 MARTIN SALESMAN 7698 1981-09-28 00:00:00 1250.00 1400.00
7698 BLAKE MANAGER 7839 1981-05-01 00:00:00 2850.00
7782 CLARK MANAGER 7839 1981-06-09 00:00:00 2450.00
7788 SCOTT ANALYST 7566 1987-04-19 00:00:00 1500.00
7839 KING PRESIDENT 1981-11-17 00:00:00 5000.00
7844 TURNER SALESMAN 7698 1981-09-08 00:00:00 1500.00 0.00
7876 ADAMS CLERK 7788 1987-05-23 00:00:00 1100.00
7900 JAMES CLERK 7698 1981-12-03 00:00:00 950.00
7902 FORD ANALYST 7566 1981-12-03 00:00:00 3000.00
7934 MILLER CLERK 7782 1982-01-23 00:00:00 1300.00
0: jdbc:hive2://CentOS:10000> load data local inpath '/root/baizhi/t_employee' overwrite into table t_employee partition(deptno=10);
- 分桶表
分区表是为了将文件按照分区文件夹进行粗粒度文件隔离,但是分桶表是将数据按照某个字段进行hash计算出所属的桶,然后再对桶内的数据进行排序。
-- Bucketed table: rows are distributed into 6 buckets by hash(id);
-- within each bucket, rows are kept sorted by salary ascending.
-- Inserting into it requires `set hive.enforce.bucketing = true;` first
-- (see the session transcript below).
CREATE EXTERNAL TABLE t_employee_bucket(
id INT,
name STRING,
job STRING,
manager INT,
hiredate TIMESTAMP,
salary DECIMAL(7,2),
deptno INT)
CLUSTERED BY(id) SORTED BY(salary ASC) INTO 6 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
LOCATION '/hive/employee_bucket';
0: jdbc:hive2://CentOS:10000> set hive.enforce.bucketing = true;
No rows affected (0.024 seconds)
0: jdbc:hive2://CentOS:10000> INSERT INTO TABLE t_employee_bucket(id,name,job,manager,hiredate,salary,deptno) SELECT id,name,job,manager,hiredate,salary,deptno FROM t_employee where deptno=10;
CLUSTERED BY(id)表示按照id的hash值取模分桶;SORTED BY(salary ASC)表示每个桶内的数据按照salary升序排序。注意:向桶表插入数据前需要先执行 set hive.enforce.bucketing = true;
- 临时表
临时表仅对当前session(一个Connection有效)可见,临时表的数据将存储在用户的暂存目录中,并在会话结束后删除。如果临时表与永久表表名相同,则对该表名的任何引用都将解析为临时表,而不是永久表。临时表还具有以下两个限制:不支持分区列;不支持创建索引.
-- Temporary table: visible only to the current session (one connection)
-- and dropped automatically when the session ends. Temporary tables do not
-- support partition columns or indexes.
-- NOTE(review): temp-table data normally lives in the user scratch dir;
-- confirm the explicit LOCATION below is actually honored for TEMPORARY tables.
CREATE TEMPORARY TABLE if not exists emp_temp(
id INT,
name STRING,
job STRING,
manager INT,
hiredate TIMESTAMP,
salary DECIMAL(7,2),
deptno INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
LOCATION '/hive/emp_temp';
- CTAS创建表
0: jdbc:hive2://CentOS:10000> create TEMPORARY table t_tem as select * from t_employee where deptno=10;
0: jdbc:hive2://CentOS:10000> select * from t_tem;
+-----------+-------------+------------+----------------+------------------------+---------------+---------------+--+
| t_tem.id | t_tem.name | t_tem.job | t_tem.manager | t_tem.hiredate | t_tem.salary | t_tem.deptno |
+-----------+-------------+------------+----------------+------------------------+---------------+---------------+--+
| 7369 | SMITH | CLERK | 7902 | 1980-12-17 00:00:00.0 | 800 | 10 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 00:00:00.0 | 1600 | 10 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 00:00:00.0 | 1250 | 10 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 00:00:00.0 | 2975 | 10 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 00:00:00.0 | 1250 | 10 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 00:00:00.0 | 2850 | 10 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 00:00:00.0 | 2450 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-04-19 00:00:00.0 | 1500 | 10 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-17 00:00:00.0 | 5000 | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 00:00:00.0 | 1500 | 10 |
| 7876 | ADAMS | CLERK | 7788 | 1987-05-23 00:00:00.0 | 1100 | 10 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 00:00:00.0 | 950 | 10 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 00:00:00.0 | 3000 | 10 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 00:00:00.0 | 1300 | 10 |
+-----------+-------------+------------+----------------+------------------------+---------------+---------------+--+
- 复制表结构
CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS t_employee_copy2 LIKE t_employee_bucket location '/hive/t_empolyee_copy2'
仅仅复制表结构,并不拷贝数据。
修改表
- 重命名
0: jdbc:hive2://CentOS:10000> ALTER TABLE t_user1 RENAME TO t_user;
No rows affected (0.224 seconds)
0: jdbc:hive2://CentOS:10000> show tables;
+--------------------+--+
| tab_name |
+--------------------+--+
| t_employee |
| t_employee_bucket |
| t_employee_copy |
| t_employee_copy2 |
| t_employee_copy3 |
| t_tem |
| t_user |
+--------------------+--+
7 rows selected (0.088 seconds)
- 修改列(修改类型、顺序、新增)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name | data_type | comment |
+-----------+-------------------------------------+----------+--+
| id | int | |
| name | string | |
| sex | boolean | |
| age | int | |
| salary | double | |
| hobbies | array<string> | |
| card | map<string,string> | |
| address | struct<country:string,city:string> | |
+-----------+-------------------------------------+----------+--+
8 rows selected (0.173 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_user change id id bigint;
No rows affected (0.226 seconds)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name | data_type | comment |
+-----------+-------------------------------------+----------+--+
| id | bigint | |
| name | string | |
| sex | boolean | |
| age | int | |
| salary | double | |
| hobbies | array<string> | |
| card | map<string,string> | |
| address | struct<country:string,city:string> | |
+-----------+-------------------------------------+----------+--+
8 rows selected (0.194 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_user change id id bigint after name;
No rows affected (0.236 seconds)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name | data_type | comment |
+-----------+-------------------------------------+----------+--+
| name | string | |
| id | bigint | |
| sex | boolean | |
| age | int | |
| salary | double | |
| hobbies | array<string> | |
| card | map<string,string> | |
| address | struct<country:string,city:string> | |
+-----------+-------------------------------------+----------+--+
8 rows selected (0.174 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_user change name name string after id;
No rows affected (0.249 seconds)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name | data_type | comment |
+-----------+-------------------------------------+----------+--+
| id | bigint | |
| name | string | |
| sex | boolean | |
| age | int | |
| salary | double | |
| hobbies | array<string> | |
| card | map<string,string> | |
| address | struct<country:string,city:string> | |
+-----------+-------------------------------------+----------+--+
0: jdbc:hive2://CentOS:10000> alter table t_user add columns (deptno int);
No rows affected (0.256 seconds)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name | data_type | comment |
+-----------+-------------------------------------+----------+--+
| id | bigint | |
| name | string | |
| sex | boolean | |
| age | int | |
| salary | double | |
| hobbies | array<string> | |
| card | map<string,string> | |
| address | struct<country:string,city:string> | |
| deptno | int | |
+-----------+-------------------------------------+----------+--+
9 rows selected (0.137 seconds)
清空表
0: jdbc:hive2://CentOS:10000> truncate table t_employee_copy; -- 必须内部表
0: jdbc:hive2://CentOS:10000> truncate table t_employee partition(deptno=20);-- 截断分区
要求被截断的表必须是管理表(内部表)
删除
0: jdbc:hive2://CentOS:10000> drop table t_employee_copy3
0: jdbc:hive2://CentOS:10000> show partitions t_employee;
+------------+--+
| partition |
+------------+--+
| deptno=10 |
| deptno=20 |
+------------+--+
2 rows selected (0.185 seconds)
0: jdbc:hive2://CentOS:10000> show partitions t_employee;
+------------+--+
| partition |
+------------+--+
| deptno=10 |
| deptno=20 |
+------------+--+
2 rows selected (0.185 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_employee drop partition(deptno=20);
INFO : Dropped the partition deptno=20
No rows affected (0.212 seconds)
0: jdbc:hive2://CentOS:10000> show partitions t_employee;
+------------+--+
| partition |
+------------+--+
| deptno=10 |
+------------+--+
1 row selected (0.209 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_employee add partition(deptno=20);
如果删除的是管理表,系统在删除表的时候会删除表里数据。如果用户开启了HDFS的回收站功能,数据实际上是移动到垃圾回收站;如果用户想跳过垃圾回收站直接删除,可以使用:
drop table 表名字 PURGE;
附加:
core-site.xml
<property>
<name>fs.trash.interval</name>
<value>5</value> <!-- 数据在垃圾回收站最多保留5分钟 -->
</property>
DML操作
数据加载
- load data
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
-- External partitioned table used by the LOAD DATA example below:
-- LOAD DATA copies/moves the file into the deptno=10 partition directory
-- without parsing it, so the file layout must already match "\t"-separated
-- fields. deptno appears only in PARTITIONED BY, not in the column list.
CREATE EXTERNAL TABLE t_employee(
id INT,
name STRING,
job STRING,
manager INT,
hiredate TIMESTAMP,
salary DECIMAL(7,2)
)
PARTITIONED BY (deptno INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
LOCATION '/hive/t_employee';
load data local inpath '/root/baizhi/t_employee' overwrite into table t_employee partition(deptno=10);
- 查询结果插入表中
**将查询结果插入表中**
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]]
select_statement1 FROM from_statement; --覆盖
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)]
select_statement1 FROM from_statement;--追加
{"EMPNO": 7369,"ENAME": "SMITH","JOB": "CLERK","MGR": 7902,"HIREDATE": "1980-12-17 00:00:00","SAL": 800.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7499,"ENAME": "ALLEN","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-02-20 00:00:00","SAL": 1600.00,"COMM": 300.00,"DEPTNO": 30}
{"EMPNO": 7521,"ENAME": "WARD","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-02-22 00:00:00","SAL": 1250.00,"COMM": 500.00,"DEPTNO": 30}
{"EMPNO": 7566,"ENAME": "JONES","JOB": "MANAGER","MGR": 7839,"HIREDATE": "1981-04-02 00:00:00","SAL": 2975.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7654,"ENAME": "MARTIN","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-09-28 00:00:00","SAL": 1250.00,"COMM": 1400.00,"DEPTNO": 30}
{"EMPNO": 7698,"ENAME": "BLAKE","JOB": "MANAGER","MGR": 7839,"HIREDATE": "1981-05-01 00:00:00","SAL": 2850.00,"COMM": null,"DEPTNO": 30}
{"EMPNO": 7782,"ENAME": "CLARK","JOB": "MANAGER","MGR": 7839,"HIREDATE": "1981-06-09 00:00:00","SAL": 2450.00,"COMM": null,"DEPTNO": 10}
{"EMPNO": 7788,"ENAME": "SCOTT","JOB": "ANALYST","MGR": 7566,"HIREDATE": "1987-04-19 00:00:00","SAL": 1500.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7839,"ENAME": "KING","JOB": "PRESIDENT","MGR": null,"HIREDATE": "1981-11-17 00:00:00","SAL": 5000.00,"COMM": null,"DEPTNO": 10}
{"EMPNO": 7844,"ENAME": "TURNER","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-09-08 00:00:00","SAL": 1500.00,"COMM": 0.00,"DEPTNO": 30}
{"EMPNO": 7876,"ENAME": "ADAMS","JOB": "CLERK","MGR": 7788,"HIREDATE": "1987-05-23 00:00:00","SAL": 1100.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7900,"ENAME": "JAMES","JOB": "CLERK","MGR": 7698,"HIREDATE": "1981-12-03 00:00:00","SAL": 950.00,"COMM": null,"DEPTNO": 30}
{"EMPNO": 7902,"ENAME": "FORD","JOB": "ANALYST","MGR": 7566,"HIREDATE": "1981-12-03 00:00:00","SAL": 3000.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7934,"ENAME": "MILLER","JOB": "CLERK","MGR": 7782,"HIREDATE": "1982-01-23 00:00:00","SAL": 1300.00,"COMM": null,"DEPTNO": 10}
0: jdbc:hive2://CentOS:10000> add jar /usr/apache-hive-1.2.2-bin/hcatalog/share/hcatalog/hive-hcatalog-core-1.2.2.jar;
-- External table over JSON lines: each row of the data file is one JSON
-- object; the SerDe maps JSON keys (case-insensitive) to the columns below.
-- Requires the hive-hcatalog-core jar to be added first (see `add jar` above),
-- otherwise queries fail with a SerDe ClassNotFound error.
CREATE EXTERNAL TABLE t_emp_json(
empno INT,
ename STRING,
job STRING,
mgr INT,
hiredate TIMESTAMP,
sal DECIMAL(7,2),
comm DECIMAL(7,2),
deptno INT)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
0: jdbc:hive2://CentOS:10000> load data local inpath '/root/baizhi/t_emp_json' overwrite into table t_emp_json;
0: jdbc:hive2://CentOS:10000> select empno,ename,job,mgr,hiredate,sal,comm,deptno from t_emp_json;
-- Managed partitioned table: target of the INSERT OVERWRITE / INSERT INTO
-- examples below. No EXTERNAL keyword and no LOCATION, so data lives in the
-- warehouse dir and is deleted when the table is dropped.
CREATE TABLE t_employee(
empno INT,
ename STRING,
job STRING,
mgr INT,
hiredate TIMESTAMP,
sal DECIMAL(7,2),
comm DECIMAL(7,2)
)
PARTITIONED BY (deptno INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE TABLE t_employee PARTITION (deptno=10) select empno,ename,job,mgr,hiredate,sal,comm from t_emp_json where deptno=10;
0: jdbc:hive2://CentOS:10000> INSERT INTO TABLE t_employee PARTITION (deptno=10) select empno,ename,job,mgr,hiredate,sal,comm from t_emp_json where deptno=10;
更常见的做法
0: jdbc:hive2://CentOS:10000> create TEMPORARY table t_tmp as select empno,ename,job,mgr,hiredate,sal,comm,deptno from t_emp_json where deptno=10;
将查询结果插入多个表
FROM from_statement
INSERT OVERWRITE TABLE tablename1
[PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;
-- Multi-insert: scan t_emp_json once and fan the rows out into three
-- static partitions of t_employee; each INSERT has its own SELECT + WHERE.
from t_emp_json
INSERT OVERWRITE TABLE t_employee partition(deptno=10) select empno,ename,job,mgr,hiredate,sal,comm where deptno=10
INSERT OVERWRITE TABLE t_employee partition(deptno=20) select empno,ename,job,mgr,hiredate,sal,comm where deptno=20
INSERT OVERWRITE TABLE t_employee partition(deptno=30) select empno,ename,job,mgr,hiredate,sal,comm where deptno=30
插入动态分区
0: jdbc:hive2://CentOS:10000> set hive.exec.dynamic.partition.mode=nonstrict; -- 开启动态分区
No rows affected (0.016 seconds)
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE TABLE t_employee PARTITION (deptno) SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_emp_json ;
结果写出到文件系统
INSERT OVERWRITE [LOCAL] DIRECTORY directory1
[ROW FORMAT row_format] [STORED AS file_format]
SELECT ... FROM ...
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE DIRECTORY '/hive/export/t_employee' row format delimited fields terminated by ',' STORED AS textfile SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee;
(注意:不带LOCAL时目录必须是HDFS路径,不能写本地Windows路径;导出本地文件请使用 INSERT OVERWRITE LOCAL DIRECTORY。)