Hive: ------ DDL、Table管理、DML

DDL操作

Database

  • 查看数据库
    0: jdbc:hive2://CentOS:10000> show databases;
    +----------------+--+
    | database_name  |
    +----------------+--+
    | default        |
    | test           |
    +----------------+--+
    2 rows selected (0.073 seconds)
    
  • 使用数据库
0: jdbc:hive2://CentOS:10000> use test;
No rows affected (0.12 seconds)
  • 新建数据库
    CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name   --DATABASE|SCHEMA 是等价的
      [COMMENT database_comment] --数据库注释
      [LOCATION hdfs_path] --存储在 HDFS 上的位置
      [WITH DBPROPERTIES (property_name=property_value, ...)]; --指定额外属性
    
    0: jdbc:hive2://CentOS:10000> create schema if not exists baizhi;
    No rows affected (0.094 seconds)
    0: jdbc:hive2://CentOS:10000> create schema if not exists zpark location 'hdfs:///hive_db/zpark.db';
    No rows affected (0.156 seconds)
    
  • 查看数据库信息
    0: jdbc:hive2://CentOS:10000> desc database zpark;
    +----------+----------+--------------------------------------+-------------+-------------+-------------+--+
    | db_name  | comment  |               location               | owner_name  | owner_type  | parameters  |
    +----------+----------+--------------------------------------+-------------+-------------+-------------+--+
    | zpark    |          | hdfs://CentOS:9000/hive_db/zpark.db  | root        | USER        |             |
    +----------+----------+--------------------------------------+-------------+-------------+-------------+--+
    
  • 删除数据库
    DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];
    
    0: jdbc:hive2://CentOS:10000> drop database zpark RESTRICT;
    No rows affected (0.296 seconds)
    0: jdbc:hive2://CentOS:10000> show databases;
    +----------------+--+
    | database_name  |
    +----------------+--+
    | baizhi         |
    | default        |
    | test           |
    +----------------+--+
    3 rows selected (0.091 seconds)
    
    0: jdbc:hive2://CentOS:10000> drop database test RESTRICT;
    Error: Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. InvalidOperationException(message:Database test is not empty. One or more tables exist.) (state=08S01,code=1)
    
    0: jdbc:hive2://CentOS:10000> drop database test cascade;--级联删除表数据
    No rows affected (1.821 seconds)
    0: jdbc:hive2://CentOS:10000>
    
  • 查看当前库

    0: jdbc:hive2://CentOS:10000> select current_database();
    +-------+--+
    |  _c0  |
    +-------+--+
    | test  |
    +-------+--+
    1 row selected (0.637 seconds)
    

Table(重点)

创建表
  • 管理表

内部表也称之为MANAGED_TABLE;默认存储在/user/hive/warehouse下,也可以通过location指定;删除表时,会删除表数据以及元数据

create table if not exists t_user(
  id int,
  name string,
  sex boolean,
  age int,
  salary double,
  hobbies array<string>,
  card map<string,string>,
  address struct<country:string,city:string>
)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '>'
lines terminated by '\n'
stored as textfile;
  • 外部表

外部表称之为EXTERNAL_TABLE;在创建表时可以自己指定目录位置(LOCATION);删除表时,只会删除元数据不会删除表数据;

create external table if not exists t_user1(
  id int,
  name string,
  sex boolean,
  age int,
  salary double,
  hobbies array<string>,
  card map<string,string>,
  address struct<country:string,city:string>
)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '>'
lines terminated by '\n'
stored as textfile
location 'hdfs:///hive_db/t_user';
  • 分区表

Hive中的表对应为HDFS上的指定目录,在查询数据时候,默认会对全表进行扫描,这样时间和性能的消耗都非常大。分区为HDFS上表目录的子目录,数据按照分区存储在子目录中。如果查询的where子句的中包含分区条件,则直接从该分区去查找,而不是扫描整个表目录,合理的分区设计可以极大提高查询速度和性能。这里说明一下分区表并非Hive独有的概念,实际上这个概念非常常见。比如在我们常用的Oracle数据库中,当表中的数据量不断增大,查询数据的速度就会下降,这时也可以对表进行分区。表进行分区后,逻辑上表仍然是一张完整的表,只是将表中的数据存放到多个表空间(物理文件上),这样查询数据时,就不必要每次都扫描整张表,从而提升查询性能。在Hive中可以使用PARTITIONED BY子句创建分区表。表可以包含一个或多个分区列,程序会为分区列中的每个不同值组合创建单独的数据目录。

CREATE EXTERNAL TABLE t_employee(
    id INT,
    name STRING,
    job STRING,
    manager INT,
    hiredate TIMESTAMP,
    salary DECIMAL(7,2)
    )
    PARTITIONED BY (deptno INT)   
    ROW FORMAT DELIMITED 
    FIELDS TERMINATED BY "\t"
    LOCATION '/hive/t_employee';
7369	SMITH	CLERK	7902	1980-12-17 00:00:00	800.00
7499	ALLEN	SALESMAN	7698	1981-02-20 00:00:00	1600.00	300.00
7521	WARD	SALESMAN	7698	1981-02-22 00:00:00	1250.00	500.00
7566	JONES	MANAGER	7839	1981-04-02 00:00:00	2975.00
7654	MARTIN	SALESMAN	7698	1981-09-28 00:00:00	1250.00	1400.00
7698	BLAKE	MANAGER	7839	1981-05-01 00:00:00	2850.00
7782	CLARK	MANAGER	7839	1981-06-09 00:00:00	2450.00
7788	SCOTT	ANALYST	7566	1987-04-19 00:00:00	1500.00
7839	KING	PRESIDENT		1981-11-17 00:00:00	5000.00
7844	TURNER	SALESMAN	7698	1981-09-08 00:00:00	1500.00	0.00
7876	ADAMS	CLERK	7788	1987-05-23 00:00:00	1100.00
7900	JAMES	CLERK	7698	1981-12-03 00:00:00	950.00
7902	FORD	ANALYST	7566	1981-12-03 00:00:00	3000.00
7934	MILLER	CLERK	7782	1982-01-23 00:00:00	1300.00
0: jdbc:hive2://CentOS:10000> load data local inpath '/root/baizhi/t_employee' overwrite into table t_employee partition(deptno=10);
  • 分桶表

分区表是为了将文件按照分区文件夹进行粗粒度的文件隔离;而分桶表是将数据按照某个字段进行hash计算出所属的桶,然后再对桶内的数据进行排序。

CREATE EXTERNAL TABLE t_employee_bucket(
    id INT,
    name STRING,
    job STRING,
    manager INT,
    hiredate TIMESTAMP,
    salary DECIMAL(7,2),
    deptno INT)
    CLUSTERED BY(id) SORTED BY(salary ASC) INTO 6 BUCKETS  
    ROW FORMAT DELIMITED 
    FIELDS TERMINATED BY "\t"
    LOCATION '/hive/employee_bucket';
0: jdbc:hive2://CentOS:10000> set hive.enforce.bucketing = true;
No rows affected (0.024 seconds)
0: jdbc:hive2://CentOS:10000> INSERT INTO TABLE t_employee_bucket(id,name,job,manager,hiredate,salary,deptno) SELECT id,name,job,manager,hiredate,salary,deptno  FROM t_employee where deptno=10;

CLUSTERED BY(id)表示按照id的hash值决定数据落入哪个桶;SORTED BY表示同一桶内的数据按照salary排序。注意:向桶表写入数据前需要先执行 set hive.enforce.bucketing = true;,并通过 INSERT ... SELECT 的方式导入数据。

  • 临时表

临时表仅对当前session(一个Connection)可见,临时表的数据将存储在用户的暂存目录中,并在会话结束后删除。如果临时表与永久表表名相同,则对该表名的任何引用都将解析为临时表,而不是永久表。临时表还具有以下两个限制:不支持分区列;不支持创建索引。

CREATE TEMPORARY TABLE if not exists emp_temp(
    id INT,
    name STRING,
    job STRING,
    manager INT,
    hiredate TIMESTAMP,
    salary DECIMAL(7,2),
    deptno INT
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
    LOCATION '/hive/emp_temp';
  • CTAS创建表
0: jdbc:hive2://CentOS:10000> create TEMPORARY table t_tem as select * from t_employee where deptno=10;
0: jdbc:hive2://CentOS:10000> select * from t_tem;
+-----------+-------------+------------+----------------+------------------------+---------------+---------------+--+
| t_tem.id  | t_tem.name  | t_tem.job  | t_tem.manager  |     t_tem.hiredate     | t_tem.salary  | t_tem.deptno  |
+-----------+-------------+------------+----------------+------------------------+---------------+---------------+--+
| 7369      | SMITH       | CLERK      | 7902           | 1980-12-17 00:00:00.0  | 800           | 10            |
| 7499      | ALLEN       | SALESMAN   | 7698           | 1981-02-20 00:00:00.0  | 1600          | 10            |
| 7521      | WARD        | SALESMAN   | 7698           | 1981-02-22 00:00:00.0  | 1250          | 10            |
| 7566      | JONES       | MANAGER    | 7839           | 1981-04-02 00:00:00.0  | 2975          | 10            |
| 7654      | MARTIN      | SALESMAN   | 7698           | 1981-09-28 00:00:00.0  | 1250          | 10            |
| 7698      | BLAKE       | MANAGER    | 7839           | 1981-05-01 00:00:00.0  | 2850          | 10            |
| 7782      | CLARK       | MANAGER    | 7839           | 1981-06-09 00:00:00.0  | 2450          | 10            |
| 7788      | SCOTT       | ANALYST    | 7566           | 1987-04-19 00:00:00.0  | 1500          | 10            |
| 7839      | KING        | PRESIDENT  | NULL           | 1981-11-17 00:00:00.0  | 5000          | 10            |
| 7844      | TURNER      | SALESMAN   | 7698           | 1981-09-08 00:00:00.0  | 1500          | 10            |
| 7876      | ADAMS       | CLERK      | 7788           | 1987-05-23 00:00:00.0  | 1100          | 10            |
| 7900      | JAMES       | CLERK      | 7698           | 1981-12-03 00:00:00.0  | 950           | 10            |
| 7902      | FORD        | ANALYST    | 7566           | 1981-12-03 00:00:00.0  | 3000          | 10            |
| 7934      | MILLER      | CLERK      | 7782           | 1982-01-23 00:00:00.0  | 1300          | 10            |
+-----------+-------------+------------+----------------+------------------------+---------------+---------------+--+

  • 复制表结构
CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS t_employee_copy2 LIKE t_employee_bucket LOCATION '/hive/t_employee_copy2';

仅仅复制表结构,并不拷贝数据。

修改表
  • 重命名
0: jdbc:hive2://CentOS:10000> ALTER TABLE t_user1 RENAME TO t_user;
No rows affected (0.224 seconds)
0: jdbc:hive2://CentOS:10000> show tables;
+--------------------+--+
|      tab_name      |
+--------------------+--+
| t_employee         |
| t_employee_bucket  |
| t_employee_copy    |
| t_employee_copy2   |
| t_employee_copy3   |
| t_tem              |
| t_user             |
+--------------------+--+
7 rows selected (0.088 seconds)
  • 修改列(修改类型、顺序、新增)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name  |              data_type              | comment  |
+-----------+-------------------------------------+----------+--+
| id        | int                                 |          |
| name      | string                              |          |
| sex       | boolean                             |          |
| age       | int                                 |          |
| salary    | double                              |          |
| hobbies   | array<string>                       |          |
| card      | map<string,string>                  |          |
| address   | struct<country:string,city:string>  |          |
+-----------+-------------------------------------+----------+--+
8 rows selected (0.173 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_user change id id bigint;
No rows affected (0.226 seconds)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name  |              data_type              | comment  |
+-----------+-------------------------------------+----------+--+
| id        | bigint                              |          |
| name      | string                              |          |
| sex       | boolean                             |          |
| age       | int                                 |          |
| salary    | double                              |          |
| hobbies   | array<string>                       |          |
| card      | map<string,string>                  |          |
| address   | struct<country:string,city:string>  |          |
+-----------+-------------------------------------+----------+--+
8 rows selected (0.194 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_user change id id bigint after name;
No rows affected (0.236 seconds)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name  |              data_type              | comment  |
+-----------+-------------------------------------+----------+--+
| name      | string                              |          |
| id        | bigint                              |          |
| sex       | boolean                             |          |
| age       | int                                 |          |
| salary    | double                              |          |
| hobbies   | array<string>                       |          |
| card      | map<string,string>                  |          |
| address   | struct<country:string,city:string>  |          |
+-----------+-------------------------------------+----------+--+
8 rows selected (0.174 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_user change name name string after id;
No rows affected (0.249 seconds)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name  |              data_type              | comment  |
+-----------+-------------------------------------+----------+--+
| id        | bigint                              |          |
| name      | string                              |          |
| sex       | boolean                             |          |
| age       | int                                 |          |
| salary    | double                              |          |
| hobbies   | array<string>                       |          |
| card      | map<string,string>                  |          |
| address   | struct<country:string,city:string>  |          |
+-----------+-------------------------------------+----------+--+
0: jdbc:hive2://CentOS:10000> alter table t_user add columns (deptno int);
No rows affected (0.256 seconds)
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name  |              data_type              | comment  |
+-----------+-------------------------------------+----------+--+
| id        | bigint                              |          |
| name      | string                              |          |
| sex       | boolean                             |          |
| age       | int                                 |          |
| salary    | double                              |          |
| hobbies   | array<string>                       |          |
| card      | map<string,string>                  |          |
| address   | struct<country:string,city:string>  |          |
| deptno    | int                                 |          |
+-----------+-------------------------------------+----------+--+
9 rows selected (0.137 seconds)
清空表
0: jdbc:hive2://CentOS:10000> truncate table t_employee_copy; -- 必须内部表
0: jdbc:hive2://CentOS:10000> truncate table t_employee partition(deptno=20);-- 截断分区

要求被截断的表必须是管理表(内部表)

删除
0: jdbc:hive2://CentOS:10000> drop table t_employee_copy3;
0: jdbc:hive2://CentOS:10000> show partitions t_employee;
+------------+--+
| partition  |
+------------+--+
| deptno=10  |
| deptno=20  |
+------------+--+
2 rows selected (0.185 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_employee drop partition(deptno=20);
INFO  : Dropped the partition deptno=20
No rows affected (0.212 seconds)
0: jdbc:hive2://CentOS:10000> show partitions t_employee;
+------------+--+
| partition  |
+------------+--+
| deptno=10  |
+------------+--+
1 row selected (0.209 seconds)
0: jdbc:hive2://CentOS:10000> alter table t_employee add partition(deptno=20);

如果删除的是管理表,系统在删除表的时候会删除表里数据。如果用户开启了HDFS的回收站功能,数据实际上会被移动到垃圾回收站;如果用户想跳过垃圾回收站直接删除,可以使用:drop table 表名 PURGE;

附加:

core-site.xml

<property>
  <name>fs.trash.interval</name>
  <value>5</value> <!-- 数据在垃圾回收站最多保留5分钟 -->
</property>

DML操作

数据加载

  • load data
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
CREATE EXTERNAL TABLE t_employee(
    id INT,
    name STRING,
    job STRING,
    manager INT,
    hiredate TIMESTAMP,
    salary DECIMAL(7,2)
    )
    PARTITIONED BY (deptno INT)   
    ROW FORMAT DELIMITED 
    FIELDS TERMINATED BY "\t"
    LOCATION '/hive/t_employee';
load data local inpath '/root/baizhi/t_employee' overwrite into table t_employee partition(deptno=10);
  • 查询结果插入表中

*将查询结果插入表中 *

INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]]   
select_statement1 FROM from_statement; --覆盖

INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] 
select_statement1 FROM from_statement;--追加
{"EMPNO": 7369,"ENAME": "SMITH","JOB": "CLERK","MGR": 7902,"HIREDATE": "1980-12-17 00:00:00","SAL": 800.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7499,"ENAME": "ALLEN","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-02-20 00:00:00","SAL": 1600.00,"COMM": 300.00,"DEPTNO": 30}
{"EMPNO": 7521,"ENAME": "WARD","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-02-22 00:00:00","SAL": 1250.00,"COMM": 500.00,"DEPTNO": 30}
{"EMPNO": 7566,"ENAME": "JONES","JOB": "MANAGER","MGR": 7839,"HIREDATE": "1981-04-02 00:00:00","SAL": 2975.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7654,"ENAME": "MARTIN","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-09-28 00:00:00","SAL": 1250.00,"COMM": 1400.00,"DEPTNO": 30}
{"EMPNO": 7698,"ENAME": "BLAKE","JOB": "MANAGER","MGR": 7839,"HIREDATE": "1981-05-01 00:00:00","SAL": 2850.00,"COMM": null,"DEPTNO": 30}
{"EMPNO": 7782,"ENAME": "CLARK","JOB": "MANAGER","MGR": 7839,"HIREDATE": "1981-06-09 00:00:00","SAL": 2450.00,"COMM": null,"DEPTNO": 10}
{"EMPNO": 7788,"ENAME": "SCOTT","JOB": "ANALYST","MGR": 7566,"HIREDATE": "1987-04-19 00:00:00","SAL": 1500.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7839,"ENAME": "KING","JOB": "PRESIDENT","MGR": null,"HIREDATE": "1981-11-17 00:00:00","SAL": 5000.00,"COMM": null,"DEPTNO": 10}
{"EMPNO": 7844,"ENAME": "TURNER","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-09-08 00:00:00","SAL": 1500.00,"COMM": 0.00,"DEPTNO": 30}
{"EMPNO": 7876,"ENAME": "ADAMS","JOB": "CLERK","MGR": 7788,"HIREDATE": "1987-05-23 00:00:00","SAL": 1100.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7900,"ENAME": "JAMES","JOB": "CLERK","MGR": 7698,"HIREDATE": "1981-12-03 00:00:00","SAL": 950.00,"COMM": null,"DEPTNO": 30}
{"EMPNO": 7902,"ENAME": "FORD","JOB": "ANALYST","MGR": 7566,"HIREDATE": "1981-12-03 00:00:00","SAL": 3000.00,"COMM": null,"DEPTNO": 20}
{"EMPNO": 7934,"ENAME": "MILLER","JOB": "CLERK","MGR": 7782,"HIREDATE": "1982-01-23 00:00:00","SAL": 1300.00,"COMM": null,"DEPTNO": 10}
0: jdbc:hive2://CentOS:10000> add jar /usr/apache-hive-1.2.2-bin/hcatalog/share/hcatalog/hive-hcatalog-core-1.2.2.jar;
CREATE EXTERNAL TABLE t_emp_json(
    empno INT,
    ename STRING,
    job STRING,
    mgr INT,
    hiredate TIMESTAMP,
    sal DECIMAL(7,2),
    comm DECIMAL(7,2),
    deptno INT)
    ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
0: jdbc:hive2://CentOS:10000> load data local  inpath '/root/baizhi/t_emp_json' overwrite into table t_emp_json;
    
0: jdbc:hive2://CentOS:10000> select empno,ename,job,mgr,hiredate,sal,comm,deptno from t_emp_json;
CREATE TABLE t_employee(
    empno INT,
    ename STRING,
    job STRING,
    mgr INT,
    hiredate TIMESTAMP,
    sal DECIMAL(7,2),
    comm DECIMAL(7,2)
    )
    PARTITIONED BY (deptno INT)   
    ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE TABLE t_employee PARTITION (deptno=10) select empno,ename,job,mgr,hiredate,sal,comm from t_emp_json where deptno=10;
0: jdbc:hive2://CentOS:10000> INSERT INTO TABLE t_employee PARTITION (deptno=10) select empno,ename,job,mgr,hiredate,sal,comm from t_emp_json where deptno=10;

更常见的做法:
0: jdbc:hive2://CentOS:10000> create TEMPORARY table t_tmp as select empno,ename,job,mgr,hiredate,sal,comm,deptno from t_emp_json where deptno=10;

将查询结果插入多个表

FROM from_statement
INSERT OVERWRITE TABLE tablename1 
[PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;
from t_emp_json 
INSERT OVERWRITE TABLE t_employee partition(deptno=10) select empno,ename,job,mgr,hiredate,sal,comm  where deptno=10 
INSERT OVERWRITE TABLE t_employee partition(deptno=20) select empno,ename,job,mgr,hiredate,sal,comm  where deptno=20 
INSERT OVERWRITE TABLE t_employee partition(deptno=30) select empno,ename,job,mgr,hiredate,sal,comm  where deptno=30

插入动态分区

0: jdbc:hive2://CentOS:10000> set hive.exec.dynamic.partition.mode=nonstrict; -- 开启动态分区(非严格模式)
No rows affected (0.016 seconds)
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE TABLE t_employee PARTITION (deptno)  SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_emp_json ;
结果写出到文件系统
INSERT OVERWRITE [LOCAL] DIRECTORY directory1
  [ROW FORMAT row_format] [STORED AS file_format] 
  SELECT ... FROM ...
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE DIRECTORY '/hive/result' row format delimited fields terminated by ',' STORED AS textfile SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee; -- 不加LOCAL关键字时,目录必须是HDFS路径