1.Hive中数据库的基本操作
1.1 Create/Drop
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
[COMMENT database_comment]
[LOCATION hdfs_path]
[WITH DBPROPERTIES (property_name=property_value, ...)];
- create database
create database db_hive_01;
## 标准
create database if not exists db_hive_02;
##创建数据库并指定存放目录
create database if not exists db_hive_03
location '/user/weblog/hive/warehouse/db_hive_03.db';
- 查看数据库
show databases;
## 查看有几个数据库
show databases like 'db_hive*';
desc database db_hive_01;
- 删除数据库
drop database db_hive_01;
## 数据库中有表用cascade级联删除
drop database db_hive_01 cascade;
1.2 create table
官方建表语法:
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive 0.14.0 and later)
[(col_name data_type [COMMENT col_comment], ... [constraint_specification])]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[SKEWED BY (col_name, col_name, ...) -- (Note: Available in Hive 0.10.0 and later)
ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
[STORED AS DIRECTORIES]]
[
[ROW FORMAT row_format]
[STORED AS file_format]
| STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)] -- (Note: Available in Hive 0.6.0 and later)
]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)] -- (Note: Available in Hive 0.6.0 and later)
[AS select_statement]; -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
LIKE existing_table_or_view_name
[LOCATION hdfs_path];
data_type
: primitive_type
| array_type
| map_type
| struct_type
| union_type -- (Note: Available in Hive 0.7.0 and later)
primitive_type
: TINYINT
| SMALLINT
| INT
| BIGINT
| BOOLEAN
| FLOAT
| DOUBLE
| DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
| STRING
| BINARY -- (Note: Available in Hive 0.8.0 and later)
| TIMESTAMP -- (Note: Available in Hive 0.8.0 and later)
| DECIMAL -- (Note: Available in Hive 0.11.0 and later)
| DECIMAL(precision, scale) -- (Note: Available in Hive 0.13.0 and later)
| DATE -- (Note: Available in Hive 0.12.0 and later)
| VARCHAR -- (Note: Available in Hive 0.12.0 and later)
| CHAR -- (Note: Available in Hive 0.13.0 and later)
array_type
: ARRAY < data_type >
map_type
: MAP < primitive_type, data_type >
struct_type
: STRUCT < col_name : data_type [COMMENT col_comment], ...>
union_type
: UNIONTYPE < data_type, data_type, ... > -- (Note: Available in Hive 0.7.0 and later)
row_format
: DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
[NULL DEFINED AS char] -- (Note: Available in Hive 0.13 and later)
| SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
file_format:
: SEQUENCEFILE
| TEXTFILE -- (Default, depending on hive.default.fileformat configuration)
| RCFILE -- (Note: Available in Hive 0.6.0 and later)
| ORC -- (Note: Available in Hive 0.11.0 and later)
| PARQUET -- (Note: Available in Hive 0.13.0 and later)
| AVRO -- (Note: Available in Hive 0.14.0 and later)
| INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
constraint_specification:
: [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE ]
[, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE ]
- 内部表的创建
##创建表有三种方法:
-- Managed (internal) table: Hive owns the data files; DROP TABLE removes
-- both the metadata and the data under the warehouse directory.
create table if not exists default.student(
sno string comment '学号',
sname string,
sex string,
sage int,
sdept string comment '所在班级'
)
-- Fields separated by the non-printing \001 character (Hive's default delimiter)
row format delimited fields TERMINATED by '\001'
stored as textfile;
-- 或者使用select创建
##该方式既复制表结构又加载数据
##注意:该方法创建的表无分区,select的表中的分区字段会被加载到目标表中
-- CTAS (create table as select): copies schema AND data in one statement.
-- Bug fix: the AS keyword is required in Hive CTAS; without it the
-- statement fails to parse. Note that partition columns of the source
-- table become ordinary columns in the target table.
create table if not exists default.student_tmp
as select sno,sname,sex,sage from default.student;
## 或者使用like创建表
-- LIKE: copies only the table definition (columns, SerDe, storage format);
-- no data rows are copied — the new table starts empty.
create table if not exists default.student_like
like default.student;
- 外部表的创建
- 数据文件存储在指定的hdfs目录下,删除表数据不会从文件系统中删除
-- External table: Hive manages only the metadata; DROP TABLE removes the
-- definition but leaves the files at LOCATION untouched on HDFS.
-- NOTE(review): 'hdfs_location' is a placeholder — replace with a real path.
create external table if not exists default.student(
sno string comment '学号',
sname string,
sex string,
sage int,
sdept string comment '所在班级'
)
row format delimited fields TERMINATED by '\001'
location 'hdfs_location';
- 创建分区表
- 分区表实际上就是对应一个HDFS文件系统上的独立的文件夹,该文件夹下是该分区所有的数据文件。Hive中的分区就是分目录,把一个大的数据集根据业务需要分割成更小的数据集。
- 在查询时通过where子句中的表达式来选择查询所需要的指定的分区,这样的查询效率会提高很多。
-- Partitioned external table: each distinct prov_id value maps to its own
-- sub-directory under LOCATION; a WHERE filter on prov_id lets Hive read
-- only the matching directories (partition pruning).
-- NOTE(review): 'hdfs_location' is a placeholder — replace with a real path.
create external table if not exists default.student(
sno string comment '学号',
sname string,
sex string,
sage int,
sdept string comment '所在班级'
)partitioned by(prov_id string)
row format delimited fields TERMINATED by '\001'
location 'hdfs_location';
- 分桶排序表
-- Bucketed + sorted table: within each partition, rows are hashed on sno
-- into 4 bucket files, and each bucket is kept sorted by sage.
-- NOTE(review): 'hdfs_location' is a placeholder — replace with a real path.
create external table if not exists default.student(
sno string comment '学号',
sname string,
sex string,
sage int,
sdept string comment '所在班级'
)partitioned by(prov_id string)
clustered by(sno) sorted by(sage) into 4 buckets
row format delimited fields TERMINATED by '\001'
location 'hdfs_location';
2. hive数据加载或导出
2.1 hive表加载数据
测试数据:
emp表
7369 SMITH CLERK 7902 1980-12-17 800 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600 300 30
7521 WARD SALESMAN 7698 1981-2-22 1250 500 30
7566 JONES MANAGER 7839 1981-4-2 2975 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250 1400 30
7698 BLAKE MANAGER 7839 1981-5-1 2850 30
7782 CLARK MANAGER 7839 1981-6-9 2450 10
7788 SCOTT ANALYST 7566 1987-7-13 3000 20
7839 KING PRESIDENT 1981-11-17 5000 10
7844 TURNER SALESMAN 7698 1981-9-8 1500 0 30
7876 ADAMS CLERK 7788 1987-7-13 1100 20
7900 JAMES CLERK 7698 1981-12-3 950 30
7902 FORD ANALYST 7566 1981-12-3 3000 20
7934 MILLER CLERK 7782 1982-1-23 1300 10
LOAD DATA [LOCAL] INPATH 'filepath'
[OVERWRITE] INTO TABLE tablename
[PARTITION (partcol1=val1, partcol2=val2 ...)]
- load data [local] inpath 'filepath' [overwrite] into table tablename partition (partcol1=val1,…);
- 原始文件的存储位置:[local]
- 本地 local
- hdfs
- 对表的数据是否覆盖:[overwrite]
- 覆盖 overwrite
- 追加
分区表加载,特殊性
- partition (partcol1=val1,…)
加载文件到hive表:
## 加载本地文件到hive表
load data local inpath '/home/hadoop/hivedata/emp.txt' into table default.emp;
## 加载HDFS文件到Hive
load data inpath '/user/weblog/datas/emp.txt' into table default.emp;
## 加载数据并覆盖表中已有的数据
load data inpath '/user/weblog/datas/emp.txt' overwrite into table default.emp;
## 通过 insert 加载
-- Load via INSERT ... SELECT. Bug fix: the original template
-- "insert [overwrite] into table" is invalid HiveQL — the two forms are:
--   append:   INSERT INTO TABLE t SELECT ...
--   replace:  INSERT OVERWRITE TABLE t SELECT ...   (no INTO)
insert into table default.emp select * from default.emp;
-- insert overwrite table default.emp select * from default.emp;
##注意:以上方法在内部表中使用,外部表一般在创建表的时候就在location中指定好了数据路径;
2.2 hive表导出数据
##导出数据到本地目录
-- Export query results to a local directory; OVERWRITE replaces any
-- existing contents of that directory.
insert overwrite local directory '/home/hadoop/hive_emp'
select * from default.emp;
##导出数据到本地目录,并按指定分隔符分隔字段
-- Export with an explicit field delimiter. ROW FORMAT on a directory
-- export requires Hive 0.11 or later.
insert overwrite local directory '/home/hadoop/hive_emp'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' COLLECTION ITEMS TERMINATED BY '\n'
select * from default.emp;
##查询结果重定向到指定文件
hive -e 'select * from default.emp;' > /home/hadoop/hive_emp/emp.txt
3. Hive中的查询语句
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[ORDER BY col_list]
[CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY col_list]
]
[LIMIT [offset,] rows]
- max、min、count、sum、avg
-- Basic built-in aggregate functions over the whole emp table.
select count(*) cnt from emp;     -- total row count
select max(sal) max_sal from emp; -- highest salary
select min(sal) min_sal from emp; -- lowest salary
select sum(sal) from emp;         -- salary total
select avg(sal) from emp;         -- average salary
- group by having
##分组:
##每个部门的平均工资
select deptno,avg(sal) avg_sal from emp group by deptno ;
##每个部门中每个岗位的最高薪水
select deptno,job,max(sal) max_sal from emp group by deptno,job;
##每个部门的平均薪资大于2000的部门
select deptno,avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;
4. hive查看数据库及表信息
- 查看数据库
show databases;
- 查看某个数据库中所有的表
use database_name;
show tables;
- 查看表结构
##查看表字段信息
desc tablename;
##查看表详细信息
desc formatted tablename;
##查看表的建表语句
show create table tablename;
5. Order, Sort, Cluster, and Distribute By区别
- order by 全局数据的排序,仅仅只有一个reduce
select * from emp order by empno desc;
- sort by 对每个reduce内部数据进行排序,对全局的结果集不是排序的
##实现reduce内部排序先要设置reduce task个数
set mapreduce.job.reduces=<number>
select * from emp sort by empno asc;
##将结果集写入到本地文件中查看效果
insert overwrite local directory '/home/hadoop/hivedata/sortby'
select * from emp sort by empno asc;
- distribute by
- 类似于MapReduce中分区partition,对数据进行分区,结合 sort by 进行使用
-- distribute by routes rows to reducers by hash of deptno;
-- sort by then orders the rows within each reducer's output.
insert overwrite local directory '/home/hadoop/hivedata/distby'
select * from emp distribute by deptno sort by empno asc;
##注意事项:distribute by 必须要在 sort by 前面
- cluster by 当distribute by和sort by字段相同时,可以使用cluster by;
-- cluster by empno is shorthand for "distribute by empno sort by empno"
-- (ascending only).
insert overwrite local directory '/home/hadoop/hivedata/clusterby'
select * from emp cluster by empno;