一、常用操作
1.数据库操作
创建数据库:create database if not exists db_name [location];
删除数据库:drop database if exists db_name ;
使用数据库:use db_name;
描述数据库(不包含扩展信息):desc database db_name;
描述数据库(包含扩展信息):desc database extended db_name;
强制删除数据库:drop database db_name cascade;
修改数据库默认位置:hive>create database db_name
location '路径';
2.表操作
创建表:
普通的创建:
(不带注释)create table if not exists tbname( clo type ……)
row format delimited fields terminated by '\t'
stored as textfile
location ''
(带注释)create table if not exists tbname( clo type comment '描述信息')
(外部表)create external table if not exists tbname( clo type ……)
子查询创建:(将子查询的数据和结构作为新的表)
create table if not exists stu_as as select id from test;
like创建:(仅仅复制test表的结构)
create table if not exists stu_like like test;
删除表:drop table tb_name;
清空表:TRUNCATE TABLE table_name;(不支持外部表)
查询表:select * from tb_name;
在当前数据库查看另一数据库的表:show tables in db_name;
查询表结构:desc table_name;
desc formatted table_name;
查看分区信息:show partitions table_name;
修改表名:alter table table_name rename to table_name;
显示扩展信息:desc extended tb_name;
显示格式化信息:desc formatted tb_name;
显示字段名称:set hive.cli.print.header=true;(默认是关闭状态)
修改列信息:alter table tb_name
change column hms hours_minutes_seconds int
comment '...'
after aaa;
(将字段名hms改为hours_minutes_seconds,并修改类型,修改描述,并移动到aaa字段后)
增加列:alter table tb_name add columns(
name string comment'');
删除或替换列:alter table tb_name replace columns(
name string comment'');
(只能用于使用了DynamicSerDe和MetadataTypedColumnsetSerDe两种SerDe模块的表)
修改表注释: alter table table_name set TBLPROPERTIES('comment' = 'new_comment');
3.hive常用命令
hive中执行shell命令:
!clear;
hive中执行hdfs命令:
dfs -ls /user;
二、分区表
1.手动创建分区表:
create table emp1_part like db_0827.emp_part;
2.手动添加数据:
dfs -put /opt/testfile/emp.txt /user/hive/warehouse/test.db/emp1/;
3.申明手动创建的分区
修复表:msck repair table tb_name;
添加分区:alter table tb_name add partition(time='0829');
删除分区:alter table tb_name drop if exists partition(time='0829');
4.设置分区表的查询模式
严格模式:set hive.mapred.mode=strict;
非严格模式:set hive.mapred.mode=nonstrict;
严格模式下,若在查询时,where没有加分区过滤的话,将会禁止提交任务(默认情况下是非严格模式)
5.动态分区属性
set hive.exec.dynamic.partition=true;开启动态分区
set hive.exec.dynamic.partition.mode=nonstrict;允许所有分区都是动态的
set hive.exec.max.dynamic.partitions.pernode=100;每个mapper或reducer可创建的最大动态分区数
set hive.exec.max.created.files=1000000;全局可创建的最大文件个数
三、hive中导入数据的方式
1.加载本地文件到hive
load data local inpath '/opt/testfile/emp.txt' into table emp2;
2.加载hdfs文件到hive
load data inpath '/log/emp.txt' into table emp2;
3.覆盖表中的数据
load data local inpath '/opt/testfile/emp.txt'
overwrite into table emp2;
load data local inpath '/opt/testfile/emp.txt'
overwrite into table emp2
partition(year='2017');(分区目录不存在会自动创建目录,然后再将数据拷贝到该目录下)
4.子查询方式创建表时
create table if not exists tbname as select ……
5.insert方式
create table emp3 like emp;
insert overwrite table emp3 select * from emp;
6.location 加载
create table emp_loc (col ……)
row format……
location 'hdfs_path';
7.单个查询数据创建表并插入数据
create table tb_name1 as
select name,age from tb_name2
where age>18;
(常用于从一个大的宽表中选取部分需要的数据)
四、hive数据导出的几种方式
1.导出到本地目录
insert overwrite local directory '/opt/testfile/emp' select * from emp;
指定分隔符:
insert overwrite local directory '/opt/testfile/emp' row format delimited fields terminated by '\t' select * from emp;
2.导出到hdfs路径
insert overwrite directory '/log/emp' select * from emp;
注:不能指定分隔符
3.通过hive shell命令来保存
bin/hive -e 'show tables;' >/opt/testfile/test
bin/hive -f /opt/testfile/test.sql >/opt/testfile/test
4.sqoop:以hdfs为中心
hdfs -> mysql
hive -> mysql
mysql -> hdfs
5.hive的表数据 -> hdfs的文件
dfs -get /log/emp/* /opt/testfile/;
五、hive中的export import
export:
EXPORT TABLE tablename [PARTITION (part_column="value"[, ...])]
TO 'export_target_path' [ FOR replication('eventid') ]
export table emp to '/log/emp_exp';
导出了元数据和数据文件
import :
IMPORT [[EXTERNAL] TABLE new_or_original_tablename [PARTITION (part_column="value"[, ...])]]
FROM 'source_path'
[LOCATION 'import_target_path']
import table emp4 from '/log/emp_exp';
注:路径只能是hdfs
六、hive中常用的语句
1.查询字段
select empno ,ename ,deptno ,sal from emp;
2.where 、limit 、distinct
select empno ,ename ,deptno ,sal from emp where deptno = '10';
select empno ,ename ,deptno ,sal from emp limit 3;
select distinct deptno from emp;
3.> 、<、 =、 between and 、in 、not in 、 is null、 is not null
select empno ,ename ,deptno ,sal from emp where sal > 3000;
select empno ,ename ,deptno ,sal from emp where sal between 0 and 3000;
select empno ,ename ,deptno ,comm from emp where comm is not null;
4.show functions;聚合函数:count()/sum()/max()/min()/avg()
select avg(sal) as avg_sal from emp;
5.group by/having
select deptno ,avg(sal) from emp group by deptno;
select deptno ,job ,avg(sal) from emp group by deptno,job ;
6.join
等值连接:
select e.empno ,e.ename ,d.deptno ,d.dname from emp e join dept d on e.deptno=d.deptno;
左连接:以左表为准
select e.empno ,e.ename ,d.deptno ,d.dname from emp e left join dept d on e.deptno=d.deptno;
右连接:以右表为准
select e.empno ,e.ename ,d.deptno ,d.dname from emp e right join dept d on e.deptno=d.deptno;
全连接:
select e.empno ,e.ename ,d.deptno ,d.dname from emp e full join dept d on e.deptno=d.deptno;
7.三种特殊排序
sort by:对每一个reduce处理内容进行排序
select empno ,ename ,deptno ,sal from emp sort by sal;
distribute by: 类似于分区,决定交给某一个reduce进行处理,一般都与sort by 进行连用
select empno ,ename ,deptno ,sal from emp distribute by deptno sort by empno desc;
cluster by:一般用于sort by 和 distribute by 的字段相同
select empno ,ename ,deptno ,sal from emp cluster by empno;
8.hive查询top10
select empno ,ename ,deptno ,sal from emp order by sal desc limit 10;(order by 保证全局排序,desc 取最大的前10条)
9.hive 数据类型转换
string转float:cast(string AS float)
string转int:cast(string AS int)
string转date:cast(string AS date)
示例:
SELECT name, salary FROM employees WHERE cast(salary AS FLOAT) < 100000.0;