hive的语句
- 查看表在hdfs上的目录
show create table 表名
- 设置reduce个数
set mapred.reduce.tasks=4; - 创建内部表并指定分隔符
create table person(name STRING,age INT)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
- 映射文件(内部表)
- 将文件copy到hdfs下对应表下的文件夹
- 从本地文件映射,并且load会将文件自动保存到hdfs上的/user/hive/warehouse中
- load data hdfs上的数据时,会将数据剪切到hive表所在文件中,无论内部表还是外部表。
LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/person.txt' OVERWRITE INTO TABLE person;
- 映射文件(外部表创建及映射)
create external table fz_external_table(id int,name string,age int,tel string)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY ','
> STORED AS TEXTFILE
> location '/user/hive/external/fz_external_table';
- 必须是目录,不能是文件(在hdfs上)使用location的话,在hive的数据仓库看不到,不使用location的话,会保存到hive的数据仓库上(删除表数据不丢失);
分区表
- 创建
- partition 必须放在row format前面
create external table demo1(id int,name string,age int)partitioned by (month string,day string)row format delimited fields terminated by ',';
- 导入数据增加分区
分区表导入数据要是用load data
load data local inpath '/opt/aa' into table demo1 partition(month='201808',day='02');
- 只创建分区
alter table demo1 add partition(month=’201808’,day=’03’); - 删除分区
alter table demo1 drop partition(month='201808',day='03');
- 修改分区
alter table demo1 partition()
- 查询数据
修复元数据
msck repair table dept_part
修改分区
alter table demo1 partition(month='201808',day='02')rename to partition (month='201888',day='03');
查询数据
select * from demo1 where month='201808' and day = '2';
- 显示一张表有几个分区
show partitions demo1
数据迁移
- 加载本地文件到Hive表
load data local inpath ‘path/file’ into table 表名称 ; - 加载HDFS文件到Hive表
load data inpath ‘path/file’ into table 表名称 ; - 加载数据覆盖表中已有的数据
load data local inpath ‘path/file’ overwrite into table 表名称 ; - 创建表时通过select加载
create table db_0831.emp_as as select * from emp ; - 创建表的时候通过location指定加载
- 用insert命令加载
insert into table 表名 select * from emp;
insert overwrite table 表名 select * from emp;
create table as select
限制
- 目标表不能是分区表
- 目标表不能是外部表
- 目标表不能是list bucketing
数据导出
- 通过insert ..directory导出
insert overwrite local directory "/opt/mydata/exp" row format delimited fields terminated by '\t' select * from emp ;
- 导出到hdfs
insert overwrite directory “hdfspath/" select * from emp ;
查询语句
- 添加字段
hive (test)> alter table t_t2 add columns(n int);
- 查看分区信息
show partitions demo1; - 修改表名
ALTER TABLE oldtablename RENAME TO newtablename; - 只查前两行
select * from student limit 2 ; - 统计一个表的行数
select count(*) from student ; - 查询最大值
select max(id) from t-t2; - 查询最小值
select min(id) from t-t2; - 查询平均值
select avg(id) from t-t2; - 查询总值
select sum(id) from t-t2; 内连接查询
select * from t_t1 a join t_t2 b on a.id=b.id;//a和b是别名
select a.name,b.name from t_t1 a join t_t2 b on a.id=b.id;
sort by
- 降序
select * from emp sort by sal desc;
order by
- 按照id排序
select * from person order by id; - 先按照id排序后按照name排序
select * from person order by id,name; - 降序排列
select * from person order by id desc; - 升序排列
select * from person order by id asc; - id升序,age降序
select * from person order by id asc,age desc;
group by
- 以id分组,并且求sum
select id,sum(age) from person order by id; - 对多行进行分组
select id,age from person group by id,age;
distribute by
- 以id分开进行reduce,以id排序
select * from person distribute by id sort by id; - 以id分开进行reduce,以age排序
select * from person distribute by id sort by age;
cluster by
- 类似于mapreduce中的分区
select * from person cluster by id;
和select * from person distribute by id sort by id;
效果相同
使用distribute和sort进行分组排序
select * from person distribute by id sort by age;
复制表结构
create table demo1 like demo2;
连表查询
修改表
- 修改字段名a
alter table demo change a a1 int; - 重命名
alter table demo rename to new name;
显示命令
- 显示分区表信息
show partitions table_name; - 显示当前版本hive支持的所有方法
show functions; - 查看表信息
desc extends table_name; - 查看表信息(格式化美观)
desc formatted table_name; - 查看数据库相关信息
describe database database_name;
视图与索引
- 创建视图
create view view_name as select * from emp where sal > 2000; - 查询视图
select * from view_name - 删除视图
drop view view_name; - 创建索引
Hive>create index index_test on table emp (empno) as ‘org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler’ with deferred rebuild ;—增加数据重建索引 - 重构索引
alter index index_test on emp rebuild; - 删除索引
drop index index_test on student;
hive常用函数
- 查看有那些函数
show functions; - rand函数,产生0到1的随机数
select rand() - concat函数(拼接字符串)
select concat(id,name)from person;
select concat(id,”_”,name)from person; - substr函数(截取字符串)
select substr(name,1,2)from person; - cast类型转换
select cast(1.22 as int); - 查看时间
select hour(‘2018-9-8 13:21:23’);
select day(‘2018-9-8 13:21:23’);
hiveUDF函数
- 需要继承org.apache.hadoop.hive.ql.hiveUDF函数
- 需要实现evaluate函数,evaluate函数支持重载
- 配置文件出错,强制刷新
maven-右键maven-uodate project-勾选force update of snapshots/releases; - 配置文件
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>0.13.1</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>0.13.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.5.0</version>
</dependency>
- 转大小写实例代码
package weice.gao.hive;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
public class hiveDemo1 extends UDF{
public Text evaluate(Text s){
if(s==null){return null;}
return new Text(s.toString().toLowerCase());
}
// public static void main(String[] args) {
// hiveDemo1 demo1 = new hiveDemo1();
// System.out.println(demo1.evaluate(new Text("aaaAAA")));
// }
}
- 打jar包运行
- 把jar包加到hive上
add jar /opt/bb.jar; - 创建临时函数
create temporary function demo as ‘weice.gao.hive.hiveDemo1’;
#weice.gao.hive.hiveDemo1表示class在jar包中的位置,demo是函数名 - 运行
select name,demo(name) from person; - 删除jar包
drop temporary function demo;