hive 权威
1、创建数据库
create database if not exists financials
comment ‘hold all financial tables’
with dbproperties(‘creator’=‘wuwc’,‘date’=‘2019-12-09’);
2、设置变量和属性
set hive.cli.print.current.db=true;
3、删除数据库
drop database if exists financials cascade;
4、修改数据库
alter database financials set dbproperties(‘updated’=‘greekw’)
5、创建表
- 内部表
create table if not exists employees(
name STRING,
salary FLOAT,
subordinates ARRAY,
deductions MAP<STRING,FLOAT>
)
row format delimited
fields terminated by ‘\001’
collection items terminated by ‘\002’
map keys terminated by ‘\003’
line terminated by ‘\n’
comment ‘员工表’
tblproperties(‘creator’=‘wuwc’,‘date’=‘2019-12-09’)
location ‘/user/hive/warehouse/financials.db/employees’; - like创建表
create table if not exists employees02
like financials.employees;
create external table if not exists financials.stocks
(
stock_code STRING,
symbol STRING,
price_open FLOAT,
price_high FLOAT,
price_low FLOAT,
price_close FLOAT
)
row format delimited fields terminated by ‘,’
location ‘/data/stocks’;
- 外部表
6、查看表
describe formatted employees;
7、分区表和管理表
-
内部分区表
create table if not exists employees_part(
name STRING,
salary FLOAT,
subordinates ARRAY,
deductions MAP<STRING,FLOAT>
)
partitioned by (country string,state string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’ ; -
外部分区表
create external table if not exists log_msg(
hms int,
serverity string,
server string,
processId int,
msg string
)
partitioned by (year int,month int,day int)
row format delimited fields terminated by ‘\t’;
修改表
1、表重命名
alter table log_msg rename to log_msg_new
2、增删改表的分区
alter table log_msg add if not exists
partition by(year=2019,month=12) location ‘/logs/2019/12’;
alter table log_msg drop if exists partition(year=2019,month=12);
3、增删改列
alter table log_msg add columns(
app_name string comment ‘应用名称’
)
4、修改表
-
修改表属性
- alter table table_name set tblproperties(‘key’=‘value’);
-
修改表其他属性
-
alter table … achive/unachive/touch partition; 分区操作
-
alter table table_name partition(year=2019,month=12) enable no_drop; 开启分区保护
数据操作
create table t10(
id int
,name string
,hobby array
,add map<String,string>
)
partitioned by (pt_d string,sex string)
row format delimited
fields terminated by ‘,’
collection items terminated by ‘-’
map keys terminated by ‘:’
;
create table t1(
id int
,name string
,hobby array
,add map<String,string>
)
partitioned by (pt_d string)
row format delimited
fields terminated by ‘,’
collection items terminated by ‘-’
map keys terminated by ‘:’
;
装载数据
从本地文件系统导入数据
从HDFS中导入数据
从其他的Hive表中导入数据
创建表的同时导入数据
- load data local inpath ‘本地路径’ overwrite into table table_name partition();
正确操作:
load data local inpath ‘/data/hive’ overwrite into table log_msg partition(year=2019,month=12,day=10);
insert overwrite table log_msg partition(year=2019,month=12,day=11) select * from log_msg;
Need to specify partition columns because the destination table is partitioned
Partition not found ‘12’
-
load data inpath ‘分布式文件路径’ overwrite into table table_name partition();
-
insert overwrite table table_name partition() select *from table_name;
-
动态分区和静态分区,静态分区必须在动态分区前;
- hive.exec.dynamic.partition=true:开启动态分区; - hive.exec.dynamic.partition.mode=nostrict: 容许所有分区都是动态; - hive.exec.max.dynamic.partitions.pernode=100:每个mapper或reduce创建的最大动态分区个数; - hive.exec.max.dynamic.partitions=1000:最大可创建动态分区个数; - hive.exec.max.created.files=1000:全局可以创建的最大文件个数;
from stu
insert into table student01 partition(city=‘beijing’) select id, sname, age where city=‘beijing’
insert into table student02 partition(city=‘shanghai’) select id, sname, age where city=‘shanghai’;
- 创建表
create external table if not exists student(
id string,
sname string,
age int
)
row format delimited fields terminated by ‘,’;
create table if not exists student_hdfs(id string,sname string,age int)
row format delimited fields terminated by ‘,’
lines terminated by ‘\n’;
create table if not exists student_insert(id string,sname string,age int)
row format delimited fields terminated by ‘,’
lines terminated by ‘\n’;
-
从本地加载数据
load data local inpath ‘/data/hive/student.txt’ overwrite into table student; -
从hdfs
load data inpath ‘/user/hdfs/student.txt’ overwrite into table student_hdfs;
-
错误:
Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MoveTask -
insert 方式
insert overwrite table student_insert select * from student; -
from insert方式
from student insert into table student_insert select id,sname,age where age>28; -
创建分区表
create external table if not exists student_part(id string,sname string,age int)
partitioned by (city string)
row format delimited fields terminated by ‘,’
lines terminated by ‘\n’;
create external table if not exists student__double_part(id string,sname string,age int)
partitioned by (country string,city string)
row format delimited fields terminated by ‘,’
lines terminated by ‘\n’;
- 添加分区
ALTER TABLE student_part ADD if not exists PARTITION(city=‘beijing’) location ‘分区位置’
FAILED:SemanticException table is not partitioned but partition spec exists:{address=china}
在新建表的时候,并没有创建分区列address,所以只有在存在分区列的表上执行增加分区的操作,才会成功。
- 加载分区数据
ALTER TABLE student_part ADD if not exists PARTITION(city=‘shanghai’) - load data local inpath ‘/data/hive/student.txt’ overwrite into table student_part partition(city=‘beijing’);
- insert 方式
from student
insert into table student_part partition(city=‘guangdong’) select id,sname,age
insert into table student_part partition(city=‘chongqing’) select id,sname,age;
from student
insert into table student_double_part partition(country='china,'city=‘chongqing’) select id,sname,age;
-
修复分区
dfs -mkdir /user/hive/warehouse/financials.db/student__double_part/country=japan/city=tokoy;
- msck repair table table_name;
-
查看分区
- show partitions table_name;
-
动态分区
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nostrict;
from student_part
insert into table student_part partition(city) select id, sname, age, city
insert into table student_part partition(city) select id, sname, age, city ;
-
静态和动态混合的
-
创建表的同时导入数据
- create table student_part_create as select * from student_part;
-
分桶:分区和分桶都是按字段来组织数据的存放,
分区是相同的字段值存放在一个文件中,而分桶是字段哈希值相同的数据存放在一个文件中。
create table student_cluster( id string, sname string, age int)
clustered by(id) into 2 buckets
row format delimited fields terminated by ‘,’
lines terminated by ‘\n’; -
数据导出
-
导出到本地
insert overwrite directory ‘’ select * from student where age>28;
insert overwrite local directory ‘/data/hive/student_export.txt’ select * from student where age>28;
- 导出到hdfs
http://blog.leanote.com/post/yongjian3311@163.com/Hive%E4%BF%AE%E6%94%B9%E8%A1%A8%E7%BB%93%E6%9E%84
####hive查询
case …when … then
select id,sname,age,
case
when age>30 then ‘old’
when age>30 and age<20 then ‘mid’
else ‘young’
end as level from student;
-
hive join查询
-
left/right/full/笛卡尔积/
-
关闭笛卡尔积:set hive.mapred.mode=strict
-
map-side join查询
-
-
order by /sort by等
-
set hive.mapred.mode=strict 则使用order by 需要加上limit
-
order by 全局排序,sort by 局部排序
-
distribute by 和group by
-
-
类型转换函数cast(field as int ),round(),floor()
索引
模式
-
分区方案
-
唯一键和标准化
-
每个表的分区
调优
-
explain/explain externed 使用
-
limit限制调整
-
join 优化
-
本地模式
-
并行执行
-
严格模式
-
调整mapper和reducer个数
-
JVM重用
-
动态分区数调整
-
单个mr中多个group by
文件格式和压缩方法
-
查看编解码器
-
设置中间压缩
set hive.exec.compress.intermediate=true -
设置输出压缩
set hive.exec.compress.output=true -
sequence file 格式
支持NONE,RECORD,BLOCK等
hive函数
自定义hive文件格式
-
设置表的文件格式:stored as 文件格式;
-
文件格式
-
sequencefile
-
rcfile
-
-
记录格式
-
serDe
-
CSV 和TSV
-
XML UDF
-
xpth
-
json serDe
-
hive thrift服务
-
hive server或hive thrift是一个基于thrift协议的组件,可以基于java,c++,python等语言远程访问;
-
启动hive server
- bin/hive --service hiveserver &
-
管理hiveserver
- httpproxy负载hiveserver
-
hive metastore
-
启动metastore
-
bin/hive --service metastore &
-
hive 连接外部存储
- hive连接hbase
方案一 建立 Hive 表,关联 HBase 表,插入数据到 Hive 表的同时能够影响 HBase 表
- 创建连接外部表
create table hive_hbase_emp_table(
empno int,
ename string comment ‘姓名’,
job string,
mgr int,
hiredate string,
sal double,
deptno int
)
stored by ‘org.apache.hadoop.hive.hbase.HBaseStorageHandler’
with serdeproperties(“hbase.columns.mapping”=":key,info:ename,info:job,info:mgr,info:hiredate,info:sal,info:deptno")
TBLPROPERTIES (“hbase.table.name” = “hbase_emp_table”);
CREATE TABLE emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int)
row format delimited fields terminated by ‘,’;
load data local inpath ‘/data/hive/emp.txt’ into table emp;
insert into table hive_hbase_emp_table select * from emp;
SemanticException [Error 10101]: A non-native table cannot be used as target for LOAD
方案二:
在 HBase 中已经存储了某一张表 hbase_emp_table,
然后在 Hive 中创建一个外部表来 关联 HBase 中的 hbase_emp_table 这张表,使之可以借助 Hive 来分析 HBase 这张表中的数 据
create external table hive_crs_data_report(
rowkey string comment ‘查询主键’,
params string comment ‘请求参数’,
timestamp string comment ‘时间戳’,
dataClass string comment ‘类型’,
data string comment ‘数据’
)
stored by ‘org.apache.hadoop.hive.hbase.HBaseStorageHandler’
with serdeproperties(“hbase.columns.mapping”=":key,cf1:params,cf1:timestamp,cf1:dataClass,cf1:data")
TBLPROPERTIES (“hbase.table.name” = “crs_data_report_test_able”);
hive权限管理和锁
-
hive0.10后版本权限控制是基于元数据控制,之前版本是通过linux的用户组合用户控制
-
使用Hive的元数据配置权限之前必须现在hive-site.xml中配置两个参数
-
hive.security.authorization.enabled:开启权限验证,默认为false
-
hive.security.authorization.createtable.owner.grants:参数是指表的创建者对表拥有所有权限
-
hive.security.authorization.enabled参数是开启权限验证,默认为false。
hive.security.authorization.createtable.owner.grants参数是指表的创建者对表拥有所有权限
-