hive cli
Location of Hive execution logs
Log configuration file: /conf/hive-log4j2.properties
property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
You can check the value of a variable in the Hive CLI
hive> set system:java.io.tmpdir;
system:java.io.tmpdir=/tmp
--default log location for the root user
/tmp/root/hive.log
Forcing local mode
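A minimal sketch, assuming the standard properties (the two auto.* thresholds and their values are examples):
--let Hive decide on its own to run small queries locally
set hive.exec.mode.local.auto=true;
--thresholds that control when local mode is chosen
set hive.exec.mode.local.auto.inputbytes.max=134217728;
set hive.exec.mode.local.auto.input.files.max=4;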

Specifying the data warehouse directory
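A sketch, assuming the standard property name; the directory shown is simply the default:
--normally set in hive-site.xml; /user/hive/warehouse is the default
set hive.metastore.warehouse.dir=/user/hive/warehouse;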

Viewing and using custom and system properties in Hive
--prints all properties
hive>set;
hive>set env:HOME;
hive> set hivevar:col_name=name;
hive> set col_name;
hive> create table test3(id int,${env:LOGNAME} string);
hive> create table test2(id int,${hivevar:col_name} string);
--add configuration options at startup
--show the current db
hive --hiveconf hive.cli.print.current.db=true
hive (default)> set hiveconf:hive.cli.print.current.db;
hiveconf:hive.cli.print.current.db=true
--change it back to hidden
hive (default)> set hiveconf:hive.cli.print.current.db=false;
hive>
--show system properties; Java has read and write access to system properties
hive> set system:user.name;
system:user.name=root
hive> set system:myname=hujiawei;
hive> set system:myname;
system:myname=hujiawei
#run a single command; -S is silent mode and suppresses messages such as OK
hive -S -e 'show tables;'
hive -S -e 'set'|grep warehouse;
#create the src table and load data
create table src(s String);
echo "one row">/tmp/myfile
hive -e "LOAD DATA LOCAL INPATH '/tmp/myfile' into table src"
#run a SQL file
hive -f test.hql
#run a SQL file from inside the Hive CLI
source /root/test/test.hql;
#at startup the Hive CLI automatically loads $HOME/.hiverc; if it does not exist, you can create one, for example containing:
set hive.cli.print.current.db=true;
set hive.exec.mode.local.auto=true;
#hive -i loads the specified file at startup
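For example (the file path here is hypothetical):
hive -i /root/test/init.hql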
All of Hive's default properties are listed explicitly in
/opt/install/hadoop/apache-hive-2.3.6-bin/conf/hive-default.xml.template
- Run shell commands inside Hive with !pwd; run hadoop commands with dfs -ls /
- Comments in Hive scripts use the same syntax as SQL: --
Data types


create table employes(
name string,
salary float,
subordinates array<string>,
deductions map<String,float>,
address struct<street:string,city:string,state:string,zip:INT>
);
Default field delimiters

create table employes(
name string,
salary float,
subordinates array<string>,
deductions map<String,float>,
address struct<street:string,city:string,state:string,zip:INT>
)
row format delimited
fields terminated by '\001'
collection items terminated by '\002'
map keys terminated by '\003'
lines terminated by '\n'
stored as textfile;
sql
database
show databases like 't*';
create database test;
drop database test;
desc database test;
--in Hive a database is a directory on HDFS ending in .db, and a table is a directory named after the table; the location can be specified at creation time
hive> create database test location '/test/test.db';
dfs -ls -R /test
--drop a database that still contains tables
drop database test cascade;
--add a database description
create database test comment 'this is a test db';
--add database properties
create database test comment 'this is a test db' with dbproperties('creator'='hujiawie','date'='2019年12月10日');
--view the properties
desc database extended test;
--modify database properties; new ones can be added but existing ones cannot be deleted
alter database test set dbproperties('creators'='laohu');
table
create table employes(
name string,
salary float,
subordinates array<string>,
deductions map<String,float>,
address struct<street:string,city:string,state:string,zip:INT>
)location '/test/employes'
--copy a table definition
hive> create table employees like employes;
hive> show tables in mydb;
Specifying delimiters
hive> create table t1(
> id int
> ,name string
> ,hobby array<string>
> ,add map<String,string>
> )
> partitioned by (pt_d string)
> row format delimited
> fields terminated by ','
> collection items terminated by '-'
> map keys terminated by ':'
> ;
Partitioned tables
hive> create table t1(
> id int
> ,name string
> ,hobby array<string>
> ,add map<String,string>
> )
> partitioned by (pt_d string)
> row format delimited
> fields terminated by ','
> collection items terminated by '-'
> map keys terminated by ':'
> ;
--load data
1,xiaoming,book-TV-code,beijing:chaoyang-shagnhai:pudong
2,lilei,book-code,nanjing:jiangning-taiwan:taibei
3,lihua,music-book,heilongjiang:haerbin
load data local inpath '/root/test/myfile' overwrite into table t1 partition ( pt_d = '201701');
--load data into another partition
load data local inpath '/root/test/myfile2' overwrite into table t1 partition ( pt_d = '000000');
--select * from t1; now returns rows from both partitions
1 xiaoming ["book","TV","code"] {"beijing":"chaoyang","shagnhai":"pudong"} 000000
2 lilei ["book","code"] {"nanjing":"jiangning","taiwan":"taibei"} 000000
3 lihua ["music","book"] {"heilongjiang":"haerbin"} 000000
1 xiaoming ["book","TV","code"] {"beijing":"chaoyang","shagnhai":"pudong"} 201701
2 lilei ["book","code"] {"nanjing":"jiangning","taiwan":"taibei"} 201701
3 lihua ["music","book"] {"heilongjiang":"haerbin"} 201701
--view the directories and files on dfs
hive> dfs -ls -R /user/hive/warehouse/mydb.db;
drwxr-xr-x - root supergroup 0 2019-12-11 10:56 /user/hive/warehouse/mydb.db/employes
drwxr-xr-x - root supergroup 0 2019-12-11 11:04 /user/hive/warehouse/mydb.db/t1
drwxr-xr-x - root supergroup 0 2019-12-11 11:04 /user/hive/warehouse/mydb.db/t1/pt_d=000000
-rwxr-xr-x 1 root supergroup 474 2019-12-11 11:04 /user/hive/warehouse/mydb.db/t1/pt_d=000000/myfile2
drwxr-xr-x - root supergroup 0 2019-12-11 11:02 /user/hive/warehouse/mydb.db/t1/pt_d=201701
-rwxr-xr-x 1 root supergroup 147 2019-12-11 11:02 /user/hive/warehouse/mydb.db/t1/pt_d=201701/myfile
--adding a partition creates the corresponding directory
hive> alter table t1 add partition(pt_d ='3333');
--dropping a partition deletes the corresponding files (for external tables the files are kept and the partition can be restored with msck repair table table_name)
alter table t1 drop partition (pt_d = '201701');
--note that a partition is really just another column that is used like an index: each value gets its own directory, which improves performance
hive> desc extended t1;
id int
name string
hobby array<string>
add map<string,string>
pt_d string
# Partition Information
# col_name data_type comment
pt_d string
#show the existing partitions
hive> show partitions t1;
pt_d=000000
pt_d=3333
Altering tables
--rename
hive> alter table employes rename to employees;
--change a partition's location (this did not take effect in this test)
hive> alter table t1 partition(pt_d=3333) set location "hdfs://localhost:9000/user/hive/warehouse/mydb.db/t1/pt_d=4444";
--change a column: rename it, add a comment, and move it after another column (first puts it in the first position);
--note that reordering requires the two columns to have the same type
hive> create table src (c1 string,c2 string);
hive> alter table src change column c1 c3 string comment 'test' after c2;
--add columns
hive> alter table src add columns(c4 string comment 'column4');
--delete columns by replacing the whole column list
hive> alter table src replace columns(cl1 string,cl2 string);
--modify table properties
hive> alter table src set tblproperties('name'='hujiawei');
--view table properties
hive> show tblproperties src;
Data manipulation
Loading data

- local means the file is copied from the local filesystem; without local the file is moved within HDFS (so it cannot cross clusters)
- overwrite controls whether the data is appended or replaces the existing data (both forms are sketched below)
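A sketch of the two variants, reusing the t1 table defined earlier; the partition value and the HDFS path are examples:
--copy from the local filesystem and append to the partition
load data local inpath '/root/test/myfile' into table t1 partition (pt_d='201702');
--move a file that already sits in HDFS and overwrite the partition
load data inpath '/test/myfile' overwrite into table t1 partition (pt_d='201702');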
Static and dynamic partitions
- Static partitioning, method 1
- Static partitioning, method 2
- Dynamic partitioning
- Dynamic partition configuration parameters
(the original snippets are missing here; see the sketch after this list)
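Since the original snippets are missing, the following is only a sketch of the usual patterns, reusing the t1 table; the t1_src staging table, its dt column and the partition values are assumptions:
--static partitioning, method 1: LOAD DATA with an explicit partition value
load data local inpath '/root/test/myfile' into table t1 partition (pt_d='201702');
--static partitioning, method 2: INSERT ... SELECT with an explicit partition value
insert overwrite table t1 partition (pt_d='201703')
select id, name, hobby, `add` from t1_src where dt='201703';
--dynamic partitioning: the partition value is taken from the last select column
insert overwrite table t1 partition (pt_d)
select id, name, hobby, `add`, dt from t1_src;
--dynamic partition configuration parameters
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000;
set hive.exec.max.dynamic.partitions.pernode=100;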

Creating a table from a query result
create table test as select c1,c2 from src;
--or create an empty copy with like
hive> create table test2 like test;
Note that with this kind of create-table-as-select, the query result is first written to a staging directory and then moved into the target table's location, as the log shows:
exec.FileSinkOperator: Moving tmp dir: hdfs://localhost:9000/user/hive/warehouse/src/.hive-staging_hive_2019-12-10_16-11-45_214_2063272320733291437-1/_tmp.-ext-10002 to: hdfs://localhost:9000/user/hive/warehouse/src/.hive-staging_hive_2019-12-10_16-11-45_214_2063272320733291437-1/-ext-10002
Exporting data
--method 1: insert overwrite to a local directory
hive> from test t
> insert overwrite local directory '/root/test/'
> select * ;
hive> ! ls /root/test;
000000_0
hive> ! cat /root/test/000000_0;
20191212
20180112
20190212
20190312
20190712
--method 2: the same insert overwrite without the local keyword writes to an HDFS directory instead
--(the original note repeated method 1 here; this variant is an assumption)
hive> from test t
    > insert overwrite directory '/root/test/'
    > select * ;
--method 3: copy the files in HDFS directly
# copy to the local filesystem
hadoop fs -get 'hdfs://localhost:9000/user/hive/warehouse/test/data' .
# copy to another directory in HDFS
hadoop fs -cp 'hdfs://localhost:9000/user/hive/warehouse/test/data' '/test/'
Querying data
Querying with the regular-expression operator rlike
--Hive regular expressions use Java regex syntax
select * from src a where a.s rlike '^a.*';
The difference between order by and sort by
--order by works like Oracle's order by: a global sort, which is expensive
select * from test a order by a.id1;
--sort by sorts the output of each reducer, but when several reducers' outputs are combined the overall order is not guaranteed
hive> select * from test a sort by a.id1;
--distribute by is used with multiple sort columns to send rows with the same value of a column to the same reducer
hive> select * from test a distribute by a.id1 sort by a.id1,a.id2;
--cluster by is equivalent to distribute by plus sort by on the same column
hive> select * from test a cluster by a.id1;
cast for explicit type conversion
hive> select cast(a.id1 as float) from test a ;
Sampling data into buckets
--randomly split the test table into 2 buckets and take one of them
hive> select * from test tablesample(bucket 1 out of 2 on rand()) a;
--randomly split into 3 buckets and take the second
hive> select * from test tablesample(bucket 2 out of 3 on rand());
--bucket on a column's values instead of randomly
hive> select * from test tablesample(bucket 2 out of 3 on id1);
--note that the buckets are not split evenly, so the number of rows per bucket can differ
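For repeatable and efficient sampling the table is normally bucketed when it is created; a sketch (the test_bucketed table and the bucket count are examples):
create table test_bucketed (id1 int, id2 int, id3 int)
clustered by (id1) into 4 buckets;
insert overwrite table test_bucketed select * from test;
--sampling a bucketed table on its bucketing column lets Hive read only the chosen bucket
select * from test_bucketed tablesample(bucket 1 out of 4 on id1);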
Views
Differences
- The difference between logical and materialized views
Hive does not support materialized views. A view is just its defining statement combined with the outer query when Hive builds the query plan, so it is purely a logical view whose purpose is to simplify queries.
- No column-level authorization
In Oracle you can expose selected columns through a view to control access: a user without SELECT on the ta_staff table can still be granted the view and see only certain columns. Hive does not support column-level authorization this way, because the user must have query (file access) permission on the underlying table to read the view. A Hive view can, however, restrict rows through a where clause.
hive> select * from test;
OK
1 2 3
88 77 66
33 22 11
1 2 3
88 77 66
33 22 11
Time taken: 0.123 seconds, Fetched: 6 row(s)
For questions about Hive users, roles, and groups see csdn1.
Indexes
- Creating an index
hive> create index test1_index on table test(id1)
    > as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
    > with deferred rebuild
    > in table test_index_table;
--bitmap indexes suit columns with few distinct values
create index index_test_2 on table test(id2)
as 'BITMAP'
with deferred rebuild
in table test_index2_table;
- Dropping an index
hive> drop index test1_index on test;
- Showing indexes
hive> show formatted index on test;
- Rebuilding an index
alter index test1_index on test rebuild;
- Custom index handlers
Implement Hive's index handler interface, package it and add the jar, then pass the class name with as when creating the index; see cwiki2 for details.
Tuning
explain
explain shows how a query is translated into MapReduce stages
explain select sum(id1) from test;
OK
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1
STAGE PLANS:
Stage: Stage-1
Map Reduce
Map Operator Tree:
TableScan
alias: test
Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: id1 (type: int)
outputColumnNames: id1
Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: sum(id1)
mode: hash
outputColumnNames: _col0
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
value expressions: _col0 (type: bigint)
Reduce Operator Tree:
Group By Operator
aggregations: sum(VALUE._col0)
mode: mergepartial
outputColumnNames: _col0
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
ListSink
Time taken: 0.31 seconds, Fetched: 44 row(s)
You can also use explain extended to get more detailed information
explain extended select sum(id1) from test;
OK
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1
STAGE PLANS:
Stage: Stage-1
Map Reduce
Map Operator Tree:
TableScan
alias: test
Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Select Operator
expressions: id1 (type: int)
outputColumnNames: id1
Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: sum(id1)
mode: hash
outputColumnNames: _col0
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
null sort order:
sort order:
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
tag: -1
value expressions: _col0 (type: bigint)
auto parallelism: false
Path -> Alias:
hdfs://localhost:9000/user/hive/warehouse/test [test]
Path -> Partition:
hdfs://localhost:9000/user/hive/warehouse/test
Partition
base file name: test
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
bucket_count -1
column.name.delimiter ,
columns id1,id2,id3
columns.comments
columns.types int:int:int
file.inputformat org.apache.hadoop.mapred.TextInputFormat
file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
location hdfs://localhost:9000/user/hive/warehouse/test
name default.test
numFiles 4
numRows 6
rawDataSize 42
serialization.ddl struct test { i32 id1, i32 id2, i32 id3}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 48
transient_lastDdlTime 1576480718
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
bucket_count -1
column.name.delimiter ,
columns id1,id2,id3
columns.comments
columns.types int:int:int
file.inputformat org.apache.hadoop.mapred.TextInputFormat
file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
location hdfs://localhost:9000/user/hive/warehouse/test
name default.test
numFiles 4
numRows 6
rawDataSize 42
serialization.ddl struct test { i32 id1, i32 id2, i32 id3}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 48
transient_lastDdlTime 1576480718
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.test
name: default.test
Truncated Path -> Alias:
/test [test]
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
aggregations: sum(VALUE._col0)
mode: mergepartial
outputColumnNames: _col0
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
GlobalTableId: 0
directory: hdfs://localhost:9000/tmp/hive/root/8d35f95d-893e-40ee-b831-6177341c7acb/hive_2019-12-17_11-12-25_359_70780381862038030-1/-mr-10001/.hive-staging_hive_2019-12-17_11-12-25_359_70780381862038030-1/-ext-10002
NumFilesPerFileSink: 1
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
Stats Publishing Key Prefix: hdfs://localhost:9000/tmp/hive/root/8d35f95d-893e-40ee-b831-6177341c7acb/hive_2019-12-17_11-12-25_359_70780381862038030-1/-mr-10001/.hive-staging_hive_2019-12-17_11-12-25_359_70780381862038030-1/-ext-10002/
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
properties:
columns _col0
columns.types bigint
escape.delim \
hive.serialization.extend.additional.nesting.levels true
serialization.escape.crlf true
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
TotalFiles: 1
GatherStats: false
MultiFileSpray: false
Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
ListSink
Time taken: 0.324 seconds, Fetched: 119 row(s)
Join optimization
Put the large table on the right side of the join and the small table on the left
Reason 1: the small table can be cached in memory, and each record of the large table is matched against it
The actual reason: for every duplicate join key in the table written on the left side of the join, the engine does one extra unit of work
See csdn3 for details
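In practice the small-table case is usually turned into a map-side join; a sketch, assuming a small dimension table called dim_small:
--let Hive convert the join automatically when one side is small enough
set hive.auto.convert.join=true;
set hive.mapjoin.smalltable.filesize=25000000;
--or force it with the older hint syntax
select /*+ MAPJOIN(d) */ t.id1, d.id
from test t join dim_small d on t.id1 = d.id;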
Local mode
Only worthwhile for small data sets; of little practical significance
Parallel execution
Stages of a job that are independent of each other can run in parallel, improving throughput
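A sketch of the two properties involved (the thread count is an example):
--run independent stages of one query in parallel
set hive.exec.parallel=true;
--how many stages may run at the same time
set hive.exec.parallel.thread.number=8;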
Strict mode
In strict mode:
- queries on a partitioned table must filter on a partition in the where clause
- order by must be combined with limit
- Cartesian products are disallowed
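These checks hang off the following settings (a sketch; the finer-grained hive.strict.checks.* names vary between 2.x releases):
set hive.mapred.mode=strict;
set hive.strict.checks.cartesian.product=true;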
Adjusting the number of mappers and reducers
Tune according to the number and size of the input and output files
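A sketch of the usual knobs (the values are examples):
--bytes of input per reducer; Hive derives the reducer count from this
set hive.exec.reducers.bytes.per.reducer=256000000;
--upper bound on the number of reducers
set hive.exec.reducers.max=1009;
--or pin the reducer count explicitly
set mapreduce.job.reduces=4;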
JVM reuse
Avoids the cost of starting up and tearing down a JVM for every new task; the drawback is that the longest-running task keeps its slot occupied for a long time, which can block other tasks
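A sketch; the task count is an example (the old MRv1 name was mapred.job.reuse.jvm.num.tasks):
--number of tasks a single JVM may run before it is torn down; -1 means unlimited
set mapreduce.job.jvm.numtasks=10;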
Indexes
As in Oracle, filtering on an indexed column is much faster, but maintaining the index costs time and requires rebuilds
Partitions
The improvement is significant, but too many partitions means too many files for the NameNode, whose memory usage explodes
Speculative execution
In a distributed cluster, program bugs (including bugs in Hadoop itself), load imbalance, or uneven resource distribution can make the tasks of one job run at very different speeds; some tasks may be noticeably slower than the rest (for example, one task is only 50% done while all the others have finished), dragging down the overall progress of the job. To avoid this, Hadoop uses speculative execution: it detects such straggler tasks according to certain rules, launches a backup task that processes the same data as the original, and takes the result of whichever copy finishes first.
If very large inputs force long-running map or reduce tasks, the waste caused by speculative execution can be huge.
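The switches involved, as a sketch:
set mapreduce.map.speculative=true;
set mapreduce.reduce.speculative=true;
--Hive's own switch for speculative reduce tasks
set hive.mapred.reduce.tasks.speculative.execution=true;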
Multiple group bys in a single MR job
A setting controls whether several group by operations in one query can be assembled into a single MapReduce job
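Presumably the setting meant here is the following (a sketch):
set hive.multigroupby.singlereducer=true;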
Virtual columns
Used for diagnosing results; enabled through configuration parameters
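A sketch using the test table from this note; ROW__OFFSET__INSIDE__BLOCK is the one that needs the extra parameter, the other two are always available:
set hive.exec.rowoffset=true;
select INPUT__FILE__NAME, BLOCK__OFFSET__INSIDE__FILE, ROW__OFFSET__INSIDE__BLOCK, id1
from test;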
Compression
Common compression formats
Compression saves disk space and speeds up file transfer, at the cost of CPU for compressing and decompressing
- For large volumes of data that are rarely processed, gzip is the usual choice (best compression ratio, slowest compression/decompression)
- For smaller data that is processed frequently, lzo or snappy is the usual choice
Configuration
- Enabling Hadoop compression codecs
vi /opt/install/hadoop/hadoop-2.7.1/etc/hadoop/core-site.xml and add:
<property>
  <name>io.compression.codecs</name>
  <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
--after adding this, Hadoop does not need a restart for Hive to show the available codecs
hive> set io.compression.codecs;
io.compression.codecs=org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec
- Enabling intermediate compression

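The missing screenshot presumably showed settings along these lines (the codec choice is an example):
set hive.exec.compress.intermediate=true;
set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;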
- Enabling output compression

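Presumably along these lines (consistent with the gzip test further down):
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;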
- Using sequence files
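A sketch; the test_seq table name is an example:
--BLOCK compression usually gives the best ratio for SequenceFiles
set mapred.output.compression.type=BLOCK;
--store the result as a SequenceFile so the compressed output stays splittable
create table test_seq
stored as sequencefile
as select * from test;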

Tests
- Using intermediate compression
hive> set hive.exec.compress.intermediate=true;
hive> create table interemediate_com_om row format delimited fields terminated by '\t' as select * from test;
Automatically selecting local only mode for query
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Query ID = root_20191218101653_df7baa3b-9570-40a7-b480-be91907bfb1e
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Job running in-process (local Hadoop)
2019-12-18 10:17:03,294 Stage-1 map = 0%, reduce = 0%
Ended Job = job_local1362494287_0001
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to directory hdfs://localhost:9000/user/hive/warehouse/.hive-staging_hive_2019-12-18_10-16-53_073_1921170228225164888-1/-ext-10002
Moving data to directory hdfs://localhost:9000/user/hive/warehouse/interemediate_com_om
MapReduce Jobs Launched:
Stage-Stage-1: HDFS Read: 48 HDFS Write: 132 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
Time taken: 11.773 seconds
hive> dfs -ls hdfs://localhost:9000/user/hive/warehouse/interemediate_com_om;
Found 1 items
-rwxr-xr-x 1 root supergroup 48 2019-12-18 10:17 hdfs://localhost:9000/user/hive/warehouse/interemediate_com_om/000000_0
hive> dfs -cat hdfs://localhost:9000/user/hive/warehouse/interemediate_com_om/*;
1 2 3
88 77 66
33 22 11
1 2 3
88 77 66
33 22 11
--the final result is still in text format
- Compressing the output with gzip
Configure in Hadoop:
<property>
  <name>mapred.output.compress</name>
  <value>true</value>
</property>
<property>
  <name>mapred.compress.map.output</name>
  <value>true</value>
</property>
<property>
  <name>mapred.output.compression.codec</name>
  <value>org.apache.hadoop.io.compress.GzipCodec</value>
</property>
Then in Hive:
--check the compression codec
hive> set mapred.output.compression.codec;
mapred.output.compression.codec=org.apache.hadoop
