hive练习题
0. 连接hive及准备工作
${HIVE_HOME}/bin/hive启动hive客户端
或使用
$HIVE_HOME/bin/beeline -u "jdbc:hive2://cdh129135:10000/;principal=hive/cdh129135@MYCDH"
-- Create a scratch database for the exercises.
-- IF NOT EXISTS makes the script safe to re-run without a manual cleanup.
create database if not exists public_train;
use public_train;
show tables;
-- 准备工作, 创建数据文件,这里字段分割符是Tab键,对应着表定义中的'\t'
vi stu.txt
1 xiapi
2 xiaoxue
3 qingqin
--上传文件到hdfs
hdfs dfs -mkdir -p /hive/stu
hdfs dfs -put -f stu.txt hdfs://192.168.129.130:8020/hive/stu/stu.txt
下面使用${HIVE_HOME}/bin/hive启动hive客户端进行演示:
1. 创建表并将数据文件导入表中
-- A Hive table maps to data files under the HDFS warehouse directory, so
-- the field and line delimiters used inside those files must be declared.
use public_train;
-- IF NOT EXISTS makes the DDL idempotent (no manual DROP needed on re-run).
create table if not exists test (
id int,
name string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
-- Show the table's columns.
desc test;
-- Show the generated DDL.
show create table test;
show tables;
-- 查看表文件
$ hdfs dfs -ls hdfs://192.168.129.130:8020/user/hive/warehouse/public_train.db/test
-- LOAD DATA syntax:
--   LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE]
--   INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
-- With LOCAL, the file is read from the client filesystem and COPIED in.
LOAD DATA LOCAL INPATH '/home/public_train/stu.txt' INTO TABLE test;
SELECT * FROM test;
-- Without LOCAL, the HDFS file is MOVED into the Hive warehouse directory.
LOAD DATA INPATH 'hdfs://192.168.129.130:8020/hive/stu/stu.txt' INTO TABLE test;
SELECT * FROM test;
-- 查看表文件,增加了1个文件
$ hdfs dfs -ls hdfs://192.168.129.130:8020/user/hive/warehouse/public_train.db/test
-- hdfs文件已经不见了(被移动到hive数据仓库了)
$ hdfs dfs -ls hdfs://192.168.129.130:8020/hive/stu/stu.txt
-- OVERWRITE load: replaces the table's existing data instead of appending.
-- NOTE(review): the earlier non-LOCAL LOAD already MOVED
-- /hive/stu/stu.txt into the warehouse, so this source file no longer
-- exists on HDFS; re-upload it first
-- (hdfs dfs -put -f stu.txt /hive/stu/stu.txt) or this statement fails.
load data inpath 'hdfs://192.168.129.130:8020/hive/stu/stu.txt' OVERWRITE INTO TABLE test;
select * from test;
-- 查看表文件,只剩下一个文件
$ hdfs dfs -ls hdfs://192.168.129.130:8020/user/hive/warehouse/public_train.db/test
2. 创建内部表和外部表external,分别load和drop后查看数据文件观察区别。
-- For the managed-vs-external comparison, the `test` table created above
-- serves as the managed (internal) table.
-- IF EXISTS avoids a spurious error on the first run.
drop table if exists stu;
-- EXTERNAL table: Hive tracks only the metadata; DROP TABLE leaves the
-- files under LOCATION untouched (a managed table's files would be deleted).
create external table stu (
id int,
name string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
location 'hdfs://192.168.129.130:8020/hive/stu' ;
-- Show the generated DDL (note the EXTERNAL keyword and LOCATION).
show create table stu;
select * from stu;
3. 基于现有表,创建新表
-- CTAS: the new table's schema is derived from the SELECT list and the
-- data is copied. An explicit column list (instead of SELECT *) keeps the
-- copy stable if columns are later added to `test`.
create table test_copy as select id, name from test;
4. 创建分区表,PARTITIONED by,导入分区数据
-- PARTITIONED BY declares the partition column (dt). Its value is stored
-- as a sub-directory name, not inside the data files.
create table if not exists logs(ts INT,line STRING)
PARTITIONED by(dt STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
-- Load local data into one specific (static) partition.
load data local inpath '/home/public_train/part.txt' into table logs PARTITION (dt='2023-01-01');
-- List the table's partitions.
show partitions logs;
-- Inspect the table directory: each sub-directory is one partition.
-- (hive-CLI `dfs` commands must be terminated with a semicolon.)
dfs -ls /user/hive/warehouse/public_train.db/logs;
select * from logs where dt='2023-01-01';
-- Dynamic partitioning: Hive derives the partition value from the last
-- column of each inserted row instead of a hard-coded PARTITION spec.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into logs partition (dt) values (8,'auto','2024-01-01');
dfs -ls /user/hive/warehouse/public_train.db/logs;
show partitions logs;
5. 建分桶表
-- Force bucketed writes and match the reducer count to the bucket count.
-- (The original note said "force partitioning" -- these settings control
-- bucketing, not partitioning.)
-- NOTE(review): hive.enforce.bucketing is always on from Hive 2.x;
-- setting it matters only on Hive 1.x -- confirm the target version.
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;
create table if not exists bucketed_users (id INT,name STRING)
CLUSTERED BY (id) into 4 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
-- ts/line from logs feed id/name; rows are hashed by id into 4 bucket files.
insert into table bucketed_users select ts, line from logs;
dfs -ls /user/hive/warehouse/public_train.db/bucketed_users;
表目录下有多个表文件:
-- Partitioned AND bucketed table: dt selects the partition sub-directory,
-- then rows within each partition are hashed by ts into 4 bucket files.
create table bucketed_logs(ts INT,line STRING)
PARTITIONED by(dt STRING)
CLUSTERED BY (ts) into 4 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
-- Enforce bucketed writes and enable dynamic partitioning for the insert.
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- The last selected column (dt) drives the dynamic partition value.
-- NOTE(review): some Hive versions require an explicit `partition (dt)`
-- clause on this INSERT -- confirm on the target version.
insert into table bucketed_logs select * from logs;
6. 导出数据到HDFS
-- Export query results to an HDFS directory with INSERT OVERWRITE DIRECTORY.
-- Without ROW FORMAT, Hive writes fields separated by the default \001 (^A)
-- control character; declare a tab delimiter for human-readable output.
insert overwrite directory '/hive/output'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select id, name from stu;
-- List the exported files.
dfs -ls /hive/output;