Hive Shell Commands
I. DDL Operations
Database Operations
Enter the Hive client: hive
Create a database: create database dbname;
Create a database only if it does not already exist: create database if not exists mydb;
Create a database with a comment: create database if not exists dbname comment 'create my db named dbname';
Create a database with properties: create database if not exists dbname with dbproperties ('a'='aaa','b'='bbb');
List databases: show databases; show databases like 'my*';
Show detailed database properties: desc database [extended] dbname;
Show which database is currently in use: select current_database();
Show the full CREATE statement of a database: show create database mydb;
Drop a database: drop database dbname; drop database if exists dbname;
Drop a database that still contains tables: drop database if exists dbname cascade;
Switch to another database: use mydb;
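Putting a few of these together, a minimal sketch of a database lifecycle (the database name mydb and the property owner=hadoop are only placeholders):
create database if not exists mydb comment 'demo database' with dbproperties ('owner'='hadoop');
use mydb;
select current_database();            -- should print mydb
desc database extended mydb;          -- shows the comment, HDFS location and dbproperties
drop database if exists mydb cascade; -- cascade also removes any tables inside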
Table Operations
View table structure: desc table_name; desc extended table_name; desc formatted table_name;
List tables: show tables; show tables in db_name;
Copy a table definition with the like keyword: create table new_table like mytable; create external table if not exists new_table like mytable;
List Hive functions: show functions;
Create a managed (internal) table:
create table if not exists my_user(id string, name string);
create table my_user(id string, name string) row format delimited fields terminated by ',';
create table mytable (id int, name string) row format delimited fields terminated by ',' stored as textfile;
Create an external table:
create external table mytable2 (id int, name string) row format delimited fields terminated by ',' location '/user/hive/warehouse/mytable2';
Create a partitioned table:
create table mytable3(id int, name string) partitioned by(sex string) row format delimited fields terminated by ',' stored as textfile;
Create a bucketed table:
create table stu_buck(Sno int, Sname string, Sex string, Sage int, Sdept string) clustered by(Sno) sorted by(Sno DESC) into 4 buckets row format delimited fields terminated by ',';
Modify a table:
Rename a table: ALTER TABLE table_name RENAME TO new_table_name;
Modify table properties: ALTER TABLE table_name SET TBLPROPERTIES ('comment' = 'my new students table');
Change the field delimiter: ALTER TABLE student SET SERDEPROPERTIES ('field.delim' = '-');
Add / change / replace columns:
ALTER TABLE name ADD COLUMNS (col_spec[, col_spec ...])
ALTER TABLE name CHANGE c_name new_name new_type [FIRST|AFTER c_name]
ALTER TABLE name REPLACE COLUMNS (col_spec[, col_spec ...])
List table partitions: show partitions mytable3; show partitions table_name partition(city='beijing');
Drop partitions:
ALTER TABLE student_p DROP IF EXISTS partition(part='aa');
ALTER TABLE student_p DROP IF EXISTS partition(part='aa'), partition(part='bb');
Add partitions:
ALTER TABLE student_p ADD partition(part='a') partition(part='b');
ALTER TABLE student_p ADD IF NOT EXISTS partition(part='bb') location '/myhive_bb' partition(part='cc') location '/myhive_cc';
Change a partition's location: ALTER TABLE student_p partition(part='bb') SET location '/myhive_bbbbb';
Prevent a partition from being dropped: alter table student_p partition(part='aa') enable no_drop;
Prevent a partition from being queried: alter table student_p partition(part='aa') enable offline;
Drop a table: drop table if exists mytable;
Truncate a table: truncate table student; truncate table student_ptn partition(city='beijing');
View the history of commands executed in Hive: cat /home/hadoop/.hivehistory
Show column headers in query output: set hive.cli.print.header=true;
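A short walk-through combining a few of these table commands (table and partition names reuse the examples above; this is only a sketch):
create table if not exists mytable3(id int, name string)
  partitioned by (sex string)
  row format delimited fields terminated by ',' stored as textfile;
alter table mytable3 add if not exists partition(sex='boy') partition(sex='girl');
show partitions mytable3;              -- lists sex=boy and sex=girl
desc formatted mytable3;               -- columns, partition keys, location, table properties
alter table mytable3 rename to mytable3_bak;
drop table if exists mytable3_bak;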
II. DML Operations
Load: Loading Data
Load syntax:
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION(partcol1=val1, partcol2=val2 ...)]
Insert rows directly: insert into table my_user values ('1','huangbo'), ('2','xuzheng'), ('3','wangbaoqiang');
Load data into the 'boy' partition: load data local inpath '/root/hivedata/mingxing.txt' overwrite into table mytable3 partition(sex='boy');
Query the data: select * from my_user;
Import data:
a) Import from HDFS: load data inpath '/user.txt' into table my_user;
b) Import from the local file system: load data local inpath '/home/hadoop/user.txt' into table my_user;
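For reference, a hypothetical /home/hadoop/user.txt for the load commands above; its field separator must match the table's row format (',' for my_user):
1,huangbo
2,xuzheng
3,wangbaoqiang
After load data local inpath '/home/hadoop/user.txt' into table my_user; runs, select * from my_user; returns these three rows. Note that the local variant copies the file, while loading from an HDFS path moves the file into the table's directory.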
Insert: Inserting Data
Insert a single row: INSERT INTO TABLE table_name VALUES(XX,YY,ZZ);
Load the result of a query into a table:
INSERT {OVERWRITE | INTO} TABLE table_name [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement
Multi-insert (scan the source table once, write several targets):
from mingxing
insert into table mingxing2 select id, name, sex, age
insert into table mingxing select id, name, sex, age, department;
from student
insert into table ptn_student partition(city='MA') select id, name, sex, age, department where department='MA'
insert into table ptn_student partition(city='IS') select id, name, sex, age, department where department='IS'
insert into table ptn_student partition(city='CS') select id, name, sex, age, department where department='CS';
Dynamic partition insert:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
Single dynamic partition column:
insert into table test2 partition (age) select name, address, school, age from students;
Multiple partition columns (one static, one dynamic):
insert into table student_ptn2 partition(city='sa', zipcode) select id, name, sex, age, department, department as zipcode from studentss;
Note: the dynamic partition values produced by the select (age, zipcode) must be the last columns selected and must line up with the partition columns, otherwise the results will be wrong (see the sketch after this list).
CTAS (create table ... as select ...):
CREATE TABLE mytest AS SELECT name, age FROM test;
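A worked sketch of the dynamic partition insert above; the column types of test2/students are assumptions made only for this example:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- target table: the partition column age is declared in partitioned by, not in the column list
create table if not exists test2(name string, address string, school string) partitioned by (age int);
-- age is selected last so it maps onto partition (age)
insert into table test2 partition (age)
select name, address, school, age from students;
show partitions test2;   -- one partition per distinct age value found in students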
Insert: Exporting Data
Export data to the local file system:
insert overwrite local directory '/home/hadoop/student.txt' select * from studentss;
Export data to HDFS:
insert overwrite directory '/student' select * from studentss where age >= 20;
insert overwrite directory 'hdfs://hadoop02:9000/user/hive/warehouse/mystudent' select * from studentss;
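By default the exported files use Hive's default field delimiter (Ctrl-A); newer Hive releases also accept a row format clause on the export. A sketch (the output path is a placeholder):
insert overwrite local directory '/home/hadoop/student_csv'
row format delimited fields terminated by ','
select * from studentss;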
Select: Querying Data
Query syntax:
SELECT [ALL | DISTINCT] select_condition, select_condition, ...
FROM table_name a
[JOIN table_other b ON a.id = b.id]
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY col_list | ORDER BY col_list]]
[LIMIT number]
order by (column): global (total) ordering
sort by (column): ordering within each reducer
distribute by (column): decides which reducer (bucket) each row is sent to
cluster by (column): distribute by + sort by on the same column
So, when the bucketing column and the sort column are the same, cluster by = distribute by + sort by; if the bucketing column and the sort column differ, cluster by cannot be used (see the sketch at the end of this section).
Get the three oldest students:
select id, age, name from student where stat_date = '20140101' order by age desc limit 3;
Sort students by age in descending order:
set mapred.reduce.tasks=4;
select id, age, name from student sort by age desc;
select id, age, name from student order by age desc;
select id, age, name from student distribute by age;
Combined bucketing and sorting: bucket by id, then sort by age and id in descending order:
insert overwrite directory '/root/outputdata6' select * from mingxing2 distribute by id sort by age desc, id desc;
Bucket by id without sorting:
insert overwrite directory '/root/outputdata4' select * from mingxing2 distribute by id;
Bucket by id and sort by id:
insert overwrite directory '/root/outputdata3' select * from mingxing2 cluster by id;
Bucketed queries:
Enable bucketing: set hive.enforce.bucketing = true; // needed as an explicit switch in older Hive versions
Set the number of reduce tasks, i.e. the number of buckets: set mapreduce.job.reduces=4;
insert overwrite directory '/root/outputdata3' select * from mingxing2 cluster by id;
Cases where an HQL statement is not converted into a MapReduce job:
1. select * from student; // a plain read of the table's files
2. when the where clause filters only on partition columns
3. set hive.exec.mode.local.auto=true; // Hive will try to execute in local mode
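A sketch of the cluster by equivalence mentioned above, reusing the mingxing2 table from the examples:
select * from mingxing2 cluster by id;
-- is equivalent to
select * from mingxing2 distribute by id sort by id;
-- the two-clause form is needed when the sort column or direction differs from the bucketing column, e.g.
select * from mingxing2 distribute by id sort by age desc;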
Hive Join Queries
1) Only equi-joins are supported; join conditions may be combined with and, but not with or.
2) More than two tables can be joined.
3) The join order determines the logic of each map/reduce task; in practice, put the largest table last in the join (otherwise a lot of memory is wasted caching the earlier tables).
4) Hive joins come in three flavours: inner join, outer join, and semi join.
inner join: returns only the rows that satisfy the join condition on both sides.
HQL: select * from tablea a inner join tableb b on a.id=b.id;
left join (equivalent to left outer join):
1. The left table drives the match (left is the "big" side, right the "small" side).
2. Columns that fail to match are filled with null.
3. Every row of the left table appears in the result.
HQL: select * from tablea a left join tableb b on a.id=b.id;
right join (equivalent to right outer join):
1. The right table drives the match (right is the "big" side, left the "small" side).
2. Columns that fail to match are filled with null.
3. Every row of the right table appears in the result.
HQL: select * from tablea a right join tableb b on a.id=b.id;
left semi join: because Hive does not support in/exists subqueries (Hive 1.2.1 added support for in), left semi join is used instead and is an efficient implementation of in/exists.
HQL: select * from tablea a left semi join tableb b on a.id=b.id;
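A sketch of the in/exists rewrite mentioned above, using the placeholder tables tablea/tableb:
-- intent: rows of tablea whose id also appears in tableb
-- select * from tablea a where a.id in (select id from tableb);   -- the subquery form, not available in older Hive
select a.* from tablea a left semi join tableb b on a.id = b.id;
-- note: in a left semi join only columns of the left table may appear in the select list,
-- and each matching left row is returned at most once (unlike an inner join on duplicate keys)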
HBase Shell Commands
1. Table Operations
List the tables in HBase: list
Create a user table with two column families, info and data:
create 'user', 'info', 'data'
Or, keeping up to three versions of each cell in the info family:
create 'user', {NAME => 'info', VERSIONS => '3'}
Insert data into the user table: row key rk0001, column qualifier name in the info column family, value zhangsan
put 'user', 'rk0001', 'info:name', 'zhangsan'
put 'user', 'rk0001', 'info:gender', 'female'
put 'user', 'rk0001', 'info:age', '20'
put 'user', 'rk0001', 'data:pic', 'picture'
Get data for row key rk0001 in the user table (the whole row, a single column family, or specific columns)
get 'user', 'rk0001'
get 'user', 'rk0001', 'info'
get 'user', 'rk0001', 'info:name', 'info:age'
Get the most recent versions (optionally within a time range) for row key rk0001, column family info, in the user table
get 'user', 'rk0001', {COLUMN => 'info', VERSIONS => 2}
get 'user', 'rk0001', {COLUMN => 'info:name', VERSIONS => 5}
get 'user', 'rk0001', {COLUMN => 'info:name', VERSIONS => 5, TIMERANGE => [1392368783980, 1392380169184]}
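A sketch of how multiple versions behave, assuming the user table keeps several versions of the info family (VERSIONS => '3' as in the create example above):
put 'user', 'rk0001', 'info:name', 'zhangsan'
put 'user', 'rk0001', 'info:name', 'lisi'
get 'user', 'rk0001', {COLUMN => 'info:name', VERSIONS => 3}
# returns 'lisi' (newest) and 'zhangsan', each with its own timestamp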
Get cells in the user table with row key rk0001 whose value is zhangsan
get 'user', 'rk0001', {FILTER => "ValueFilter(=, 'binary:zhangsan')"}
Get cells for row key rk0001 in the user table whose column qualifier contains the character a
get 'user', 'rk0001', {FILTER => "(QualifierFilter(=,'substring:a'))"}
2. Scan Operations
Scan all data in the user table
scan 'user'
Scan the info column family in the user table
scan 'user', {COLUMNS => 'info'}
scan 'user', {COLUMNS => 'info', RAW => true, VERSIONS => 5}
A scan can enable Raw mode; with Raw mode on, the results also include cells that carry delete markers but have not yet been physically removed.
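A sketch of Raw mode: after a delete, the tombstone and the deleted cells remain visible to a raw scan until a major compaction removes them.
delete 'user', 'rk0001', 'info:name'
scan 'user'                              # info:name is no longer returned
scan 'user', {RAW => true, VERSIONS => 5}
# the delete marker and the old info:name cells still show up here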
Scan the info and data column families in the user table for columns whose qualifier contains the character a
scan 'user', {COLUMNS => ['info', 'data'], FILTER => "(QualifierFilter(=,'substring:a'))"}
Scan the info column family in the user table over the row key range [rk0001, rk0003)
scan 'user', {COLUMNS => 'info', STARTROW => 'rk0001', ENDROW => 'rk0003'}
Scan rows in the user table whose row key starts with rk
scan 'user',{FILTER=>"PrefixFilter('rk')"}
Scan the user table within the given timestamp range
scan 'user', {TIMERANGE => [1392368783980, 1392380169184]}
3. Deleting Data
Delete the data with row key rk0001 and column qualifier info:name from the user table
delete 'user', 'rk0001', 'info:name'
Delete the data with row key rk0001, column qualifier info:name and timestamp 1392383705316 from the user table
delete 'user', 'rk0001', 'info:name', 1392383705316
Empty (truncate) the user table
truncate 'user'
4. Modifying Table Structure
Disable the user table
disable 'user'
Add two column families, f1 and f2
alter 'user', NAME => 'f1'
alter 'user', NAME => 'f2'
Re-enable the table
enable 'user'
Delete a column family:
alter 'user', NAME => 'f1', METHOD => 'delete'
or: alter 'user', 'delete' => 'f1'
Add column family f1 while deleting column family f2
alter 'user', {NAME => 'f1'}, {NAME => 'f2', METHOD => 'delete'}
Change the number of versions kept by the f1 column family of the user table to 5
alter 'user', NAME => 'f1', VERSIONS => 5
Delete a record
delete 'user', 'rk0001', 'info:name'