Hive Shell Commands
I. DDL Operations
Database Operations
Enter the Hive client: hive
Create a database: create database dbname;
Create a database only if it does not already exist: create database if not exists mydb;
Create a database with a comment: create database if not exists dbname comment 'create my db named dbname';
Create a database with properties: create database if not exists dbname with dbproperties ('a'='aaa','b'='bbb');
List databases: show databases; show databases like 'my*';
Show detailed database properties: desc database [extended] dbname;
Show which database is currently in use: select current_database();
Show the full CREATE statement of a database: show create database mydb;
Drop a database: drop database dbname; drop database if exists dbname;
Drop a database that still contains tables: drop database if exists dbname cascade;
Switch to another database: use mydb;
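Putting a few of these together, a minimal sketch of a database lifecycle (the database name mydb and the property owner=hadoop are only placeholders):
create database if not exists mydb comment 'demo database' with dbproperties ('owner'='hadoop');
use mydb;
select current_database();            -- should print mydb
desc database extended mydb;          -- shows the comment, HDFS location and dbproperties
drop database if exists mydb cascade; -- cascade also removes any tables inside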
Table Operations
View table structure: desc table_name; desc extended table_name; desc formatted table_name;
List tables: show tables; show tables in db_name;
Copy a table definition with the like keyword: create table new_table like mytable; create external table if not exists new_table like mytable;
List Hive functions: show functions;
Create a managed (internal) table:
create table if not exists my_user(id string, name string);
create table my_user(id string, name string) row format delimited fields terminated by ',';
create table mytable (id int, name string) row format delimited fields terminated by ',' stored as textfile;
Create an external table:
create external table mytable2 (id int, name string) row format delimited fields terminated by ',' location '/user/hive/warehouse/mytable2';
Create a partitioned table:
create table mytable3(id int, name string) partitioned by(sex string) row format delimited fields terminated by ',' stored as textfile;
Create a bucketed table:
create table stu_buck(Sno int, Sname string, Sex string, Sage int, Sdept string) clustered by(Sno) sorted by(Sno DESC) into 4 buckets row format delimited fields terminated by ',';
Modify a table:
Rename a table: ALTER TABLE table_name RENAME TO new_table_name;
Modify table properties: ALTER TABLE table_name SET TBLPROPERTIES ('comment' = 'my new students table');
Change the field delimiter: ALTER TABLE student SET SERDEPROPERTIES ('field.delim' = '-');
Add / change / replace columns:
ALTER TABLE name ADD COLUMNS (col_spec[, col_spec ...])
ALTER TABLE name CHANGE c_name new_name new_type [FIRST|AFTER c_name]
ALTER TABLE name REPLACE COLUMNS (col_spec[, col_spec ...])
List table partitions: show partitions mytable3; show partitions table_name partition(city='beijing');
Drop partitions:
ALTER TABLE student_p DROP IF EXISTS partition(part='aa');
ALTER TABLE student_p DROP IF EXISTS partition(part='aa'), partition(part='bb');
Add partitions:
ALTER TABLE student_p ADD partition(part='a') partition(part='b');
ALTER TABLE student_p ADD IF NOT EXISTS partition(part='bb') location '/myhive_bb' partition(part='cc') location '/myhive_cc';
Change a partition's location: ALTER TABLE student_p partition(part='bb') SET location '/myhive_bbbbb';
Prevent a partition from being dropped: alter table student_p partition(part='aa') enable no_drop;
Prevent a partition from being queried: alter table student_p partition(part='aa') enable offline;
Drop a table: drop table if exists mytable;
Truncate a table: truncate table student; truncate table student_ptn partition(city='beijing');
View the history of commands executed in Hive: cat /home/hadoop/.hivehistory
Show column headers in query output: set hive.cli.print.header=true;
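A short walk-through combining a few of these table commands (table and partition names reuse the examples above; this is only a sketch):
create table if not exists mytable3(id int, name string)
  partitioned by (sex string)
  row format delimited fields terminated by ',' stored as textfile;
alter table mytable3 add if not exists partition(sex='boy') partition(sex='girl');
show partitions mytable3;              -- lists sex=boy and sex=girl
desc formatted mytable3;               -- columns, partition keys, location, table properties
alter table mytable3 rename to mytable3_bak;
drop table if exists mytable3_bak;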
II. DML Operations
Load: Loading Data
Load syntax:
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION(partcol1=val1, partcol2=val2 ...)]
Insert rows directly: insert into table my_user values ('1','huangbo'), ('2','xuzheng'), ('3','wangbaoqiang');
Load data into the 'boy' partition: load data local inpath '/root/hivedata/mingxing.txt' overwrite into table mytable3 partition(sex='boy');
Query the data: select * from my_user;
Import data:
a) Import from HDFS: load data inpath '/user.txt' into table my_user;
b) Import from the local file system: load data local inpath '/home/hadoop/user.txt' into table my_user;
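For reference, a hypothetical /home/hadoop/user.txt for the load commands above; its field separator must match the table's row format (',' for my_user):
1,huangbo
2,xuzheng
3,wangbaoqiang
After load data local inpath '/home/hadoop/user.txt' into table my_user; runs, select * from my_user; returns these three rows. Note that the local variant copies the file, while loading from an HDFS path moves the file into the table's directory.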
Insert: Inserting Data
Insert a single row: INSERT INTO TABLE table_name VALUES(XX,YY,ZZ);
Load the result of a query into a table:
INSERT {OVERWRITE | INTO} TABLE table_name [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement
Multi-insert (scan the source table once, write several targets):
from mingxing
insert into table mingxing2 select id, name, sex, age
insert into table mingxing select id, name, sex, age, department;
from student
insert into table ptn_student partition(city='MA') select id, name, sex, age, department where department='MA'
insert into table ptn_student partition(city='IS') select id, name, sex, age, department where department='IS'
insert into table ptn_student partition(city='CS') select id, name, sex, age, department where department='CS';
Dynamic partition insert:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
Single dynamic partition column:
insert into table test2 partition (age) select name, address, school, age from students;
Multiple partition columns (one static, one dynamic):
insert into table student_ptn2 partition(city='sa', zipcode) select id, name, sex, age, department, department as zipcode from studentss;
Note: the dynamic partition values produced by the select (age, zipcode) must be the last columns selected and must line up with the partition columns, otherwise the results will be wrong (see the sketch after this list).
CTAS (create table ... as select ...):
CREATE TABLE mytest AS SELECT name, age FROM test;
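A worked sketch of the dynamic partition insert above; the column types of test2/students are assumptions made only for this example:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- target table: the partition column age is declared in partitioned by, not in the column list
create table if not exists test2(name string, address string, school string) partitioned by (age int);
-- age is selected last so it maps onto partition (age)
insert into table test2 partition (age)
select name, address, school, age from students;
show partitions test2;   -- one partition per distinct age value found in students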
Insert: Exporting Data
Export data to the local file system:
insert overwrite local directory '/home/hadoop/student.txt' select * from studentss;
Export data to HDFS:
insert overwrite directory '/student' select * from studentss where age >= 20;
insert overwrite directory 'hdfs://hadoop02:9000/user/hive/warehouse/mystudent' select * from studentss;
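By default the exported files use Hive's default field delimiter (Ctrl-A); newer Hive releases also accept a row format clause on the export. A sketch (the output path is a placeholder):
insert overwrite local directory '/home/hadoop/student_csv'
row format delimited fields terminated by ','
select * from studentss;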
Select: Querying Data
Query syntax:
SELECT [ALL | DISTINCT] select_condition, select_condition, ...
FROM table_name a
[JOIN table_other b ON a.id = b.id]
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY col_list | ORDER BY col_list]]
[LIMIT number]
order by (column): global (total) ordering
sort by (column): ordering within each reducer
distribute by (column): decides which reducer (bucket) each row is sent to
cluster by (column): distribute by + sort by on the same column
So, when the bucketing column and the sort column are the same, cluster by = distribute by + sort by; if the bucketing column and the sort column differ, cluster by cannot be used (see the sketch at the end of this section).
Get the three oldest students:
select id, age, name from student where stat_date = '20140101' order by age desc limit 3;
Sort students by age in descending order:
set mapred.reduce.tasks=4;
select id, age, name from student sort by age desc;
select id, age, name from student order by age desc;
select id, age, name from student distribute by age;
Combined bucketing and sorting: bucket by id, then sort by age and id in descending order:
insert overwrite directory '/root/outputdata6' select * from mingxing2 distribute by id sort by age desc, id desc;
Bucket by id without sorting:
insert overwrite directory '/root/outputdata4' select * from mingxing2 distribute by id;
Bucket by id and sort by id:
insert overwrite directory '/root/outputdata3' select * from mingxing2 cluster by id;
Bucketed queries:
Enable bucketing: set hive.enforce.bucketing = true; // needed as an explicit switch in older Hive versions
Set the number of reduce tasks, i.e. the number of buckets: set mapreduce.job.reduces=4;
insert overwrite directory '/root/outputdata3' select * from mingxing2 cluster by id;
Cases where an HQL statement is not converted into a MapReduce job:
1. select * from student; // a plain read of the table's files
2. when the where clause filters only on partition columns
3. set hive.exec.mode.local.auto=true; // Hive will try to execute in local mode
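A sketch of the cluster by equivalence mentioned above, reusing the mingxing2 table from the examples:
select * from mingxing2 cluster by id;
-- is equivalent to
select * from mingxing2 distribute by id sort by id;
-- the two-clause form is needed when the sort column or direction differs from the bucketing column, e.g.
select * from mingxing2 distribute by id sort by age desc;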
Hive Join Queries
1) Only equi-joins are supported; join conditions may be combined with and, but not with or.
2) More than two tables can be joined.
3) The join order determines the logic of each map/reduce task; in practice, put the largest table last in the join (otherwise a lot of memory is wasted caching the earlier tables).
4) Hive joins come in three flavours: inner join, outer join, and semi join.
inner join: returns only the rows that satisfy the join condition on both sides.
HQL: select * from tablea a inner join tableb b on a.id=b.id;
left join (equivalent to left outer join):
1. The left table drives the match (left is the "big" side, right the "small" side).
2. Columns that fail to match are filled with null.
3. Every row of the left table appears in the result.
HQL: select * from tablea a left join tableb b on a.id=b.id;
right join (equivalent to right outer join):
1. The right table drives the match (right is the "big" side, left the "small" side).
2. Columns that fail to match are filled with null.
3. Every row of the right table appears in the result.
HQL: select * from tablea a right join tableb b on a.id=b.id;
left semi join: because Hive does not support in/exists subqueries (Hive 1.2.1 added support for in), left semi join is used instead and is an efficient implementation of in/exists.
HQL: select * from tablea a left semi join tableb b on a.id=b.id;
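A sketch of the in/exists rewrite mentioned above, using the placeholder tables tablea/tableb:
-- intent: rows of tablea whose id also appears in tableb
-- select * from tablea a where a.id in (select id from tableb);   -- the subquery form, not available in older Hive
select a.* from tablea a left semi join tableb b on a.id = b.id;
-- note: in a left semi join only columns of the left table may appear in the select list,
-- and each matching left row is returned at most once (unlike an inner join on duplicate keys)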
HBase Shell Commands
1. Table Operations
List the tables in HBase: list
Create a user table with two column families, info and data:
create 'user', 'info', 'data'
Or, keeping up to three versions of each cell in the info family:
create 'user', {NAME => 'info', VERSIONS => '3'}
Insert data into the user table: row key rk0001, column qualifier name in the info column family, value zhangsan
put 'user', 'rk0001', 'info:name', 'zhangsan'
put 'user', 'rk0001', 'info:gender', 'female'
put 'user', 'rk0001', 'info:age', '20'
put 'user', 'rk0001', 'data:pic', 'picture'
Get data for row key rk0001 in the user table (the whole row, a single column family, or specific columns)
get 'user', 'rk0001'
get 'user', 'rk0001', 'info'
get 'user', 'rk0001', 'info:name', 'info:age'
Get the most recent versions (optionally within a time range) for row key rk0001, column family info, in the user table
get 'user', 'rk0001', {COLUMN => 'info', VERSIONS => 2}
get 'user', 'rk0001', {COLUMN => 'info:name', VERSIONS => 5}
get 'user', 'rk0001', {COLUMN => 'info:name', VERSIONS => 5, TIMERANGE => [1392368783980, 1392380169184]}
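A sketch of how multiple versions behave, assuming the user table keeps several versions of the info family (VERSIONS => '3' as in the create example above):
put 'user', 'rk0001', 'info:name', 'zhangsan'
put 'user', 'rk0001', 'info:name', 'lisi'
get 'user', 'rk0001', {COLUMN => 'info:name', VERSIONS => 3}
# returns 'lisi' (newest) and 'zhangsan', each with its own timestamp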
Get cells in the user table with row key rk0001 whose value is zhangsan
get 'user', 'rk0001', {FILTER => "ValueFilter(=, 'binary:zhangsan')"}
Get cells for row key rk0001 in the user table whose column qualifier contains the character a
get 'user', 'rk0001', {FILTER => "(QualifierFilter(=,'substring:a'))"}
2. Scan Operations
Scan all data in the user table
scan 'user'
Scan the info column family in the user table
scan 'user', {COLUMNS => 'info'}
scan 'user', {COLUMNS => 'info', RAW => true, VERSIONS => 5}
A scan can enable Raw mode; with Raw mode on, the results also include cells that carry delete markers but have not yet been physically removed.
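A sketch of Raw mode: after a delete, the tombstone and the deleted cells remain visible to a raw scan until a major compaction removes them.
delete 'user', 'rk0001', 'info:name'
scan 'user'                              # info:name is no longer returned
scan 'user', {RAW => true, VERSIONS => 5}
# the delete marker and the old info:name cells still show up here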
Scan the info and data column families in the user table for columns whose qualifier contains the character a
scan 'user', {COLUMNS => ['info', 'data'], FILTER => "(QualifierFilter(=,'substring:a'))"}
Scan the info column family in the user table over the row key range [rk0001, rk0003)
scan 'user', {COLUMNS => 'info', STARTROW => 'rk0001', ENDROW => 'rk0003'}
Scan rows in the user table whose row key starts with rk
scan 'user',{FILTER=>"PrefixFilter('rk')"}
Scan the user table within the given timestamp range
scan 'user', {TIMERANGE => [1392368783980, 1392380169184]}
3. Deleting Data
Delete the data with row key rk0001 and column qualifier info:name from the user table
delete 'user', 'rk0001', 'info:name'
Delete the data with row key rk0001, column qualifier info:name and timestamp 1392383705316 from the user table
delete 'user', 'rk0001', 'info:name', 1392383705316
Empty (truncate) the user table
truncate 'user'
4. Modifying Table Structure
Disable the user table
disable 'user'
Add two column families, f1 and f2
alter 'user', NAME => 'f1'
alter 'user', NAME => 'f2'
Re-enable the table
enable 'user'
Delete a column family:
alter 'user', NAME => 'f1', METHOD => 'delete'
or: alter 'user', 'delete' => 'f1'
Add column family f1 while deleting column family f2
alter 'user', {NAME => 'f1'}, {NAME => 'f2', METHOD => 'delete'}
Change the number of versions kept by the f1 column family of the user table to 5
alter 'user', NAME => 'f1', VERSIONS => 5
Delete a record
delete 'user', 'rk0001', 'info:name'