!connect jdbc:hive2://s01:10000
hive命令
dfs -lsr /; //执行dfs命令
!clear; //执行shell脚本
hive -e "select * from stu" //-e execute
hive -S -e "select * from stu" //-S 静默,不输出OK、Time taken等提示信息
hive -f /x/x/x/x/a.sql //-f 执行一个文件,通常用于批处理
tab tab //查看所有命令
-- this is a comment ! //注释
set hive.cli.print.header=true; //显示字段名称
create database if not exists myhive1;
create database if not exists myhive1 with dbproperties('creator'='me', 'create_at'='2019-04-02 09:31:00');
alter database myhive1 set dbproperties('creator'='me');
create database if not exists myhive1;
drop database if exists myhive1;
drop database if exists myhive1 cascade;
show databases;
desc database myhive;
show tables;
show tables in myhive;
desc employee;
desc formatted employee;
desc extended employee;
1、创建表
-- Create a table with complex types (ARRAY / MAP / STRUCT).
-- Hive DDL clause order requires LOCATION before TBLPROPERTIES;
-- the statement must also end with a semicolon.
CREATE TABLE IF NOT EXISTS myhive.employee (
id int COMMENT '员工编号',
name String COMMENT '员工姓名',
salary float,
subordinates ARRAY<String> COMMENT 'name of subordinates',
deductions MAP<String, Float>,
address STRUCT<street: String, city:String> COMMENT 'Home address'
)
COMMENT 'Description of the table'
LOCATION '/user/hive/warehouse/myhive.db/employee'
TBLPROPERTIES ('creator'='me', 'create_at'='2019-04-02 09:31:00');
2、内部表
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
[(col_name data_type [COMMENT col_comment],...)]
[COMMENT table_comment]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
-- Managed (internal) table: Hive owns the data; DROP TABLE also deletes the files.
CREATE TABLE IF NOT EXISTS employee(eid int, name String, salary String)
COMMENT 'Employee details'
-- Text files with tab-separated fields and newline-terminated rows.
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
-- Single-row insert; in Hive this launches a full execution job.
insert into employee values(1,'zhangsan', '100');
2、数据加载
LOAD DATA [local] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION(partcol1=val1, partcol2=val2...)]
LOAD DATA local INPATH '/home/hadoop/app/hive/temp.txt' OVERWRITE INTO TABLE employee;
3,外部表
-- External table: Hive tracks only metadata; DROP TABLE leaves the
-- files under LOCATION untouched. Added the missing terminating semicolon.
CREATE EXTERNAL TABLE IF NOT EXISTS stocks(id String, name String)
COMMENT 'external table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/home/data/stocks';
4、分区表
-- Partitioned table: country/city are partition columns, materialized as
-- HDFS subdirectories rather than stored inside the data files.
-- NOTE(review): redefines 'employee' from the earlier example; this will
-- fail if that table still exists — confirm intent.
CREATE TABLE employee(
name String,
salary Float,
subordinates ARRAY<String>,
deductions MAP<String,Float>,
address STRUCT<street:String,city:String,state:String>
)
PARTITIONED BY (country String, city String);
-- Partitioned table with an explicit text-file row format (tab-separated
-- fields, newline-terminated rows); partitions: province/city.
create table myhive.test(id int, name string, age int)
partitioned by (province string,city string)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile;
[加载数据到指定分区]
load data local inpath '/home/hadoop/app/hive/t.txt' into table myhive.test partition(province='sichuan',city='chengdu');
load data local inpath '/home/hadoop/app/hive/t.txt' into table myhive.test partition(province='chongqin',city='dazhou');
[查看分表数据]
select * from test where province='sichuan' and city='chengdu';
[分区表查询模式:strict / nostrict]
set hive.mapred.mode=strict;
[查看分区表有哪些分区]
show partitions myhive.test;
show partitions myhive.test partition(province='sichuan');
desc extended myhive.test;
[手动增加分区]
alter table myhive.test add partition(province='hunan',city='changsha');
alter table myhive.test drop partition(province='hubei');
[修改表属性]
alter table myhive.test set tblproperties('a'='x',...);
[启用归档]
set hive.archive.enabled=true; //默认为false
[复制数据到分区表]
insert into myhive.test partition(province='hunan',city='changsha') select * from myhive.test where province='sichuan' and city='chengdu';
[动态分区]
insert overwrite table myhive.test partition(province,city) select id,name,age,province,city from src_table; //动态分区字段必须放在select列表末尾,顺序与partition()一致
[修改表]
alter table myhive.test rename to myhive.test1; //重命名
alter table myhive.test add //添加多个分区
partition(province='hubei',city='wuhan') location ''
partition(province='hubei',city='wuhan') location ''
partition(province='hubei',city='wuhan') location '';
alter table myhive.test partition(province='hunan',city='changsha') set location ''; //移动分区
alter table myhive.test add columns(birth string, fire string); //增加列
alter table myhive.test change fire fire String; //修改类型
alter table myhive.test drop column fire; //注意:Hive实际不支持drop column,需用replace columns重新定义全部列
alter table myhive.test replace columns(birth string, fire string); //替换全部列定义(并非增加列)
5、聚合函数
select count(*) from test;
select sum(age) from test;
select avg(age) from test;
select max(age) from test;
select min(age) from test;
去重distinct
select count(distinct age) from test;
表生成函数
select explode(array('tom','tomas','tomasLee'));
ascii函数,返回字符串首个字符的ascii值
select ascii('abc');
base64字符串编码
select base64(binary('http://localhost:8080/helloworld'));
binary函数,将字符串转换成二进制
select base64(binary('http://localhost:8080/helloworld'));
binary函数,将字符串转换成二进制
select binary('http://localhost:8080/helloworld');
分页查询
select * from test limit 1 , 1;
嵌套子查询
from (select * from test where province='hebei') e select e.id,e.name,e.age where e.city='baoding';
select e.id,e.name,e.age from (select * from test where province='hebei') e where e.city='baoding';
case..when then
select id,name, case when age<=12 then 'young'
when age>12 and age<=13 then 'middle'
when age>13 and age<=15 then 'old'
else 'too old'
end as yeaystate from test;
不能再where使用列别名
select id, name as n, age from test where name like 't%'; //不能写成 where n like 't%'
范围运算
select * from test where age<=14 and age>=12;
select * from test where age between 12 and 14;
浮点数比较规避方案
select cast(0.2 as float);
group by查询
select province,count(*) from test group by province;
select province as c,count(*) from test group by province having c>3;
----合适hive可以避免MR操作-----
不是mr的job就是本地模式
1、全表扫描,没有where子句
select * from test
2、where子句只作用在分区字段
select * from test where province='hebei';
3、设置hive.exec.mode.local.auto=true
该属性hive会尽力使用local模式查询
4、其余的所有查询都会转换为MR
6、关联查询
select a.id,a.name,b.id,b.orderno,b.price from customers a join orders b on a.id=b.id;
连接查询优化手段,查询表的大小从左到右是递增的。
select a.id,a.name,b.id,b.orderno,b.price from customers a join orders b on a.id=b.id; //right
select a.id,a.name,b.id,b.orderno,b.price from orders b join customers a on a.id=b.id; //wrong
使用查询暗示hint
select /*+streamtable(b)*/ a.id,a.name,b.id,b.orderno,b.price from orders b join customers a on a.id=b.id; //hint中的别名必须与查询中的表别名一致
left outer join
select a.id,a.name,b.id,b.orderno,b.price from customers a left join orders b on a.id=b.id;
right outer join
select a.id,a.name,b.id,b.orderno,b.price from orders b right join customers a on a.id=b.id;
full outer join
select a.id,a.name,b.id,b.orderno,b.price from customers a full join orders b on a.id=b.id;
左半连接select和where子句不能应用到右表字段。
左表的记录在右表中一旦找到对应的记录,右表即停止扫描
select a.id,a.name,b.id,b.orderno,b.price from customers a left semi join orders b on a.id=b.id;
hive不支持右半连接操作right semi join
笛卡尔连接
select a.id,a.name,b.id,b.orderno,b.price from customers a join orders b; //笛卡尔积没有on连接条件
map端连接一张小表,通过mapper的时候,蒋小表完全载入内存。
暗示mapjoin(c)在0.7之前使用
select /*+mapjoin(a)*/a.id,a.name,b.id,b.orderno,b.price from customers a join orders b on a.id=b.id;
select /*+mapjoin(b)*/a.id,a.name,b.id,b.orderno,b.price from customers a join orders b on a.id=b.id;
set hive.auto.convert.join=true 转换连接,map端优化,在右外连接和全外连接中不支持
hive.mapjoin.smalltable.filesize=25000000 --设置小表阈值
order by全排序,对所有的数据通过一个reduce进行排序。
如果开启了hive.mapred.mode=strict,在全排序时必须结合limit使用。
现在推荐使用hive.strict.checks.*属性。
select * from orders order by cid asc , price desc; --全局排序
sort by 每个reduce进行排序(局部排序)。
select * from orders sort by cid asc , price desc; --局部排序
distribute by 等价于自定义分区函数。
select * from orders distribute by cid sort by price desc; //先按cid分发到reduce,再在各reduce内按price局部排序(distribute by不能与order by连用,也不接受asc/desc)
7、分桶采样
select * from orders tablesample(bucket 3 out of 10 on number);
按照数据块百分比采样,100块抽取10块。如果总共一块,没有采样。
select * from orders tablesample(0.1 percent);
union all联合操作,字段类型和个数需要匹配。
select id,name from customers union all select id, orderno from orders;
8、视图
create view v_name as select ...
create view view1 as select a.id,a.name,b.id,b.orderno,b.price from customers a left join orders b on a.id=b.id;
通过视图查询
select * from view1;
select * from view1 where price > 200;
使用like方式创建view
create view v2 like view1; //注意:Hive仅支持create table ... like,视图不支持like方式创建,此写法需验证
删除视图
drop view if exists v2;