hive知识点小结

最新推荐文章于 2024-07-24 20:46:25 发布

biwenjun999

最新推荐文章于 2024-07-24 20:46:25 发布

阅读量1.8k

点赞数

分类专栏：大数据文章标签： hive hadoop

本文链接：https://blog.csdn.net/biwenjun999/article/details/53148381

版权

大数据专栏收录该内容

12 篇文章 2 订阅

订阅专栏

hive
------------------
数据仓库，OLAP，分析处理，存储和分析，延迟较高。
数据库: OLTP,在线事务处理，低延迟，事务支持。
运行在hadoop之上，以类SQL方式运行，SQL(HiveQL,HQL)，底层转换成MR运算。
操纵结构化数据。
schema(模式，元信息存放到数据库中)，HDFS文件。derby，mysql。
数据库和表都是路径。

hive
------------------
类似mysql

[配置hive]
1.conf/hive-env.sh
HADOOP_HOME=... //不配也可以
2.conf/hive-site.xml
${system:java.io.tmpdir} //配置本地临时目录

$hive>schematool -initSchema -dbType derby //初始化模式
$hive>hive //进入hive命令行

hive常用命令
----------------------
$hive>!clear ; //hive中执行shell命令
$hive>dfs -lsr / ; //hive中执行hdfs命令(dfs命令不需要!前缀)
$hive>create table hive1.t as select * from hive2.t2 ; //复制表

将hive中的schema存放到外部的mysql中
---------------------------------------
1.编写hive-site.xml，添加mysql连接信息
[hive/conf/hive-site.xml]
...
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://192.168.231.1:3306/myhive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
</property>
2.在mysql中创建myhive数据库
mysql>create database myhive ;

3.mysql驱动程序(jar)放置到hive classpath下。
...
4.重新初始化hive schema元数据库。
$>hive/bin/schematool -initSchema -dbType mysql

启动hiveserver2服务,接收多个客户端连接请求,
使得client通过jdbc连接操纵hive数据仓库
--------------------------------------------
$>hive/bin/hive --service hiveserver2 start //启动服务
$>netstat -ano | grep 10000 //查看端口

在eclipse中创建maven(暂时不用)项目
----------------------------------
1.创建java项目
2.引入外部jar包
181个
3.修改hive-site.xml配置文件
使用OS操作系统的认证方式。
[hive-site.xml]
hive.server2.enable.doAs=false
hive.metastore.sasl.enabled=false
hive.server2.authentication=NONE
3'.重启hiveserver2服务器
$hive>hive --service hiveserver2 stop
$hive>hive --service hiveserver2 start &
$hive>netstat -ano | grep 10000 //验证是否启动10000端口

4.编写App程序
/**
 * Connects to HiveServer2 over JDBC, runs {@code select * from t} and prints
 * every row as {@code id,name,age} on standard output.
 *
 * @param args unused
 * @throws Exception if the Hive JDBC driver class cannot be loaded, or if
 *                   connecting, querying or reading a row fails
 */
public static void main(String[] args) throws Exception {
    // Load the Hive JDBC driver explicitly.
    Class.forName("org.apache.hive.jdbc.HiveDriver");

    // try-with-resources closes rs, ppst and conn in reverse order even when
    // the query or a row read throws, so no JDBC resource is leaked.
    try (Connection conn = DriverManager.getConnection("jdbc:hive2://192.168.231.100:10000/hive1","ubuntu","123456");
         PreparedStatement ppst = conn.prepareStatement("select * from t");
         ResultSet rs = ppst.executeQuery()) {
        while (rs.next()) {
            int id = rs.getInt("id");
            String name = rs.getString("name");
            int age = rs.getInt("age");
            System.out.println(id + "," + name + "," + age);
        }
    }
}

5.常用聚集函数
count()
sum()
avg()
max()
min()

6.解决beeline命令行终端的上下键导航历史命令的bug
[bin/beeline]
修改行
if [[ ! $(ps -o stat= -p $$) =~ + ]]; then
为
if [[ ! $(ps -o stat= -p $$) =~ "+" ]]; then

hive命令
-------------
$hive>dfs -lsr / //执行dfs命令
$hive>!clear ; //执行shell脚本
$>hive -e "select * from test" //-e execute
$>hive -S -e "select * from test" //-S 静默,不输出OK,...
$>hive -f /x/x/x/a.sql //-f 执行一个文件,通常用于批处理
$hive>tab tab //显示所有命令
$hive>-- this is a comment ! //以--开头的是注释
$hive>set hive.cli.print.header=true; //显示字段名称(头)
$hive>create database if not exists xxx //
$hive>create database hive3 with dbproperties('author'='xupc','createtime'='today')
$hive>alter database hive3 set dbproperties('author'='you')

[创建表语法]
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[ROW FORMAT row_format]
[STORED AS file_format]
[创建表例子]
CREATE TABLE IF NOT EXISTS employee( eid int, name String, salary String, destination String)
COMMENT 'Employee details' //注释
PARTITIONED BY (PNAME PTYPE,...)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t' //字段结束符
LINES TERMINATED BY '\n' //行结束符
STORED AS TEXTFILE; //存储成何种文件
[创建表带分区]
create table hive1.test5(id int ,name string ,age int ) partitioned by (province string , city string) row format DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS TEXTFILE;

[加载数据===insert]
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
LOAD DATA LOCAL INPATH '/home/user/sample.txt' OVERWRITE INTO TABLE employee;

[创建分区表]
create table test5(id int,name string,age int) partitioned by (province string,city string); //按照省份和城市分区
[加载数据到指定分区]
load data local inpath '/home/ubuntu/employees.txt' into table hive1.test5 partition(province='hebei',city='baoding');
[查看hdfs]
/user/hive/warehouse/hive1.db/test5/province=hebei/city=baoding/employees.txt //
[查询分区表]
$hive>select * from hive1.test5 where province = 'hebei' and city = 'baoding';
[分区表的查询模式:strict / nonstrict]
$hive>set hive.mapred.mode=strict //严格模式,默认是nonstrict

[查看分区表有哪些分区]
$hive>show partitions hive1.test5 ;
$hive>show partitions hive1.test5 partition(province='hebei') ; //查看具体分区的细节信息
$hive>desc extended hive1.test5 ; //查看扩展表信息
[手动增加分区]
$hive>alter table hive1.test5 add partition(province='henan',city='pingdingshan') //
$hive>alter table hive1.test5 add partition(area='huabei',province='henan',city='pingdingshan') //增加不存在的分区列，是非法的。

[修改表]
$hive>alter table hive1.test5 rename to hive1.test6 ; //重命名
$hive>alter table hive1.test5 add partition(province='hebei',city='zhangjiakou') location 'xxx' //添加多个分区
partition(province='hebei',city='tangshan')
partition(province='hebei',city='chengde')
partition(province='hebei',city='handan')
$hive>alter table hive1.test5 partition(province=..,city=..) set location 'xxxx' //移动分区

$hive>alter table hive1.test5 change name username string //修改列,语法: change 旧列名 新列名 类型
$hive>alter table hive1.test5 add columns(birth string , fire string); //增加列
$hive>alter table hive1.test5 replace columns(birth string , fire string); //替换所有列(原有列被新列定义取代)

[修改表属性]
$hive>alter table hive1.test5 set tblproperties('a'='x',...) //修改表属性
[启用归档]
$hive>set hive.archive.enabled=true //设置可以归档,默认false????

[复制数据到分区表]
$hive>insert into hive1.test5 partition(province='hebei',city='baoding') select * from hive1.test2 ;//
insert into hive1.test6 partition(province='hebei',city='baoding') select * from hive1.test6 //字段个数相同
where province='hebei' and city='shijiazhuang' and id > 5 ; //查询时，分区通过where子句指定，
//插入时，分区用partition(..)指定
[动态分区]
$hive>insert overwrite table hive1.test6 partition(province,city) select id,...,province ,city //动态分区.
from table2 ;

$hive>create table hive1.test1(id int) tblproperties('author'='xupc');
//创建表，指定属性
$hive>create table hive1.test1(id int) LOCATION '/user/ubuntu/test1'

$hive>create external table hive1.test3 like hive1.test1 ; //创建外部表external,只复制表结构，没有数据。
$hive>create table hive1.test4 as select * from hive1.test1; //CTAS,复制表结构和数据(CTAS的目标表不能是外部表)

$hive>desc extended hive1.test1; //显示扩展信息
$hive>desc formatted hive1.test1; //显示格式化的信息
$hive>create table hive2.test4 like hive1.test1; //复制表
$hive>show tables in hive1; //显示指定数据库的表集合,默认是当前库.

$hive>drop database if exists xxx //存在即删除
$hive>drop database if exists xxx //存在即删除
$hive>drop database if exists xxx cascade //级联删除
$hive>create database hive2 location '/user/ubuntu/';
$hive>desc[ribe] database hive2 //显示db信息,不包含扩展信息
$hive>desc[ribe] database extended hive3 //显示db信息,包含扩展信息
$hive>use hive3; //使用哪个库

分区表
-----------------

托管表
----------------
hive默认表都是托管表。hive控制其数据的生命周期。删除托管表时，元数据和数据都被删除。

外部表
---------------
hive只控制元数据。删除外部表时，只删除元数据，数据不被删除。
create external table hive1.test3 like hive1.test1 ;

使用beeline客户端可以实现远程jdbc连接
--------------------------------------
1.连接
$>hive --service beeline -u jdbc:hive2://s100:10000/hive1
$>beeline -u jdbc:hive2://s100:10000/hive1
$beeline>
$beeline>!sh clear ; //执行shell脚本
$beeline>show databases; //查看库
$beeline>!help; //帮助
$beeline>!dbinfo //查看数据库信息

配置hive的仓库位置
---------------------
[hive-site.xml]
hive.metastore.warehouse.dir=/user/hive/warehouse/

hive数据类型
----------------------
类型 Size 案例
TINYINT 1 byte signed integer. 20 //byte
SMALLINT 2 byte signed integer. 20 //short
INT 4 byte signed integer. 20 //int
BIGINT 8 byte signed integer. 20 //long
BOOLEAN Boolean true or false. TRUE //boolean
FLOAT Single precision floating point. 3.14159 //float
DOUBLE Double precision floating point. 3.14159 //double

STRING 'Now is the time', "for all good men" //字符串'' / ""
TIMESTAMP
BINARY 字节数组

[集合类型]
STRUCT struct('John', 'Doe')
MAP map('first', 'John','last', 'Doe')
ARRAY array('John', 'Doe')

Hive所谓的读模式
-----------------------
hive在写操作时不校验，读时校验。

-----------------------------------------------------
创建分区表
------------
create external table hive1.test2(id int , name string ,age int)
partitioned by(province string , city string)
row format DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

手动添加分区
-------------
alter table hive1.test2 add partition(province='hebei',city='baoding')

插入数据到分区表
----------------
insert into hive1.test2 partition(province='hebei',city='baoding')
select * from hive1.test6 where province='hebei' and city='shijiazhuang' and id > 5 ;

动态分区
----------------
$hive>-- 创建test3分区表
$hive>create table test3(id int,name string,age int)
partitioned by (province string,city string)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile ;

$hive>-- 动态分区,复制一个表数据到分区表，动态创建分区
$hive>-- 如果两个都是动态分区需要关闭严格模式
$hive>set hive.exec.dynamic.partition.mode=nonstrict; //关闭动态分区的严格模式
$hive>insert into hive1.test2 partition(province,city) //
select id,name,age, 'henan' as province , 'kaifeng' as city
from table2 ;

$hive>-- 使用分区动静混合
$hive>insert into hive1.test2 partition(province='henan',city)
select id,name,age, 'kaifeng' as city
from table2 ;

$hive>-- 查询期间，动态创建表，并将数据写入创建表中
$hive>create table test3 as select id,name from test2 where province = 'hebei' and city = 'baoding';

$hive>-- 导出hive数据到本地目录(下载)
$hive>insert overwrite local directory '/home/ubuntu/hive' select * from test2 where province = 'hebei';

$hive>-- 导出hive数据到HDFS目录
$hive>insert overwrite directory 'hdfs://s100:8020/user/ubuntu/xxx' select * from test2 where province = 'hebei';

$>-- 查询数据向多个目录同时输出??????????????????
$>from test2 t
insert overwrite local directory '/home/ubuntu/hebei' select * where t.province = 'hebei'
insert overwrite local directory '/home/ubuntu/henan' select * where t.province = 'henan' ;

查询
-------------------------------
$>-- 查询,投影查询，指定表的别名
$>select col1,col2,... from table t ;

$>-- 查询,使用函数
$>select upper(name) from test2 ;
$>select lower(name) from test2 ;

$>-- 数学函数
$>select round(12.345) ; //四舍五入
$>select floor(12.345) ; //地板
$>select ceil(12.345) ; //天花板
$>select rand(10) ; //随机数

$>-- 聚合函数
$>select count(*) from test2 ;
$>select sum(age) from test2 ;
$>select avg(age) from test2 ;
$>select max(age) from test2 ;
$>select min(age) from test2 ;

$>-- 去重.distinct
$>select count(distinct name) from test2 ;

$>-- 表生成函数
$>select explode(array('tom','tomas','tomsLee')) ;

$>-- ascii函数,字符串首个字母ascii值
$>select ascii('abc') ;

$>-- base64字符串编码
$>select base64(binary('http://localhost:8080/helloworld')) ; //输出base64编码后的字符串

$>-- binary函数，将字符串转换成二进制
$>select base64(binary('http://localhost:8080/helloworld')) ; //先用binary转成二进制,再做base64编码

$>-- binary函数，将字符串转换成二进制
$>select binary('httpt://localhost:8080/helloworld') ;

$>-- 类型转换
$>select cast('120' as bigint) + 200 ; //320

$>-- 字符串连接
$>select concat('120',200) ; //120200

$>-- 分页查询limit
$>select * from test2 limit 1,1 ; //offset , length

$>-- 嵌套子查询
$>from (select * from test2 where province = 'hebei') e select e.id,e.name,e.age where e.city = 'baoding'; //OK
$>select e.id,e.name,e.age from (select * from test2 where province = 'hebei') e where e.city = 'baoding'; //OK

$>-- case .. when then 相当于if
$>select id,name,case when age <= 12 then 'young'
when age > 12 and age <= 13 then 'middle'
when age > 13 and age <= 15 then 'old'
else 'too old'
end as yearstate from test2 ;

$>-- 不能在where使用列别名
$>select id,name n ,age from test2 where n like 't%' ; //wrong,where中不能字段别名。

$>-- 范围运算
$>select id,name n ,age from test2 where age <= 14 and age >= 12 ;
$>select id,name n ,age from test2 where age between 12 and 14 ;

$>-- 浮点数比较的规避方案
$>select cast(0.2 as float) ;

$>-- group by查询
$>select count(*),province from test2 group by province ; //
$>select count(*) as c ,province from test2 group by province having c > 3 ; //having是组内过滤

[Hive的Join操作,只支持等值连接]
$>-- 创建customers和orders表，一对多关系。

$>-- customers
$>create table customers(id int , name string , age int)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile ;

$>-- orders
$>create table orders(id int , orderno string , price float , cid int)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile ;

$>-- 准备数据
customers.txt数据
------------------
1 tom1 12
2 tom2 13
3 tom3 14

orders.txt数据
-------------------
1 No001 121.34 1
2 No002 121.35 1
3 No003 121.00 1
4 No004 22.66 2
5 No005 300.65 2
6 No006 800.56 2
7 No007 1000.12

$>-- 加载数据到customers + orders中.
$>load data local inpath '/home/ubuntu/customers.txt' into table hive1.customers;
$>load data local inpath '/home/ubuntu/orders.txt' into table hive1.orders;

$>select * from customers;

$> --内连接 join .. on
$>select a.id,a.name,b.id,b.orderno,b.price from customers a join orders b on a.id = b.cid ;
1 tom1 1 No001 121.34
1 tom1 2 No002 121.35
1 tom1 3 No003 121.0
2 tom2 4 No004 22.66
2 tom2 5 No005 300.65
2 tom2 6 No006 800.56

$>-- 连接查询优化手段：查询表的大小从左到右是递增的。
$>select c.id,c.name,c.age,o.orderno,o.price from customers c join orders o on c.id = o.cid where ...; //right
$>select c.id,c.name,c.age,o.orderno,o.price from orders o join customers c on c.id = o.cid where ...; //wrong

$>-- 使用查询暗示 hint
$>select /*+streamtable(c)*/ c.id,c.name,c.age,o.orderno,o.price from orders o join customers c on c.id = o.cid;

$>-- left outer join
$>select c.id,c.name,c.age,o.orderno,o.price from customers c left outer join orders o on c.id = o.cid ;
1 tom1 1 No001 121.34
1 tom1 2 No002 121.35
1 tom1 3 No003 121.0
2 tom2 4 No004 22.66
2 tom2 5 No005 300.65
2 tom2 6 No006 800.56
3 tom3 NULL NULL NULL

$>-- right outer join
$>select c.id,c.name,c.age,o.orderno,o.price from customers c right outer join orders o on c.id = o.cid ;
1 tom1 1 No001 121.34
1 tom1 2 No002 121.35
1 tom1 3 No003 121.0
2 tom2 4 No004 22.66
2 tom2 5 No005 300.65
2 tom2 6 No006 800.56
NULL NULL 7 No007 1000.12

$>-- full outer join
$>select c.id,c.name,c.age,o.orderno,o.price from customers c full outer join orders o on c.id = o.cid ;
NULL NULL 7 No007 1000.12
1 tom1 3 No003 121.0
1 tom1 2 No002 121.35
1 tom1 1 No001 121.34
2 tom2 6 No006 800.56
2 tom2 5 No005 300.65
2 tom2 4 No004 22.66
3 tom3 NULL NULL NULL

$>-- 左半连接,select和where子句不能引用到右边表字段
$>-- 左表的记录在右表中一旦找到对应的记录，右侧表即停止扫描。
$>select c.id,c.name from customers c left semi join orders o on c.id = o.cid ;

$>-- hive不支持右半连接操作. right semi join xxxx

$>-- 笛卡尔连接 m x n
$>select c.id,c.name,o.orderno from customers c join orders o ;

$>-- map端连接,一张小表，通过mapper的时候，将小表完全载入内存。
$>-- 暗示 mapjoin(c)在0.7之前使用。
$>select /*+mapjoin(c)*/ c.id,c.name,o.orderno from customers c join orders o ;
$>select /*+mapjoin(o)*/ c.id,c.name,o.orderno from customers c join orders o ;

$>set hive.auto.convert.join=true --转换连接,map端优化，在右外链接和全外连接中不支持
$>set hive.mapjoin.smalltable.filesize=25000000 --设置小表阀值

$>-- order by 全排序，对所有数据通过一个reduce进行排序。
$>-- 如果开启了hive.mapred.mode=strict,在全排序时必须结合limit使用。
$>-- 现在推荐使用hive.strict.checks.*属性.
$>select * from orders order by cid asc , price desc ; --全局排序

$>-- sort by 每个reduce进行排序(局部排序)。
$>select * from orders sort by cid asc , price desc ; --局部排序

$>-- distribute by等价于自定义分区函数,写在sort by之前.
$>select * from orders distribute by cid sort by price desc ; --局部排序

$>--cluster by === distribute by ... sort by ...

$>-- 分桶采样
$>select * from orders tablesample(bucket 3 out of 10 on number) ;

$>-- 按照数据块百分比采样，例如10 percent: 共100块则抽取10块；如果总共只有1块，则无法按块采样。
$>select * from orders tablesample(10 percent) ;

$>-- union all 联合操作,字段类型和个数需要匹配。
$>select id , name from customers union all select id ,orderno from orders ;

$>-- view(虚表),降低查询的复杂度
$>-- create view v_name as select ...
$>create view view1 as select c.id,c.name,c.age,o.id,o.orderno,o.price --创建视图
from customers c left outer join orders o on c.id = o.cid ;

$>-- 通过视图直接查询
$>select * from view1 ;
$>select * from view1 where price > 200 ;

$>-- 使用like方式创建view
$>create view v2 like view1 ;

$>--删除视图
$>drop view if exists v2;

$>--索引,hive没有key(primary key + auto_increment)
$>--创建索引,DEFERRED REBUILD该选项时，索引为空白状态，需要rebuild才能够初始化。
$>CREATE INDEX idx_customers_id ON TABLE customers (id) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD IDXPROPERTIES ('creator' = 'me') IN TABLE customers_index COMMENT 'this is a comment!';

$>-- order index
$>CREATE INDEX idx_orders_orderno ON TABLE orders (orderno) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD IN TABLE orders_index;

$>alter index idx_customers_id on customers rebuild ; --重建索引,产生索引表(hdfs文件)
hdfs://s100:8020/../customers_index/000000_0 --索引文件(表)
[内容如下]
1 hdfs://s100:8020/user/hive/warehouse/hive1.db/customers/customers.txt [0]
2 hdfs://s100:8020/user/hive/warehouse/hive1.db/customers/customers.txt [10]
3 hdfs://s100:8020/user/hive/warehouse/hive1.db/customers/customers.txt [20]

$>alter index idx_customers_id on customers rebuild ; --重建索引
$>alter index idx_orders_orderno on orders rebuild ; --重建索引
No001hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt0
No002hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt17
No003hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt34
No004hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt51
No005hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt67
No006hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt84
No007hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt101

$>-- 删除索引
$>drop index idx_customers_id on customers;

$>--分区是路径，是目录，是文件逻辑隔离。有效降低查询量。

$>--桶表(bucket)，是文件。
$>--创建桶表
$>create table ... clustered by (field_name) into n buckets ;
$>create table orderitems (id int , itemname string , oid int) clustered by (oid) into 3 buckets row format delimited fields terminated by '\t' lines terminated by '\n' stored as textfile ;

1 item1 1
2 item2 1
3 item3 1
4 item4 2
5 item5 2
6 item6 2
7 item7 3
8 item8 3
9 item9 3
10 item10 3
12 item10 4
13 item10 4
14 item10 4
15 item10 4

***** 何时Hive可以避免MR操作 *****
--------------------------------------
不是mr的job就是本地模式。
1.全表扫描：没有where子句。
select * from test2 ;

2.where子句作用只有分区字段，也不需要mr.
select * from test2 where province = 'hebei' ;

3.设置hive.exec.mode.local.auto=true
该属性hive会尽量使用local模式查询。

4.其余的所有查询都会转换成MR.

biwenjun999

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
hive知识点小结

hive------------------数据仓库，OLAP，分析处理，存储和分析，延迟较高。数据库: OLTP,在线事务处理，低延迟，事务支持。运行在hadoop，类SQL方法方式运行，SQL(HiveQL,HQL),MR运算。操纵的结构化数据。schema(模式，元信息存放到数据库中)，HDFS文件。derby，mysql。数据库和表都是路径。hiv
复制链接

扫一扫