Introduction
I. Overview
1. Hive is a data warehouse tool from Apache, built on top of Hadoop.
2. Hive provides a SQL-like language for reading, writing, and managing data stored in a distributed system.
3. Hive translates SQL statements into MapReduce programs under the hood to operate on the data.
4. Hive lowers the barrier to entry for offline (batch) analysis.
hive.apache.org
II. Databases vs. Data Warehouses
| | Database | Data Warehouse |
|---|---|---|
| Data volume | Tens of millions of rows (GB scale) | Starts at the TB scale |
| Data source | Single source, data captured in real time | Multiple sources, stores historical data |
| Transactions | Strong emphasis on transactions | Weak transactions; some have none at all |
| Audience | Developers | Customers, managers, etc. |
| Operations | Full create/read/update/delete | Insert and query; usually no single-row updates or deletes |
| Scenario | OLTP (Online Transaction Processing) | OLAP (Online Analytical Processing) |
| Tables | Table structures modeled on the business | Mainly dimension tables and fact tables |
Hive download address
http://archive.apache.org/dist/hive/hive-1.2.0/
Start Hadoop
[root@hadoop01 presoftware]# start-all.sh
Extract the archive
[root@hadoop01 presoftware]# tar -xvf apache-hive-1.2.0-bin.tar.gz
[root@hadoop01 apache-hive-1.2.0-bin]# ls
bin conf examples hcatalog lib LICENSE NOTICE README.txt RELEASE_NOTES.txt scripts
[root@hadoop01 apache-hive-1.2.0-bin]# cd bin
[root@hadoop01 bin]# pwd
/home/presoftware/apache-hive-1.2.0-bin/bin
Start Hive
[root@hadoop01 bin]# sh hive
Logging initialized using configuration in jar:file:/home/presoftware/apache-hive-1.2.0-bin/lib/hive-common-1.2.0.jar!/hive-log4j.properties
hive>
>
> show databases;
OK
default
Time taken: 1.362 seconds, Fetched: 1 row(s)
hive> create database hivedemo;
OK
Time taken: 0.381 seconds
hive> create database hivedemo2;
OK
Time taken: 0.15 seconds
hive> drop database hivedemo2;
Moved: 'hdfs://hadoop01:9000/user/hive/warehouse/hivedemo2.db' to trash at: hdfs://hadoop01:9000/user/root/.Trash/Current
OK
Time taken: 0.561 seconds
hive> use hivedemo;
OK
Time taken: 0.036 seconds
hive> create table student(id int,name string,age int);
OK
Time taken: 0.636 seconds
hive> show tables;
OK
student
Time taken: 0.022 seconds, Fetched: 1 row(s)
hive> insert into student values(1,'amy',15);
Time taken: 75.586 seconds
Ways to load data into the table
[root@hadoop01 home]# vim a.txt
1 gyj 25
2 zihui 24
3 xiaoge 66
hive> load data local inpath '/home/a.txt' into table student;
hive> select * from student;
OK
1 amy 15
NULL NULL NULL
NULL NULL NULL
NULL NULL NULL
Time taken: 2.121 seconds, Fetched: 4 row(s)
hive> drop table student;
Moved: 'hdfs://hadoop01:9000/user/hive/warehouse/hivedemo.db/student' to trash at: hdfs://hadoop01:9000/user/root/.Trash/Current
OK
Time taken: 2.696 seconds
hive> create table student(id int,name string,age int) row format delimited fields terminated by ' ';
OK
Time taken: 0.251 seconds
hive> insert into student values(1,'amy',15);
hive> load data local inpath '/home/a.txt' into table student;
hive> select * from student;
OK
1 amy 15
1 gyj 25
2 zihui 24
3 xiaoge 66
Time taken: 0.162 seconds, Fetched: 4 row(s)
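The NULL rows in the earlier select came from a delimiter mismatch: the first `student` table used Hive's default field delimiter (\001, Ctrl-A), so the space-separated lines of a.txt could not be split into three columns. A minimal Python sketch of that behavior (illustrative only, not Hive's actual LazySimpleSerDe):

```python
def parse_line(line, delimiter):
    """Split a row the way Hive's simple SerDe roughly does and map it to
    (id, name, age). Missing fields and failed int casts become None (NULL)."""
    parts = line.rstrip("\n").split(delimiter)
    # Hive fills missing trailing fields with NULL
    parts += [None] * (3 - len(parts))
    id_, name, age = parts[:3]

    def to_int(v):
        try:
            return int(v)
        except (TypeError, ValueError):
            return None  # e.g. "1 gyj 25" cast to int fails -> NULL

    return (to_int(id_), name, to_int(age))

# Default delimiter \x01 never occurs in the file: the whole line lands in
# the first column (int cast fails), the rest are missing -> NULL NULL NULL
print(parse_line("1 gyj 25", "\x01"))   # (None, None, None)
# With 'fields terminated by \' \'' the row splits cleanly
print(parse_line("1 gyj 25", " "))      # (1, 'gyj', 25)
```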
hive> create table stu2 like student;
OK
hive> desc stu2;
OK
id int
name string
age int
hive> insert into stu2 select * from student where id >2;
hive> select * from stu2;
OK
3 xiaoge 66
Time taken: 0.573 seconds, Fetched: 1 row(s)
Monitoring Hive
Writing table data out to a local directory
[root@hadoop01 home]# mkdir hivedata
hive> insert overwrite local directory '/home/hivedata' row format delimited fields terminated by ',' select * from student;
[root@hadoop01 home]# cd hivedata/
[root@hadoop01 hivedata]# ls
000000_0
hive> create table stu3 like student;
hive> from student insert into stu2 select * where id>2 insert into stu3 select * where name='zihui';
hive>
>
> insert overwrite directory '/student' row format delimited fields terminated by ' ' select * from student;
hive> alter table student rename to stu1;
OK
Time taken: 5.334 seconds
hive> show tables;
OK
stu1
stu2
stu3
values__tmp__table__1
values__tmp__table__2
Time taken: 0.534 seconds, Fetched: 5 row(s)
hive> alter table stu1 add columns (gender string);
OK
Time taken: 1.319 seconds
hive> desc stu1;
OK
id int
name string
age int
gender string
Time taken: 0.236 seconds, Fetched: 4 row(s)
hive> select * from stu1;
OK
1 amy 15 NULL
1 gyj 25 NULL
2 zihui 24 NULL
3 xiaoge 66 NULL
Time taken: 0.267 seconds, Fetched: 4 row(s)
The irregular-field problem
Example
Create the table
create table log(host string,time string,timezone string,request string,resource string,protocol string,state int) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties ("input.regex"="(.*) \-\- \\[(.*) (.*)\\] \"(.*) (.*) (.*)\" ([0-9]+) \-") stored as textfile;
hive> load data local inpath '/home/a.log' into table log;
Loading data to table hivedemo.log
Table hivedemo.log stats: [numFiles=1, totalSize=1435]
OK
Time taken: 0.381 seconds
hive> select * from log;
OK
192.168.120.23 30/Apr/2018:20:25:32 +0800 GET /asf.avi HTTP/1.1 304
192.168.120.23 30/Apr/2018:20:25:32 +0800 GET /bupper.png HTTP/1.1 304
192.168.120.23 30/Apr/2018:20:25:32 +0800 GET /bupper.css HTTP/1.1 304
192.168.120.23 30/Apr/2018:20:25:33 +0800 GET /bg-button HTTP/1.1 304
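RegexSerDe maps each capture group of `input.regex` to one table column, in order. A hedged Python sketch of the same extraction, using a simplified form of the pattern above and a raw log line assumed from the parsed rows (the a.log contents are not shown in the transcript):

```python
import re

# Simplified version of the input.regex used in the DDL above:
# each capture group corresponds to one Hive column, left to right
pattern = re.compile(r'(.*) -- \[(.*) (.*)\] "(.*) (.*) (.*)" ([0-9]+) -')

# Hypothetical raw line matching the format the regex implies
line = '192.168.120.23 -- [30/Apr/2018:20:25:32 +0800] "GET /asf.avi HTTP/1.1" 304 -'

m = pattern.match(line)
host, time_, timezone, request, resource, protocol, state = m.groups()
print(host, resource, state)   # 192.168.120.23 /asf.avi 304
```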
III. Details
1. Every database created in Hive gets a corresponding directory on HDFS.
2. Hive has no concept of primary keys.
3. Every table in Hive corresponds to a directory on HDFS.
4. When Hive is used to manage local data, the data is uploaded to HDFS.
5. Once a table has been created, its field delimiter cannot be changed.
6. When writing query results from Hive to a local directory, the write can only overwrite.
7. SerDe (Serializer/Deserializer):
a. Used specifically to extract fields from irregularly formatted data; the extraction regular expression must be specified.
b. During extraction, each capture group in the regular expression maps to one field of the Hive table.
IV. Table Structures
1. Internal (managed) tables: tables created directly in Hive, stored under /user/hive/warehouse. When an internal table is dropped, its directory is deleted as well.
2. External tables: use a Hive table structure to manage data that already exists on HDFS. The path given when creating an external table must be a directory, not a specific file. When an external table is dropped, its directory is not deleted.
3. Partitioned tables:
a. Used to categorize data.
b. The partition column does not exist in the raw data.
c. Each partition corresponds to a directory.
d. Using the partition as a query condition can greatly improve query efficiency; querying across partitions, however, can actually reduce efficiency.
e. Dynamic partitioning.
Creating an external table
hive> create external table score(name string,chinese int,math int,english int) row format delimited fields terminated by ' ' location '/score';
hive> select * from score;
OK
Bob 90 64 92
Alex 64 63 68
Grace 57 86 24
Henry 39 79 78
Adair 88 82 64
Chad 66 74 37
Colin 64 86 74
Eden 71 85 43
Grover 99 86 43
Partitioned tables
[root@hadoop01 hivedata]# vim cnperson.txt
[root@hadoop01 hivedata]# vim usperson.txt
hive> create table person (id int,name string) partitioned by (country string) row format delimited fields terminated by ' ';
hive> load data local inpath '/home/hivedata/cnperson.txt' into table person partition(country='cn');
hive> load data local inpath '/home/hivedata/usperson.txt' into table person partition(country='us');
hive> select * from person;
OK
1 紫慧 cn
2 张小仙 cn
3 佩奇 cn
1 詹姆斯 us
2 特朗普 us
3 奥巴马 us
hive> select * from person where country='cn';
OK
1 紫慧 cn
2 张小仙 cn
3 佩奇 cn
Time taken: 0.617 seconds, Fetched: 3 row(s)
Registering manually created partition directories as partitions of the Hive table
There are two ways:
one is to add the partition explicitly, the other is to repair the table.
hive> alter table person add partition(country='jp') location '/user/hive/warehouse/hivedemo.db/person/country=jp';
OK
Time taken: 0.715 seconds
hive> select * from person where country='jp';
OK
1 紫慧 jp
2 张小仙 jp
3 佩奇 jp
Time taken: 0.377 seconds, Fetched: 3 row(s)
hive> msck repair table person;
OK
Partitions not in metastore: person:country=kn
Repair: Added partition to metastore person:country=kn
Time taken: 0.49 seconds, Fetched: 2 row(s)
hive> select * from person where country='kn';
OK
1 紫慧 kn
2 张小仙 kn
3 佩奇 kn
Time taken: 0.322 seconds, Fetched: 3 row(s)
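`msck repair table` scans the table's directory on HDFS and registers any partition directories that are missing from the metastore. A rough Python sketch of that reconciliation (illustrative only, not Hive's implementation; the names mirror the transcript above):

```python
def msck_repair(table, fs_partitions, metastore_partitions):
    """Partitions whose directories exist on the filesystem but are absent
    from the metastore get registered, mimicking 'msck repair table'."""
    missing = sorted(set(fs_partitions) - set(metastore_partitions))
    for p in missing:
        print(f"Partitions not in metastore: {table}:{p}")
        metastore_partitions.add(p)
        print(f"Repair: Added partition to metastore {table}:{p}")
    return missing

fs = ["country=cn", "country=us", "country=jp", "country=kn"]   # HDFS dirs
meta = {"country=cn", "country=us", "country=jp"}               # metastore
msck_repair("person", fs, meta)   # registers country=kn
```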
Dropping and renaming partitions
hive> alter table person drop partition(country='kn');
Moved: 'hdfs://hadoop01:9000/user/hive/warehouse/hivedemo.db/person/country=kn' to trash at: hdfs://hadoop01:9000/user/root/.Trash/Current
Dropped the partition country=kn
OK
Time taken: 1.269 seconds
hive> alter table person partition(country='jp') rename to partition(country='kn');
OK
Time taken: 0.399 seconds
[root@hadoop01 hivedata]# vim person.txt
1 amy cn
2 sam us
3 alex en
4 bob cn
5 helen cn
6 grace en
7 jack jp
Inserting data from a non-partitioned table into a partitioned table
hive> create table p2(id int,name string,country string) row format delimited fields terminated by ' ';
OK
Time taken: 1.051 seconds
hive>
>
> load data local inpath '/home/hivedata/person.txt' into table p2;
Loading data to table default.p2
Table default.p2 stats: [numFiles=1, totalSize=69]
OK
Time taken: 1.375 seconds
hive> select * from p2;
OK
1 amy cn
2 sam us
3 alex en
4 bob cn
5 helen cn
6 grace en
7 jack jp
Time taken: 0.422 seconds, Fetched: 7 row(s)
Enable dynamic partitioning
hive> set hive.exec.dynamic.partition=true;
Set non-strict mode
hive> set hive.exec.dynamic.partition.mode=nonstrict;
Perform the insert
hive> insert into table person partition(country) select * from p2 distribute by (country);
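With dynamic partitioning, Hive reads the partition column value (here `country`) from each row of the source table and routes the row to the matching partition directory. A rough Python sketch of that routing, using the p2 rows from above:

```python
from collections import defaultdict

# Rows of the non-partitioned source table p2: (id, name, country)
rows = [(1, "amy", "cn"), (2, "sam", "us"), (3, "alex", "en"),
        (4, "bob", "cn"), (5, "helen", "cn"), (6, "grace", "en"),
        (7, "jack", "jp")]

# Each distinct value of the partition column becomes one directory,
# e.g. /user/hive/warehouse/hivedemo.db/person/country=cn
partitions = defaultdict(list)
for id_, name, country in rows:
    partitions[f"country={country}"].append((id_, name))

for part in sorted(partitions):
    print(part, partitions[part])
```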
4. Bucketed tables:
a. Distribute data into different buckets; the main purpose is sampling.
b. A bucketed table cannot be an external table and cannot be loaded with load data; data can only be inserted with insert.
Create the table
create table p3(id int,name string,country string) clustered by (name) into 6 buckets row format delimited fields terminated by ' ';
insert into table p3 select * from p2;
Note:
With the table split into 6 buckets:
bucket 5 out of 1 is an error: in tablesample(bucket x out of y) the numerator x must not exceed the denominator y, so there is no bucket 5 to start from when y is 1.
select * from p3 tablesample(bucket 5 out of 1 on name);
select * from p3 tablesample(bucket 1 out of 1 on name);
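Hive assigns each row to a bucket by hashing the clustering column modulo the bucket count, and `tablesample(bucket x out of y)` reads every y-th bucket starting from bucket x (1-based). A Python sketch of those semantics (crc32 stands in for Hive's per-type hash, which differs in detail):

```python
import zlib

def bucket_of(value, num_buckets):
    # Stand-in for Hive's hash function: hash(column) mod bucket count
    return zlib.crc32(str(value).encode()) % num_buckets

def tablesample(rows, num_buckets, x, y):
    """bucket x out of y on the name column (rows are (id, name, country)).
    Hive requires x <= y; buckets x-1, x-1+y, x-1+2y, ... (0-based) are read."""
    if x > y:
        raise ValueError(f"bucket {x} out of {y}: numerator exceeds denominator")
    wanted = set(range(x - 1, num_buckets, y))
    return [r for r in rows if bucket_of(r[1], num_buckets) in wanted]

rows = [(1, "amy", "cn"), (2, "sam", "us"), (3, "alex", "en"),
        (4, "bob", "cn"), (5, "helen", "cn"), (6, "grace", "en"),
        (7, "jack", "jp")]

# bucket 1 out of 1 selects every bucket, i.e. the whole table
print(len(tablesample(rows, 6, 1, 1)))   # 7
```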
Join operations
Source files
order.txt
1001 20170710 4 2
1002 20170710 3 100
1083 20170710 2 40
1004 20170711 2 23
1005 20170823 4 55
1005 20170823 4 55
1006 20170824 3 20
1007 20170825 2 3
1009 20170912 2 10
1010 26170913 2 2
1011 20170914 3 14
1612 20170915 3 18
product.txt
1 chuizi 3999
2 huawei 3999
3 xiaomi 2999
4 apple 5999
Create the tables (the /orders and /products directories exist under the HDFS root)
create external table orders(oid int,time string,pid int,num int) row format delimited fields terminated by ' ' location '/orders';
create external table products(pid int,name string,price double) row format delimited fields terminated by ' ' location '/products';
hive> select o.oid,o.num * p.price from orders o join products p on o.pid=p.pid; (inner join)
hive> select o.oid,o.num * p.price from orders o inner join products p on o.pid=p.pid; (inner join)
1001 11998.0
1002 299900.0
1083 159960.0
1004 91977.0
1005 329945.0
1005 329945.0
1006 59980.0
1007 11997.0
1009 39990.0
1010 7998.0
1011 41986.0
1612 53982.0
hive> select o.oid,o.num * p.price from orders o left join products p on o.pid=p.pid;
1001 11998.0
1002 299900.0
1083 159960.0
1004 91977.0
1005 329945.0
1005 329945.0
1006 59980.0
1007 11997.0
1009 39990.0
1010 7998.0
1011 41986.0
1612 53982.0
hive> select o.oid,o.num * p.price from orders o right join products p on o.pid=p.pid;
NULL NULL
1083 159960.0
1004 91977.0
1007 11997.0
1009 39990.0
1010 7998.0
1002 299900.0
1006 59980.0
1011 41986.0
1612 53982.0
1001 11998.0
1005 329945.0
1005 329945.0
Time taken: 69.597 seconds, Fetched: 13 row(s)
hive> select * from orders o left semi join products p on o.pid = p.pid;
1001 20170710 4 2
1002 20170710 3 100
1083 20170710 2 40
1004 20170711 2 23
1005 20170823 4 55
1005 20170823 4 55
1006 20170824 3 20
1007 20170825 2 3
1009 20170912 2 10
1010 26170913 2 2
1011 20170914 3 14
1612 20170915 3 18
hive> select * from products p left semi join orders o on o.pid = p.pid;
2 huawei 3999.0
3 xiaomi 2999.0
4 apple 5999.0
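The inner, right, and left semi join behaviors above can be sketched in Python with the two tables as in-memory collections (a subset of the order.txt rows and the product.txt prices; illustrative only, not how Hive executes joins):

```python
# Order rows as (oid, pid, num); products keyed by pid -> (name, price)
orders = [(1001, 4, 2), (1002, 3, 100), (1083, 2, 40), (1004, 2, 23)]
products = {1: ("chuizi", 3999.0), 2: ("huawei", 3999.0),
            3: ("xiaomi", 2999.0), 4: ("apple", 5999.0)}

# Inner join: only orders whose pid exists in products
inner = [(oid, num * products[pid][1])
         for oid, pid, num in orders if pid in products]

# Right join: every product appears; products with no order yield NULLs
matched = {pid for _, pid, _ in orders}
right = inner + [(None, None) for pid in products if pid not in matched]

# Left semi join: left-table rows with at least one match, left columns only
semi = [o for o in orders if o[1] in products]

print(inner)   # [(1001, 11998.0), (1002, 299900.0), (1083, 159960.0), (1004, 91977.0)]
```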
Hive primitive types
| Hive type | Java type |
|---|---|
| tinyint | byte |
| smallint | short |
| int | int |
| bigint | long |
| boolean | boolean |
| float | float |
| double | double |
| string | String |
| timestamp | java.sql.Timestamp |
| binary | byte[] |
Complex types
Array type: array
[root@hadoop01 hivedata]# vim num.txt
12,31,33 43,65,67
23,43,55,6 45,233,43
21,43,23,56,76 87,45,666
Create the table, load the data, and query
hive> create table num(n1 array<int>,n2 array<int>) row format delimited fields terminated by ' ' collection items terminated by ',';
OK
Time taken: 0.131 seconds
hive> load data local inpath '/home/hivedata/num.txt' into table num;
Loading data to table hivedemo.num
Table hivedemo.num stats: [numFiles=1, totalSize=64]
OK
Time taken: 0.277 seconds
hive> select * from num;
OK
[12,31,33] [43,65,67]
[23,43,55,6] [45,233,43]
[21,43,23,56,76] [87,45,666]
Time taken: 0.147 seconds, Fetched: 3 row(s)
hive>
hive> select n1[0] from num;
OK
12
23
21
Time taken: 0.392 seconds, Fetched: 3 row(s)
Map type
[root@hadoop01 hivedata]# vim map.txt
1 tom:class1
2 amy:class2
3 alex:class1
4 helle:class2
Create the table, insert data, and query
hive> create table maps (id int,info map<string,string>) row format delimited fields terminated by ' ' map keys terminated by ':';
OK
Time taken: 0.313 seconds
hive> load data local inpath '/home/hivedata/map.txt' into table maps;
Loading data to table hivedemo.maps
Table hivedemo.maps stats: [numFiles=1, totalSize=55]
OK
Time taken: 0.452 seconds
hive> select * from maps;
OK
1 {"tom":"class1"}
2 {"amy":"class2"}
3 {"alex":"class1"}
4 {"helle":"class2"}
Time taken: 0.08 seconds, Fetched: 4 row(s)
hive> select info['tom'] from maps;
OK
class1
NULL
NULL
NULL
hive> select info['tom'] from maps where info['tom'] is not null;
OK
class1
Time taken: 0.393 seconds, Fetched: 1 row(s)
Struct type
hive> create external table scores(info struct<name:string,chinese:int,math:int,english:int>) row format delimited fields terminated by ',' collection items terminated by ' ' location '/score';
OK
Time taken: 0.253 seconds
hive> select * from scores;
OK
{"name":"Bob","chinese":90,"math":64,"english":92}
{"name":"Alex","chinese":64,"math":63,"english":68}
{"name":"Grace","chinese":57,"math":86,"english":24}
{"name":"Henry","chinese":39,"math":79,"english":78}
{"name":"Adair","chinese":88,"math":82,"english":64}
{"name":"Chad","chinese":66,"math":74,"english":37}
{"name":"Colin","chinese":64,"math":86,"english":74}
{"name":"Eden","chinese":71,"math":85,"english":43}
{"name":"Grover","chinese":99,"math":86,"english":43}
Time taken: 0.187 seconds, Fetched: 9 row(s)
hive> select info.name from scores;
OK
Bob
Alex
Grace
Henry
Adair
Chad
Colin
Eden
Grover
Time taken: 0.233 seconds, Fetched: 9 row(s)
Functions
String functions
hive> select length('abc');
OK
3
hive> select reverse('abc');
OK
cba
hive> select concat('abc','def');
OK
abcdef
hive> select concat_ws('#','aaa','bbb');
OK
aaa#bbb
hive> select upper('abc');
OK
ABC
hive> select trim(' ad ff ');
OK
ad ff
hive> select regexp_replace('aaa','a','6666');
OK
666666666666
hive> select regexp_extract('abc232DFG','([a-z]+)([0-9]+)([A-Z]+)',1);
OK
abc
hive> select regexp_extract('abc232DFG','([a-z]+)([0-9]+)([A-Z]+)',0);
OK
abc232DFG
hive> select space(5);
OK
hive> select repeat('qwe',5);
OK
qweqweqweqweqwe
hive> select ascii('a');
OK
97
hive> select split('ad ff hh',' ');
OK
["ad","ff","hh"]
hive> select explode(split('ad cb dd',' '));
OK
ad
cb
dd
Word count
Sample data
hello world
hello hadoop
hello java
hello hangzhou
hangzhou hello
world good
Create the table
hive> create external table words(w string) row format delimited fields terminated by ','location '/words';
OK
hive> select * from words;
OK
hello world
hello hadoop
hello java
hello hangzhou
hangzhou hello
world good
hive> select split(w,' ') from words;
OK
["hello","world"]
["hello","hadoop"]
["hello","java"]
["hello","hangzhou"]
["hangzhou","hello"]
["world","good"]
hive> select explode(split(w,' ')) from words;
OK
hello
world
hello
hadoop
hello
java
hello
hangzhou
hangzhou
hello
world
good
hive> select ww,count(ww) from (select explode(split(w,' ')) ww from words) ws group by ww;
OK
good 1
hadoop 1
hangzhou 2
hello 5
java 1
world 2
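The explode + group by word count above is the classic flatten-then-count pattern; the same logic in Python:

```python
from collections import Counter

# The six lines of the words table shown above
lines = ["hello world", "hello hadoop", "hello java",
         "hello hangzhou", "hangzhou hello", "world good"]

# explode(split(w, ' ')) flattens each line into single words;
# group by + count(ww) then tallies each word
counts = Counter(word for line in lines for word in line.split(' '))

for word in sorted(counts):
    print(word, counts[word])
```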
Metadata
1. In Hive, database names, table names, field names, partitions, buckets, indexes, views, and similar information are metadata.
2. Hive stores its metadata in a relational database; if none is specified, it defaults to the embedded Derby database that ships with Hive (which supports only a single connection). Hive's metadata store supports only Derby and MySQL.