HIVE总结-CSDN博客

本文链接：https://blog.csdn.net/weixin_44527643/article/details/135776873

Hadoop：
hdfs：存储
client：客户端
1. 提交任务
2. 切分文件成block块（128M）
3. 与namenode和datanode通讯
namenode: 管理元数据，保证客户端可以拿到一个完整的文件
1. 接收客户端的请求
2. 与datanode通讯
datanode: 存储数据
1. 与client通讯
2. 备份数据
snn: 辅助namenode
1. 不是顶替namenode
2. 辅助管理元数据

yarn：资源调度
	resourcemanager：
		1. 管理 
	nodemanager：
		1. 打工（负责具体任务的执行）
		
数据库与数据仓库的区别
	数据库： 存储数据直接与用户打交道，反应要快，时效性要高
	数据仓库： 存储的历史数据，用于数据分析
	
数据仓库的分层：
	ods :源数据层，存储的是直接从外面接入的数据，例如：mysql，日志数据，excel等等
	dw : 数据仓库层，经过ods层预处理之后的数据，例如：过滤、日期处理、清洗等等操作之后的结果数据
	APP：应用层，结果层，存储的是结果数据，就是从dw读取后经过统计分析得到的结果
	
Hive的特点：
	1. 适合做离线处理：底层执行的是MapReduce计算引擎，而MapReduce的中间结果要写hdfs，所以延迟较高
	2. 做数据分析，数据处理
	3. 将文件数据映射成表
	4. 数据存储在hdfs上面
	
	
数据操作：
	create database if not exists bigdata_59 ;
	create database bigdata_59 location '/bigdata/bigdata_59'; -- location指定数据库在hdfs上的目录，建议使用与数据库同名的文件夹，否则在hdfs上看不到同名的数据库目录（但数据库本身是存在的）
	desc database bigdata_59;查看数据库信息，例如所在hdfs的位置
	drop database bigdata_59 cascade; 删除数据库
	
表操作： 
	create table stu(name string,age int);
	show tables;
	insert into stu values('xiuer2',22); -- 不推荐：每条insert都会启动一次MapReduce任务，效率低且容易产生小文件
	desc stu;
	desc formatted stu;查看一个表详细信息，例如表的所在位置
	drop table stu;
	
第二种方式加载数据到表：
	第一步：create table stu2(name string,age int) row format delimited fields terminated by '\t';
	第二步：准备数据，将数据放到Linux环境下
	第三步：加载数据到hive
		 load data local inpath '/export/data/syz/stu2.txt' into table stu2;
			local: 当你的文件在Linux文件系统就加local，如果是在hdfs文件系统就不加local
			
	hdfs加载数据到hive表
		create table stu3(name string,age int) row format delimited fields terminated by '\t' ;
		load data inpath '/bigdata_59' into table stu3;
	注意：
		local本地加载数据到表里面，本地Linux文件系统的文件还存在
		hdfs加载数据到表里面，hdfs原来的文件会被移动到表所在的文件夹
		
		
回顾：
	数据库
		create database if not exists bigdata;
		create database bigdata_59 location '/bigdata_59' ;
		desc database bigdata; 
		drop database bigdata cascade;
	表 
		create table stu(name string,sex int,age int);
	插入数据，加载数据
		insert into stu values('张三',1,20);
		本地加载：Linux环境
		create table stu(name string,sex int,age int) row format delimited fields terminated by ',';
		load data local inpath '/export/data/syz/stu' into table stu;
		hdfs加载：需要将数据上传到hdfs路径
		create table stu4(name string,sex int,age int) row format delimited fields terminated by ',';
		上传到hdfs： hdfs dfs  -put stu4.txt /bigdata_59
		load data inpath '/bigdata_59' into table stu4;
	总结：内部表删除表的时候，数据和表都不见了
	
外部表： external
		create external  table stu5(name string,sex int,age int) row format delimited fields terminated by ',';
		load data local inpath '/export/data/syz/stu4.txt' into table stu5;
		
		hdfs： 
		create external table stu6(name string,sex int,age int) row format delimited fields terminated by ',';
		load data inpath '/bigdata_59/stu4.txt' into table stu6;
		
		create external table stu7(name string,sex int,age int) row format delimited fields terminated by ',';
	总结：外部表删除表的时候，只删除表（元数据），数据不会被删掉，可以保证数据不会因为误操作而被删除
	
分区表：
	一级分区表：
		create table score(sid string,cid string,score int) partitioned by (day string) row format delimited fields terminated by ',';
		load data local inpath '/export/data/hivedatas/score.txt' into table score partition (day='2020-01-01');
		insert into score partition (day='2020-01-02') values('xiuer','gui',90) ;
		insert overwrite table score partition (day='2020-01-02') values('xiuer','gui',120) ;
		
	多级分区表：
		create table score2(sid string,cid string,score int) partitioned by (year string,month string,day string) row format delimited fields terminated by ',';
		
		load data local inpath '/export/data/hivedatas/score.txt' into table score2 partition (year='2020',month='01',day='01');
		
		insert into score2 partition (year='2020',month='02',day='01') values('xiuer','gui',90) ;
		
	show partitions score2;查看分区
	alter table score2 add partition(year='2020',month='03',day='01');添加分区
	alter table score2 drop partition(year='2020',month='03',day='01');删除分区
	
	总结： 
		1. 分区表里面的数据插入的时候需要指定分区
		2. into是不断插入新的数据
		3. overwrite是先删除指定分区的数据，再写入新的数据
		4. 分区是一个文件夹
		5. 多级分区表的分区是按照一级包含二级，二级包含三级的关系生成
		
全连接（全外连接）：
	full  join 
create external table student(name string,cid int) row format delimited fields terminated by '\t';
load data local inpath '/export/data/syz/student.txt' into table student;
create table course (cid int ,cname string) row format delimited fields terminated by ',';
load data local inpath '/export/data/hivedatas/course.txt' into table course;
select * from student t1 full join course t2 on t1.cid=t2.cid;


hive的内置函数：
	round:四舍五入，保留指定的小数位数
	substr,substring：截取字符串的固定内容
		substr(c1,c2,c3):
			c1:代表输入的字符串
			c2:代表从字符串的哪个位置开始截取，位置从1开始数
			c3:截取的长度
	concat: 字符串的拼接
	concat_ws("-","",""):select concat_ws("-","bbb","3443",'423423');
	date_format('2020-1-1 1:1:1','yyyy-MM-dd HH:mm:ss');
	case when
	开窗函数
		row_number() over()
		rank() over() 
		dense_rank() over()

select * from (select *,row_number() over(partition by user_id order by pv desc) row1 from user_access ) t1 where t1.row1<=3

总结： 
	数据库
		create database if not exists bigdata_59;
		create database bigdata_59 location '/bigdata_59';不建议使用，不方便管理
		desc database bigdata_59; 
		drop database bigdata_59 cascade;
	表 
		use bigdata_59;
		create table stu(id int,name string);
		insert into stu values(1,'');
	外部表
		create external table stu1(id int,name string) row format delimited fields terminated by ',';
		load data local inpath '/export/data/syz/stu.txt' into table stu1;
		hdfs dfs -put stu.txt /bigdata_59/ 
		load data  inpath '/bigdata_59//stu.txt' into table stu1;
	内部表 
		create  table stu2(id int,name string) row format delimited fields terminated by ',';
		load data local inpath '/export/data/syz/stu.txt' into table stu2;
		hdfs dfs -put stu.txt /bigdata_59/ 
		load data  inpath '/bigdata_59//stu.txt' into table stu2;
		总结： 
			删除内部表的时候，数据会被删除掉
			删除外部表的时候，数据不会被删除掉
	
	分区表
		一级分区 
			create table stu3(id int,name string) partitioned by (day string) row format delimited fields terminated by ',';
			insert into stu3 partition (day='2020-10-01') values(1,'xiuer');
			load data inpath '/bigdata_59/' into table stu3 partition (day='2020-01-01');
		多级分区 
		create table stu4(id int,name string) partitioned by (year string,month string,day string) row format delimited fields terminated by ',';
		insert overwrite table stu4 partition (year='2020' , month='10',day='01') values(1,'xiuer');
		show partitions stu4;
		alter table stu4 add partition(year='2020',month='12',day='12');
		alter table stu4 drop partition(year='2020',month='12',day='12');
	zeppelin操作hive 
	查询 
	内置函数
		round : 四舍五入，保留指定的小数位数 
		substring,substr：截取字符串 
		concat:拼接字符串
		concat_ws:拼接字符串并且指定分隔符 concat_ws('-','a','b','c')
		year 
		month 
		day 
		hour 
		date_format:日期格式化函数 
		case when 
		窗口函数
			row_number() 
			rank() 
			dense_rank()
		
		
		
		自定义udf函数：面试必问