I. Partitioning
1. Static partitions (the default)
A partition is just a subdirectory under the table's directory, named after the partition value.
Purpose: it avoids full-table scans and cuts down the amount of data MapReduce has to process, which improves efficiency.
The partition column is declared when the table is created; tables are usually partitioned by date or region. Keep it to at most three directory levels, otherwise too many tasks are spawned.
A partition is essentially a column as well, so its name must not clash with a regular column's.
(1) Create a partitioned table
create external table students_pt1
(
    id bigint,
    name string,
    age int,
    gender string,
    clazz string
)
-- the partition column, pt, is declared here, outside the regular column list
PARTITIONED BY(pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
(2) Manually add or drop a partition (subdirectory)
-- add a partition (directory) named 20210904
alter table students_pt1 add partition(pt='20210904');
-- drop the partition (directory) named 20210904
alter table students_pt1 drop partition(pt='20210904');
(3) List all partitions of a table
-- the information comes from the metastore
show partitions students_pt1;
(4) Manually insert data into a partition (subdirectory)
Missing partitions are created automatically; with insert, the partition value also shows up as an extra column when the table is queried.
insert into table students_pt1 partition(pt='20210902')
select * from students;
load data local inpath '/usr/local/soft/data/students.txt'
into table students_pt1 partition(pt='20210902');
(5) Query partition data
The partition column is used just like an ordinary column:
select count(*) from students_pt1 where pt='20210101';
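To confirm that such a filter really prunes down to a single partition directory instead of scanning the whole table, you can inspect the query plan (a quick check; the plan's exact layout varies across Hive versions):
explain select count(*) from students_pt1 where pt='20210101';
The input paths listed in the plan should mention only the pt=20210101 subdirectory.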
2. Dynamic partitions
Hive can recognize the partition column inside the data itself,
create the partitions (subdirectories) automatically, and route the rows into them.
The data must be loaded with insert (load data does not trigger dynamic partitioning).
(1) Enable dynamic partitioning
-- turn dynamic partitioning on
set hive.exec.dynamic.partition=true;
-- partition mode: strict (requires at least one static partition) or nonstrict
set hive.exec.dynamic.partition.mode=nonstrict;
-- maximum number of dynamic partitions per node
set hive.exec.max.dynamic.partitions.pernode=1000;
(2) Create a staging table for the raw data, then a partitioned table to populate from it
create table students_dt
(
    id bigint,
    name string,
    age int,
    gender string,
    clazz string,
    dt string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
create table students_dt_p
(
    id bigint,
    name string,
    age int,
    gender string,
    clazz string
)
PARTITIONED BY(dt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
(3) Insert data with dynamic partitioning
-- load the raw data
load data local inpath '/usr/local/data/students_dt.txt'
into table students_dt;
-- Hive binds the dynamic partition column strictly by position: it always takes the last column(s) of the SELECT list
insert into table students_dt_p partition(dt)
select id,name,age,gender,clazz,dt from students_dt;
-- e.g. the following statement partitions by age (the last column), not by the dt column of students_dt
insert into table students_dt_p partition(dt) select
id,name,age,gender,dt,age from students_dt;
3. Multi-level partitions (nested subdirectories)
create table students_year_month_pt
(
    id bigint,
    name string,
    age int,
    gender string,
    clazz string
)
PARTITIONED BY(year string,month string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
insert into table students_year_month_pt partition(year,month)
select id,name,age,gender,clazz,year,month from students_year_month;
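Multi-level partitions are queried like single-level ones; filtering on both levels prunes down to a single subdirectory. A minimal sketch against the table above (the year/month values here are placeholders):
show partitions students_year_month_pt;
select count(*) from students_year_month_pt where year='2021' and month='09';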
II. Bucketing
Partitioning stores data in subdirectories; bucketing splits the data into sub-files keyed on an ordinary column.
Under the hood, the reduce tasks of the MapReduce job receive rows by key (the hash of the bucketing column modulo the number of buckets), each writing one bucket file.
So some buckets may end up with far more data than others, or with none at all; a perfectly even split is not guaranteed.
1. Enable bucketing
Bucketing is off by default in Hive:
set hive.enforce.bucketing=true;
2. Create a bucketed table
An ordinary column serves as the bucketing key:
create table students_buck
(
    id bigint,
    name string,
    age int,
    gender string,
    clazz string
)
CLUSTERED BY (clazz) into 12 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
3. Load data into the buckets
insert must be used (load data bypasses the bucketing logic):
insert into students_buck select * from students;
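One practical payoff of bucketing is cheap sampling: TABLESAMPLE can read a single bucket instead of the whole table. A minimal sketch against the table above:
select * from students_buck tablesample(bucket 1 out of 12 on clazz);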
III. Hive JDBC
Connecting to Hive from Java.
The query still runs on Hadoop under the hood; on the Java side it is ordinary JDBC, so the steps are the familiar ones.
1. Start the HiveServer2 service on Linux first
hiveserver2 &
package day46;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

public class HiveTest {
    public static void main(String[] args) throws Exception {
        // register the Hive JDBC driver
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // the database name (here fghdata) must be part of the URL
        Connection conn = DriverManager.getConnection(
                "jdbc:hive2://master:10000/fghdata");
        String sql = "select * from students";
        PreparedStatement ps = conn.prepareStatement(sql);
        ResultSet rs = ps.executeQuery();
        while (rs.next()) {
            String name = rs.getString("name");
            System.out.println(name);
        }
        // release resources in reverse order of creation
        rs.close();
        ps.close();
        conn.close();
    }
}
IV. Data types
1. Primitive types
Integers: TINYINT, SMALLINT, INT, BIGINT
  (corresponding to Java's byte, short, int, long)
Floating point: FLOAT, DOUBLE
Boolean: BOOLEAN (true/false)
String: STRING
2. Date and time types
Timestamp: timestamp
Date: date
(1) Convert a Unix timestamp to a formatted time string
select from_unixtime(1630915221,'yyyy年MM月dd日 HH时mm分ss秒');
(2) Convert a time string to a Unix timestamp
-- every literal character in the string must match the pattern exactly, including the Chinese text
select unix_timestamp('2021年09月06日 16时00分21秒'
,'yyyy年MM月dd日 HH时mm分ss秒');
select unix_timestamp('2021-01-14 14:24:57.200');
The two functions can be chained to reformat a time string:
from_unixtime(unix_timestamp('2021-01-14 14:24:57.200','yyyy-MM-dd HH:mm:ss'),'yyyy-MM-dd HH:mm:ss') as end_date
(3) Date differences
datediff only works at day granularity; it cannot return hours or finer.
select datediff('2018-09-02','2018-09-01');
For hours and smaller units, convert both sides to Unix timestamps and divide the difference in seconds by 3600 (for hours), as shown below.
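A minimal sketch of the hour-level difference (the two time literals are placeholders):
select (unix_timestamp('2021-01-14 18:24:57') - unix_timestamp('2021-01-14 14:24:57')) / 3600;
-- returns 4.0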
3. Complex types
(1) array
Define a string array whose elements are separated by ',' (array is a reserved word, so the table is named testarray):
create table testarray(
    name string,
    weight array<string>
) row format delimited fields terminated by ' '
COLLECTION ITEMS terminated by ',';
Sample input data:
zs 110,120,130
ls 220,240,290
Query:
select name,weight[0] from testarray;
(2) map
map is likewise reserved, so the table is named testmap:
create table testmap(
    name string,
    score map<string,int>
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';
Sample input data:
zs 语文:110,数学:120,英语:130
ls 语文:120,数学:110,英语:120
Query:
select name,score['语文'] from testmap;
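To pull out all keys or values instead of a single lookup, Hive also provides map_keys and map_values (shown against the table above):
select name, map_keys(score), map_values(score) from testmap;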
(3) struct
struct is reserved as well, so the table is named teststruct:
create table teststruct(
    name string,
    state struct<level:int,atk:int,hp:int>
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
COLLECTION ITEMS TERMINATED BY ',';
Sample input data:
ralts 5,9,15
piplup 5,10,18
Query:
select name,state.level,state.atk from teststruct;
4. Row/column conversion
(1) One row to many rows (explode)
create table testarray(
    name string,
    weight array<string>
) row format delimited fields terminated by ' '
collection items terminated by ',';
The data, all in one file:
a "150","170","180"
b "150","180","190"
-- load it
load data local inpath '/usr/local/data/test' into table testarray;
Lateral view
lateral view behaves like a small Cartesian product (a join): each element exploded out of [150,170,180] is matched back to its source row, i.e. a.
select name,col1 from testarray
lateral view explode(weight) t1 as col1;
a 150
a 170
a 180
b 150
b 180
b 190
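If the element's position inside the array also matters, posexplode is a small variation that adds an index column alongside the value:
select name,pos,col1 from testarray
lateral view posexplode(weight) t1 as pos,col1;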
(2) Many rows to one row (collect_list)
create table testarray2(
    name string,
    num int
) row format delimited
fields terminated by ' ';
a 150
a 120
a 110
b 140
b 110
b 180
group by sends rows with the same key to the same reduce task in MapReduce, and collect_list gathers each group's values into an array:
select name,collect_list(num)
from testarray2 group by name;
a [150,120,110]
b [140,110,180]
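To produce a single delimited string instead of an array, wrap the elements in concat_ws; num is an int, so it has to be cast to string first (element order inside the list is not guaranteed):
select name,concat_ws(',',collect_list(cast(num as string)))
from testarray2 group by name;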
V. Window functions
Aggregate queries (group by etc.) collapse each group into a single row,
which often discards detail rows we still need.
Window functions solve this: they apply a function over a group while keeping every original row.
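A quick illustration using the testarray2 table from the previous section: the window aggregate attaches the per-name total to every row instead of collapsing the group, so each of a's three rows keeps its own num while sharing total = 380 (b's rows share 430):
select name,num,sum(num) over(partition by name) as total
from testarray2;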
1. Longest streak of consecutive login days
Create the table:
create table login
(
    id string,
    day string
) row format delimited fields terminated by ',';
Data (do not quote the values; quotes break the date arithmetic):
001,2020-04-20
001,2020-04-21
001,2020-04-25
001,2020-04-26
001,2020-05-10
001,2020-05-11
001,2020-05-12
001,2020-05-30
001,2020-06-17
001,2020-06-25
002,2020-10-20
002,2020-10-21
002,2020-10-22
002,2020-10-23
002,2020-11-10
002,2020-11-11
002,2020-11-17
002,2020-11-30
002,2020-12-17
002,2020-12-25
003,2020-10-01
003,2020-10-02
003,2020-10-11
003,2020-10-31
003,2020-11-01
003,2020-11-02
003,2020-11-03
003,2020-11-04
003,2020-12-17
003,2020-12-18
(1) Attach a row number to each user's logins, ordered by date
select id,day,row_number() over(partition by id order by day) as row1
from login;
(2) Subtract the row number from the date: within a consecutive run the result is the same constant date, which is the core of this computation
select id,day,row1,date_sub(day,row1) as origin
from
(select id,day,row_number() over(partition by id order by day) as row1
from login) as t1;
(3) Group by id and that constant date, counting the rows in each run
select id,count(origin) as continuity
from
(select id,day,row1,date_sub(day,row1) as origin
from
(select
id,day,row_number() over(partition by id order by day) as row1
from login)as t1
)as t2
group by id,origin;
(4) Take each user's maximum count, i.e. the longest consecutive-login streak
select id,max(continuity)
from
(select id,count(origin) as continuity
from
(select id,day,row1,date_sub(day,row1) as origin
from
(select
id,day,row_number() over(partition by id order by day) as row1
from login) as t1
) as t2
group by id,origin
)as t3
group by id;
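With the sample data above, the result should be 001 -> 3, 002 -> 4, 003 -> 5; note that user 003's longest run, 2020-10-31 through 2020-11-04, spans a month boundary, which the date_sub arithmetic handles correctly.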
2. Top N
(1) Join the students and score tables to get each student's total score
select students.*,sum from students
inner join(
select students.id,sum(score) as sum from students
inner join score
on students.id=score.id
group by students.id) as t1
on students.id=t1.id;
(2) Add a ranking column with row_number() (note the desc: rank 1 should be the highest total)
select
students.*,sum,row_number() over(partition by clazz order by sum desc) as rank
from students
inner join(
select students.id,sum(score) as sum from students
inner join score
on students.id=score.id
group by students.id) as t1
on students.id=t1.id;
(3) A window-function result cannot be filtered directly with having; wrap the query one level deeper and filter in the outer where
select * from(
select
students.*,sum,row_number() over(partition by clazz order by sum desc) as rank
from students
inner join(
select students.id,sum(score) as sum from students
inner join score
on students.id=score.id
group by students.id) as t1
on students.id=t1.id) as t2
where rank<=3;
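row_number() gives tied totals distinct ranks, so exactly three rows per class survive. If ties should share a rank, only the window call changes, e.g. swap in rank() or dense_rank():
dense_rank() over(partition by clazz order by sum desc) as rank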
Summary
1. To check a background service:
ps aux | grep hive (likewise for redis, etc.)
2. If a dependency will not import, quit and reopen the IDE, then refresh the project.