hive中的基本数据类型
重点集合类型:
create table tch(
name string,
friends array,
students map<string,int>,
address structstreet:string,city:string,email:int
)
row format delimited fields terminated by ‘,’ 字段分隔符
collection items terminated by ‘_’ 集合元素分隔符
map keys terminated by ‘:’ map的kv分隔符
lines terminated by ‘\n’; 行分隔符
类型转化
Hive中 string 类型隐转是往double转
select 1+‘1’ —>2.0
select 1+‘a’ ---->null
hive中 可以使用强转
cast(值 as type)
ddl语言
4.1 库的crud
c create
CREATE DATABASE [IF NOT EXISTS] database_name --IF NOT EXISTS 如果不存在则创建
[COMMENT database_comment] --注释
[LOCATION hdfs_path] --库所对应的hdfs地址
[WITH DBPROPERTIES (property_name=property_value, …)]; --库的属性(鸡肋) 一丁点用没有
create database if not exists db_hive
comment ‘this is my first db’
with dbproperties(‘aaa’=‘bbb’,‘111’=‘222’)
create database db_hive2
location ‘/db_hive2’
create database db_hive3
location ‘/dsadsadas’
查
desc database databasename
desc database extended databasename
show databases
删
drop database basename;
drop database basename cascade; 强制删除
改 只能修改库的属性 库的名字和注释都改不了
alter database dataname set dbproperties(‘xxx’=‘xxxx’)
4.2表的增删改查
增
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name external 额外的 外部的
hive中表分为两种 外部表和内部表(管理表)
[(col_name data_type [COMMENT col_comment], …)] – 列名 类型 列注释
[COMMENT table_comment] – 表注释
[PARTITIONED BY (col_name data_type [COMMENT col_comment], …)] --创建的表为分区表
[CLUSTERED BY (col_name, col_name, …) --创建的表为分桶表
[SORTED BY (col_name [ASC|DESC], …)] INTO num_buckets BUCKETS] --桶内排序字段以及分几个桶
[ROW FORMAT row_format
[FIELDS TERMINATED BY char] --字段分隔符 如果不写有默认值 默认值是^A ctrl+v ctrl+a
[COLLECTION ITEMS TERMINATED BY char] --集合元素符 如果不写有默认值 默认值是^B ctrl+v ctrl+b
[MAP KEYS TERMINATED BY char] --map的kv符 如果不写有默认值 默认值是^c ctrl+v ctrl+c
[LINES TERMINATED BY char] --行分隔符 默认值’\n’
]
[STORED AS file_format] 表所对应的文件存储格式 textfile txt 列存
[LOCATION hdfs_path] 表所对应的存储路径
[TBLPROPERTIES (property_name=property_value, …)] --表的属性 有大用
[AS select_statement] 根据结果集创建一张表 包含表的结构和内容 不带分隔符
[LIKE table_name] 模仿一张表 只有表结构 分隔符都一样
4.2.1 内部表(删除表时一并会删除元数据和数据) 测试 中间表
4.2.2 外部表(删除表时只会删除元数据,不会删除hdfs的数据) 几乎都是外部表
4.2.3
内外部表的查看
desc formatted tablename 去查看表的类型
内外部表的区别演示
drop table tablename
内外部表是怎么转化的
alter table tablename set tblproperties(‘EXTERNAL’=‘TRUE’/‘FALSE’)
4.2.4 根据查询结构创建一张表
create table student2 as select id,name, id as ids from student
分隔符一致
create table student3 row format delimited fields terminated by ‘\t’ as select id,name, id as ids from student
4.2.5 模仿一张表
create table student4 like student;
4.3
表的查询
show tables;
desc tablename;
desc formatted tablename; 表的详情信息
4.4
表的删除
drop table tablename;
truncate table tablename; 不能清空外部表
4.5
表的修改
表的重命名
alter table oldname rename to newname
表的中列的信息修改 所有列的类型只能从小往大改 不能从大往小改
增加
ALTER TABLE table_name ADD COLUMNS (col_name data_type [COMMENT col_comment], …)
修改
ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment]
替换
ALTER TABLE table_name replace COLUMNS (col_name data_type [COMMENT col_comment], …)
替换结果
替换之后的列数量比以前少
替换之后的列数量跟以前一样
替换之后的列数量比以前多
查询
5.1 查询语法
SELECT [ALL | DISTINCT] select_expr, select_expr, … select 后面 字段,函数,常量,表达式,子查询
FROM table_reference – 从什么表查
[WHERE where_condition] – 过滤出
[GROUP BY col_list] – 分组查询
[HAVING col_list] – 分组后过滤出
[ORDER BY col_list] – 排序 统称为hive中4个by
[CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY col_list]
]
[LIMIT number] – 限制输出的行数 翻页
join on 两张表横向拼接 宽表
union 竖向拼接 长表
5.2 sql语句的关键字执行先后次序
select distinct from join on where group by having order by limit union
from>join on>where>group by >having> select> distinct> order by >limit>union
limit 实现翻页 (page-1)*size,size
5.3 聚合函数
count sum avg max min 在进行计算时,都是忽略null值的
聚合函数在使用时,不能和普通字段一起查询,能和分组字段一起查
select * from emp having deptno=10
求 每个部门中的每个工种的平均薪资
select
deptno,job,avg(sal)
from
emp
group by
deptno,job
join
员工表和部门表中的部门编号相等,查询员工编号、员工名称和部门名称
select
*
from
emp e
join
dept d
on
e.deptno!=d.deptno
join 分为内连接和外连接
内连接
select
*
from
emp e
inner join
dept d
on
e.deptno =d.deptno
外连接 特点将其中一张表作为主表 主表需要保留所有数据
a left join b
a right join b
select
*
from
dept d
right join
emp e
on
e.deptno=d.deptno
full join 满外连接
select
*
from
dept d
full join
emp e
on
e.deptno=d.deptno
在mysql中实现满外连接的方式
select
*
from
emp e
left join
dept d
on
e.deptno=d.deptno
union all
select
*
from
emp e
right join
dept d
on
e.deptno=d.deptno
union 去重
union all 不去重
union 和 union all 哪个效率高
union all效率高 但是union 是往往需要的结果
多变联查
根据条件 求 员工名称 部门名称 位置名称
select
e.ename,
d.dname,
l.loc_name
from
emp e
join
dept d
on
e.deptno=d.deptno
join
location l
on
d.loc=l.loc
select
*
from
emp e
join
dept d
join
location l
on
e.deptno=d.deptno and d.loc=l.loc
七. 排序
order by 全局有序 无论现在有多少个reducer 都会强制置位1个
sort by 每个reducer 部分排序 设定多个reducer就会有多少个分区(只能控制分区内按什么排序,不能指定哪些数据进入到哪些分区 数据是随机进入的)
sort by 一般不单独使用 一般跟distribute by 连用
distribute by 表示 按xx分区 默认规则是hashpartitioner (key.hashcode()&Integer.MaxVauel)%reducenum
如果分区用的Int值,那这个规则可以直接计算。
如果分区用的不是int值,又想去精准控制分区 ,那只能自定义分区器
cluster by 是当 distribute by和sort by 字段相同时的简写
所以 他的实用性很低,并且不能去自定义排序的规则和方式 只能按照key正序排序
create table employee(
name string, --姓名
sex string, --性别
birthday string, --出生年月
hiredate string, --入职日期
job string, --岗位
salary double, --薪资
bonus double, --奖金
friends array, --朋友
children map<string,int> --孩子
)
八.函数
案例演示 每个月入职人数
select
name,
date_format(replace(hiredate,‘/’,‘-’),‘yyyy-MM’) hiredate
from
employee t1
select
t1.hiredate,
count(*)
from
(
select
name,
date_format(replace(hiredate,‘/’,‘-’),‘yyyy-MM’) hiredate
from
employee
)t1
group by
t1.hiredate
select
date_format(replace(hiredate,‘/’,‘-’),‘yyyy-MM’) hiredate,
count(*)
from
employee
group by
date_format(replace(hiredate,‘/’,‘-’),‘yyyy-MM’)
– 求每个人年龄(年+月)
select
name,
year(current_date)-year(replace(birthday,‘/’,‘-’)) sui,
month(current_date)-month(replace(birthday,‘/’,‘-’)) yue
from
employee
select
t1.name,
concat(if(t1.yue<0,t1.sui-1,t1.sui),‘岁’,if(t1.yue<0,12+t1.yue,t1.yue),‘月’) nianling
from
(
select
name,
year(current_date)-year(replace(birthday,‘/’,‘-’)) sui,
month(current_date)-month(replace(birthday,‘/’,‘-’)) yue
from
employee
)t1
3.按照薪资,奖金的和进行倒序排序,如果奖金为null,置位0
select
name,
salary,
bonus,
salary+nvl(bonus,0) s_b
from
employee
order by
s_b desc
6)查询每个岗位男女各多少人
select
job,
sex,
count(*)
from
employee
group by
job,sex
select
job,
count(*)
from
employee
where
sex=‘男’
group by
job
select
job,
count(*)
from
employee
where
sex=‘女’
group by
job
select
nvl(t1.job,t2.job) job,
nvl(t1.cn,0) male,
nvl(t2.cn,0) female
from
(
select
job,
count() cn
from
employee
where
sex=‘男’
group by
job
)t1
full join
(
select
job,
count() cn
from
employee
where
sex=‘女’
group by
job
)t2
on
t1.job=t2.job
select
job,
case sex when ‘男’ then 1 else 0 end male,
case sex when ‘女’ then 1 else 0 end female
from
employee t1
select
t1.job,
sum(male) male,
sum(female) female
from
(
select
job,
case sex when ‘男’ then 1 else 0 end male,
case sex when ‘女’ then 1 else 0 end female
from
employee
)t1
group by
t1.job
select
job,
sum(if(sex=‘男’,1,0)) male,
sum( if(sex=‘女’,1,0)) female
from
employee
group by
job
8.2 高级聚合
employee表 按照性别分组,求每个性别涉及到的工种
select
sex,
collect_list(job),
collect_set(job)
from
employee
group by
sex
每个月的入职人数以及姓名
select
date_format(replace(hiredate,‘/’,‘-’),‘yyyy-MM’) ,
size(collect_list(name)),
collect_list(name)
from
employee
group by
date_format(replace(hiredate,‘/’,‘-’),‘yyyy-MM’)
8.3 炸裂
侧写 给表侧写 给表炸开
侧写发生在 from 之后 Join 之前
lateral view udtf() tablename as colunm_name
select
category_name,
count(*)
from
movie_info
lateral view explode(split(category,‘,’)) tmp as category_name
group by
category_name
8.4 窗口函数
1.定义 窗口函数是一个高阶函数,hive中开源免费
2.函数有哪些
2.1 窗口函数
lag
lead
first_value
last_value
2.2 聚合函数
count
sum
avg
max
min
2.3 分析函数
rank
dense_rank
row_number
ntile
3.窗口函数的语法和含义
函数()over([partition by 列…][order by 列…][窗口子句])
含义
窗口函数是一行一行算的
函数表示计算的逻辑
over中所有内容表示的函数的计算范围
partition by 划分细窗口 每个窗口互相独立,窗口子句独立生效
order by
窗口子句
(ROWS | RANGE) BETWEEN (UNBOUNDED | [num]) PRECEDING AND ([num] PRECEDING | CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN CURRENT ROW AND (CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN [num] FOLLOWING AND (UNBOUNDED | [num]) FOLLOWING
-
需求
4.1 需求1
求购买过的总人次,并保留明细select
user_name,
order_date,
order_amount,
count(*)over(rows between unbounded preceding and unbounded following)
from
order_info4.2 需求2 窗口函数在sql什么位置执行 几乎等同于 聚合函数的计算位置 group by 之后 having之前
求购买过的总人数,不要求保留明细,需要保留人名select
user_name,
count(*)over(rows between unbounded preceding and unbounded following)
from
order_info
group by
user_name4.3 上面的需求书写时可以简化
select
user_name,
count(*)over()
from
order_info
group by
user_name窗口子句有默认值
When ORDER BY is specified with missing WINDOW clause, the WINDOW specification defaults to RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.
当有order by但是没有窗口子句 范围默认上无边界到当前行When both ORDER BY and WINDOW clauses are missing, the WINDOW specification defaults to ROW BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
当没有order by也没有窗口子句 范围默认上无边界到下无边界
4.4 求每个用户消费总金额,并保留明细
partition by 划分细窗口 每个窗口互相独立,窗口子句独立生效select
user_name,
order_date,
order_amount,
sum(order_amount)over(partition by user_name)
from
order_info4.5 求每个用户每个月消费总金额,并保留明细
select
user_name,
order_date,
order_amount,
sum(order_amount)over(partition by user_name,month(order_date))
from
order_info4.6 按照时间,求累加消费金额,并保留明细
order by的含义 窗口内部按什么排序
select
user_name,
order_date,
order_amount,
sum(order_amount)over(order by order_date )
from
order_info4.7 按照时间,求每个用户的累加消费金额,并保留明细
select
user_name,
order_date,
order_amount,
sum(order_amount)over(partition by user_name order by order_date )
from
order_info4.8 按照时间,求每个用户的上一次到当前一次消费的和
select
user_name,
order_date,
order_amount,
sum(order_amount)over(partition by user_name order by order_date rows between 1 preceding and current row)
from
order_info4.9 按照时间,求每个用户的当前一次到下一次消费的和
select
user_name,
order_date,
order_amount,
sum(order_amount)over(partition by user_name order by order_date rows between current row and 1 following)
from
order_info4.10 按照时间,求每个用户的上一次到下一次消费的和
select
user_name,
order_date,
order_amount,
sum(order_amount)over(partition by user_name order by order_date rows between 1 preceding and 1 following)
from
order_info4.11 按照时间,求每个用户的上一次和下一次消费的和
窗口函数是一行行计算的
select
user_name,
order_date,
order_amount,
sum(order_amount)over(partition by user_name order by order_date rows between 1 preceding and 1 following) - order_amount
from
order_info