HIVE basics: common operations
- Table operations (DDL)
--create a table
create table if not exists dw.dwd_member_info
(
member_id string,
name string,
age int
) stored as rcfile;
--copy only the table structure
create table if not exists dw.dwd_member_info_bak
like dw.dwd_member_info;
--copy structure and data
drop table if exists dw.dwd_member_info_bak;
create table if not exists dw.dwd_member_info_bak
stored as rcfile as
select * from dw.dwd_member_info;
--rename a table
alter table dw.dwd_member_info_bak rename to dw.dwd_member_info_bak2;
--drop a table
drop table if exists dw.dwd_member_info_bak2;
--truncate a table (remove all rows)
truncate table dw.dwd_member_info;
--add a column
alter table dw.dwd_member_info add columns(sal double);
--change its type and move it after name
alter table dw.dwd_member_info change column sal sal int after name;
--show the create-table statement
show create table dw.dwd_member_info;
--describe the table
desc dw.dwd_member_info;
desc formatted dw.dwd_member_info;
--create a partitioned table
create table dw.dwd_member_info_part(
member_id string,
name string,
sal int,
age int)
partitioned by (statis_date string)
stored as rcfile;
--insert one row into a partition
insert into dw.dwd_member_info_part partition(statis_date ='20210323') values ('20080501111','wang',20000,30);
--show partition information
show partitions dw.dwd_member_info_part;
desc dw.dwd_member_info_part partition(statis_date='20210323');
desc formatted dw.dwd_member_info_part partition(statis_date='20210323');
--drop a range of partitions in one statement
alter table dw.dwd_member_info_part drop partition(statis_date>='20200101', statis_date<='20201231');
- Data operations (DML)
--insert / overwrite / truncate
insert into table dw.dwd_member_info values('20080501111','wang',20000,30);
insert into table dw.dwd_member_info select * from dw.dwd_member_info_others;
insert overwrite table dw.dwd_member_info select * from dw.dwd_member_info_others; --overwrite the existing data
--insert into a static partition
insert into table dw.dwd_member_info_part partition(statis_date='20210323')
select * from dw.dwd_member_info ;
--dynamic partition insert
set hive.exec.dynamic.partition=true; --enable dynamic partitioning
set hive.exec.max.dynamic.partitions.pernode=100; --max dynamic partitions created per node
set hive.exec.dynamic.partition.mode=nonstrict; --allow every partition column to be dynamic
insert into table dw.dwd_member_info_part partition(statis_date)
select * from dw.dwd_member_info_part;
--remove all rows (non-transactional tables have no row-level delete)
truncate table dw.dwd_member_info;
- Other operations
--list databases
show databases; --like '*hive*'
--list tables
show tables; --like '*member_info*'
--list built-in functions
show functions; --like '*day*'
--show a function's usage
desc function date_add;
--temporary table, visible only to the current session
create temporary table tmp.session_table like dw.dwd_member_info;
--group + sort: number rows within each group
select a.*, row_number() over(partition by a.age order by member_id) as rn
from dw.dwd_member_info a;
--or
select a.*, row_number() over(distribute by a.age sort by member_id) as rn
from dw.dwd_member_info a;
--randomly sample 1 million rows
select * from dw.dwd_member_info a order by rand() limit 1000000;
--randomly split the data into three groups
select aa.*, pmod(aa.rn, 3) + 1 as group_no
from (select a.*, row_number() over(order by rand()) as rn
from dw.dwd_member_info a) aa;
--parse JSON
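A minimal sketch of the two usual approaches, get_json_object and json_tuple; the JSON literal is made up for illustration.
--extract a single field by JSONPath
select get_json_object('{"name":"wang","age":30}', '$.name');
--extract several fields in one pass
select t.name, t.age
from (select '{"name":"wang","age":30}' as json_str) a
lateral view json_tuple(a.json_str, 'name', 'age') t as name, age;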
--row/column conversion
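A sketch of both directions; the first query reuses dw.dwd_member_info, while the hobbies string in the second is a made-up example.
--rows to columns: collapse the names of each age into one string
select age, concat_ws(',', collect_list(name)) as names
from dw.dwd_member_info
group by age;
--columns to rows: explode a delimited string into one row per element
select a.member_id, t.hobby
from (select '20080501111' as member_id, 'ball,music,movie' as hobbies) a
lateral view explode(split(a.hobbies, ',')) t as hobby;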
--the three complex types
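Presumably this refers to Hive's complex types array, map and struct; a sketch with literal values:
select arr[1] as second_elem, --array: access by 0-based index
mp['k1'] as k1_value, --map: access by key
st.city as city --struct: access by field name
from (select array('a','b','c') as arr,
map('k1','v1','k2','v2') as mp,
named_struct('city','beijing','zip','100000') as st) a;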
--macro: session-level temporary functions
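A sketch of a session-scoped temporary macro; nvl_empty is a made-up name:
create temporary macro nvl_empty(s string) if(s is null, '', s);
select nvl_empty(name) from dw.dwd_member_info limit 10;
drop temporary macro if exists nvl_empty;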
--generate multiple rows / a number sequence, similar to Oracle's connect by
select posexplode(split(repeat('a',10),'a'));
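pos in the result above is a 0-10 sequence; a sketch of turning it into a date range (the start date is arbitrary):
--the 11 consecutive dates starting from 2021-03-01
select date_add('2021-03-01', t.pos) as dt
from (select split(repeat('a',10),'a') as arr) a
lateral view posexplode(a.arr) t as pos, val;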
HIVE job tuning
- Case 1:
A member table with several hundred million rows (dw.dwd_member_info) needs a few million specified records (temp.asign_info) excluded by ID card number, and then 1 million rows sampled at random from the result. About 100 million of the members are missing the ID card, so the straightforward anti-join below skews badly: the 100 million rows with an empty idcard all hash to the same join key, pile up on a single node, and the job never finishes.
select a.*
from dw.dwd_member_info a
left join temp.asign_info b
on a.idcard = b.idcard
where b.idcard is null
sort by rand()
limit 1000000;
Optimization:
Handle the rows with an empty ID card separately so the work is spread evenly across the cluster.
select aa.*
from (
--rows with an empty idcard need no join, handle them separately
select a.*
from dw.dwd_member_info a
where nvl(a.idcard, '') = ''
union all
--anti-join only the rows with a non-empty idcard
select a.*
from dw.dwd_member_info a
left join temp.asign_info b
on a.idcard = b.idcard
where b.idcard is null
and nvl(a.idcard, '') <> '') aa
sort by rand() limit 1000000;
- Case 2:
The table takes up too much disk space; rewrite it with compression enabled.
set mapreduce.map.output.compress=true; --compress intermediate map output
set hive.exec.compress.output=true; --compress the final output files so the rewritten table actually shrinks
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
insert overwrite table dw.dwd_member_info
select * from dw.dwd_member_info;