工作实践-hive（建设中）

最新推荐文章于 2024-11-11 20:35:54 发布

置顶蓝色蛋饼

最新推荐文章于 2024-11-11 20:35:54 发布

阅读量115

点赞数

分类专栏： HIVE笔记文章标签：大数据 hive hql

本文链接：https://blog.csdn.net/dd215130268/article/details/115107834

版权

HIVE笔记专栏收录该内容

1 篇文章 0 订阅

订阅专栏

HIVE基础常用

- 表的操作 ddl

   --建表
create table if not exists dw.dwd_member_info
(
   member_id string,
   name string,
   age int
) stored as rcfile;
--仅仅复制表结构
create table if not exists dw.dwd_member_info_bak
 like dw.dwd_member_info;
--复制数据
drop table if exists dw.dwd_member_info_bak;
create table if not exists dw.dwd_member_info_bak
 stored as rcfile as
select * from dw.dwd_member_info;
--重命名表
alter table dw.dwd_member_info_bak rename to dw.dwd_member_info_bak2;
--删除
drop table if exists dw.dwd_member_info_bak2;
--清空表
truncate table dw.dwd_member_info;
--加字段
alter table dw.dwd_member_info add columns(sal double);
alter table dw.dwd_member_info change column sal sal int after name ;
--查看建表语句
show create table dw.dwd_member_info;
--查看表信息
desc dw.dwd_member_info; 
desc formatted dw.dwd_member_info;

--创建分区表
CREATE TABLE `dw.dwd_member_info_part`(
  `member_id` string, 
  `name` string, 
  `sal` int, 
  `age` int)
 partitioned by (statis_date string)
 stored as rcfile;
--插入分区数据
insert into dw.dwd_member_info_part partition(statis_date ='20210323') values ('20080501111','wang',20000,30);
--查看分区信息
show partitions dw.dwd_member_info_part;
desc dw.dwd_member_info_part partition(statis_date=20210323); 
desc formatted dw.dwd_member_info_part partition(statis_date=20210323);
--批量删除分区数据
alter table dw.dwd_member_info_part drop partition(statis_date>=20200101,statis_date<=20201231);

- 数据的操作 dml

--增删改查
insert into table dw.dwd_member_info values('20080501111','wang',20000,30);
insert into table dw.dwd_member_info as select * from dw.dwd_member_info_others;
insert overwrite table dw.dwd_member_info as select * from dw.dwd_member_info_others;  --覆盖数据 
--分区插入
insert into table dw.dwd_member_info_part partition(statis_date='20210323')
select * from dw.dwd_member_info ;
--动态分区插入
hive.exec.dynamic.partition =true;             --否开启动态分区功能
hive.exec.max.dynamic.partitions.pernode =100; --动态分区最大数量
set hive.exec.dynamic.partition.mode=nonstrict;--允许所有的分区字段都可以使用动态分区
insert into table dw.dwd_member_info_part partition(statis_date)
select * from dw.dwd_member_info_part;
truncate table dw.dwd_member_info;

- 其他操作

--查看数据库
show databases;  --like '*hive*'
--查看表
show tables;     --like '*member_info*'
--查看hive自带函数
show functions;  --like '*day*'
--查看函数用法
desc function data_add;
--临时表，当前会话有效
create table temporary tmp.session_table;
--分组+排序
select a.*, row_number() over(partition by a.age order by member_id) as rn
  from dw.dwd_member_info a;
--or
select a.*, row_number() over(distribute by a.age sort by member_id) as rn
  from dw.dwd_member_info a;
--随机100万
select * from dw.dwd_member_info a order by rand() limit 1000000;
--数据随机分三组
select aa.*, pmod(a.rn / 3) + 1 as group_no
  from (select a.*, row_number() over(order by member_id)
          from dw.dwd_member_info a) aa;
--解析json
--行列转换
--三种特殊类型
--mocro临时函数模式
--生成多行+序列 类似Oracle的connect by
select  posexplode(split(repeat('a',10),'a'));

HIVE任务优化

案例1：

一张几亿的会员信息(dw.dwd_member_info)，需要通过身份证提出几百万指定的数据(temp.asign_info)，结果随机100万，其中会员信息有一亿条信息缺失身份证信息，通过正常过滤方式任务会有数据倾斜，一亿身份证为空的数据在一个节点计算，计算不出结果

select a.*
  from dw.dwd_member_info a
  left join temp.asign_info b
    on a.idcard = b.idcard
 where b.idcard is null
 sort by rand()
 limit 1000000;

优化方案：
身份证为空的数据单独处理，确保集群资源分配平均

select aa.*
  from (
       --为空的单独处理
       select a.*
          from dw.dwd_member_info a
         where nvl(a.idcard, '') = ''
        union all
        --不为空的单独处理
        select a.*
          from dw.dwd_member_info a
          left join temp.asign_info b
            on a.idcard = b.idcard
         where b.idcard is null
           and nvl(a.idcard, '') <> '') aa 
sort by rand() limit 1000000;

案例2：
表空间过大。压缩方案

set mapreduce.map.output.compress=true;
insert overwrite table dw.dwd_member_info
select * from dw.dwd_member_info;