Hive相关笔记

最新推荐文章于 2024-10-05 21:16:09 发布

悟沉

最新推荐文章于 2024-10-05 21:16:09 发布

阅读量136

点赞数

分类专栏： hive 文章标签： hive

本文链接：https://blog.csdn.net/weixin_40858114/article/details/81285334

版权

hive 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

hive相关

https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF

数据倾斜时添加配置

set hive.groupby.skewindata=true;
set hive.map.aggr = true;

建表规范

CREATE TABLE IF NOT EXISTS 表名(XXXX
)comment'备注信息 负责人:XXX'
PARTITIONED BY (
dt string comment 'report date'
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS ORC
TBLPROPERTIES (
'orc.compress'='SNAPPY'); ---注意务必大写

UNION ALL 并行

set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=3;

随机抽取

distribute by rand()
sort by
        rand() limit 50

窗口函数

row_number() over(partition by aa.lpid, substring(aa.time, 0, 10)
        order by aa.time) as order_seq

collect_set使用详解

--求出一个数据表中在某天内首次登陆的人；可以借助collect_set来处理sql:
select count(a.id) 
from (select id,collect_set(time) as t from t_action_login where time<='20150906' group by id) as a where size(a.t)=1 and a.t[0]='20150906';

建立分区表

    create table
            if not exists dev.xxx

(           
            c1    string   comment '订单号'    ,
            c2  string  comment '父单号'
)
      partitioned by(dt string) row format delimited fields
      TERMINATED by '\t';
    ----------------------------  

set hive.exec.dynamic.partition.mode = nonstrict;      
insert overwrite table dev.xxx 
partition(dt)

 SELECT
            b.c1         ,
            b.c2  ,

    FROM table QQ

行转列

    SELECT
            pin,features_set,f
    from
            app.app_dw_risk_gooduser_creditscore_features_wide_da 
            lateral view explode(split(features_set,','))temp
            as f 
    where
            dt = sysdate( - 1) LIMIT 1000

防止数据倾斜配置

    set hive.groupby.skewindata = true;
    set hive.map.aggr = true;
    http://www.cnblogs.com/end/archive/2012/06/19/2554582.html

动态分区异常 Fatal error occurred when node tried to create too many dynamic partitions

SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.dynamic.partitions.pernode=10000;
--default = 100

并行合并小文件

set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask = true;
set hive.auto.convert.join.noconditionaltask.size = 8388608;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.dynamic.partitions.pernode=100000;
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=32;

-- 合并小文件
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task=512000000;
set hive.merge.smallfiles.avgsize=256000000;

load-file写表

    create table
            if not exists dev.dev_table

(           
            c1    string   comment 'xxx'    ,
            c2      string     comment 'xxx'    ,
            c3     int     comment 'xxx'   
)
      partitioned by(dt string) row format delimited fields
      TERMINATED by '\t';
LOAD DATA LOCAL INPATH '/home/xxxx/xxxxx/xxxx/xxx.txt' OVERWRITE INTO TABLE dev.dev_table;