hive相关
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
数据倾斜时添加配置
set hive.groupby.skewindata=true; set hive.map.aggr = true;
建表规范
CREATE TABLE IF NOT EXISTS 表名(XXXX )comment'备注信息 负责人:XXX' PARTITIONED BY ( dt string comment 'report date' ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS ORC TBLPROPERTIES ( 'orc.compress'='SNAPPY'); ---注意务必大写
UNION ALL 并行
set hive.exec.parallel=true; set hive.exec.parallel.thread.number=3;
随机抽取
distribute by rand() sort by rand() limit 50
窗口函数
row_number() over(partition by aa.lpid, substring(aa.time, 0, 10) order by aa.time) as order_seq
collect_set使用详解
--求出一个数据表中在某天内首次登陆的人;可以借助collect_set来处理sql:
select count(a.id)
from (select id,collect_set(time) as t from t_action_login where time<='20150906' group by id) as a where size(a.t)=1 and a.t[0]='20150906';
- 建立分区表
create table
if not exists dev.xxx
(
c1 string comment '订单号' ,
c2 string comment '父单号'
)
partitioned by(dt string) row format delimited fields
TERMINATED by '\t';
----------------------------
set hive.exec.dynamic.partition.mode = nonstrict;
insert overwrite table dev.xxx
partition(dt)
SELECT
b.c1 ,
b.c2 ,
FROM table QQ
- 行转列
SELECT
pin,features_set,f
from
app.app_dw_risk_gooduser_creditscore_features_wide_da
lateral view explode(split(features_set,','))temp
as f
where
dt = sysdate( - 1) LIMIT 1000
- 防止数据倾斜配置
set hive.groupby.skewindata = true;
set hive.map.aggr = true;
http://www.cnblogs.com/end/archive/2012/06/19/2554582.html
- 动态分区异常 Fatal error occurred when node tried to create too many dynamic partitions
SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.dynamic.partitions.pernode=10000;
--default = 100
- 并行 合并小文件
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask = true;
set hive.auto.convert.join.noconditionaltask.size = 8388608;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.dynamic.partitions.pernode=100000;
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=32;
-- 合并小文件
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task=512000000;
set hive.merge.smallfiles.avgsize=256000000;
- load-file写表
create table
if not exists dev.dev_table
(
c1 string comment 'xxx' ,
c2 string comment 'xxx' ,
c3 int comment 'xxx'
)
partitioned by(dt string) row format delimited fields
TERMINATED by '\t';
LOAD DATA LOCAL INPATH '/home/xxxx/xxxxx/xxxx/xxx.txt' OVERWRITE INTO TABLE dev.dev_table;