Hive / Spark configuration notes

CREATE TABLE org_userbehavior_all_yunzhi
(
user_id Int
,event_time bigint
,behivior_id SMALLINT
,behivior_name String
,behivior_pop String
,record_date String
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
location '/user/hive/warehouse/org_userbehavior_all_yunzhi';
LOAD DATA INPATH '/sparklib/data' OVERWRITE INTO TABLE org_userbehavior_all_yunzhi;


CREATE TABLE org_userbehavior_all_yunzhi_parquet
(
user_id Int
,event_time Int
,behivior_pop String
)
partitioned by (record_date String,behivior_id SMALLINT )
STORED AS PARQUET
location '/user/hive/warehouse/org_userbehavior_all_yunzhi_parquet';

INSERT OVERWRITE TABLE org_userbehavior_funnel_dd PARTITION(record_date,behivior_id) SELECT user_id, cast (event_time/1000 as Int) as event_time ,behivior_pop,record_date,behivior_id from org_userbehavior_all;
-- total size 4538072102 bytes: /user/hive/warehouse/org_userbehavior_funnel_dd
-- hadoop fs -ls -R /user/hive/warehouse/org_userbehavior_funnel_dd | wc -l  => 3599 files

-- Alternative layout for the same table: behivior_id kept as a regular column, partitioned by record_date only (see the TODO note below on compression)
CREATE TABLE org_userbehavior_all_yunzhi_parquet
(
user_id Int
,event_time Int
,behivior_pop String
,behivior_id SMALLINT
)
partitioned by (record_date String )
STORED AS PARQUET
location '/user/hive/warehouse/org_userbehavior_all_yunzhi_parquet';

set hive.exec.dynamic.partition.mode=nonstrict;
set mapred.max.split.size=1024000000;
set mapred.min.split.size.per.node=1024000000;
set mapred.min.split.size.per.rack=1024000000;
set mapred.reduce.tasks=30;
set hive.exec.reducers.bytes.per.reducer=1073741824;
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
set hive.exec.reducers.bytes.per.reducer=50000000;  -- overrides the 1073741824 value set above


set mapred.max.split.size=100000000;
set mapred.min.split.size.per.node=200000000;
set mapred.min.split.size.per.rack=200000000;
set mapred.reduce.tasks=100;


set spark.sql.map.partitions=100;
set spark.sql.shuffle.partitions=40;
set spark.sql.reduce.partitions=10;


org.apache.hadoop.io.compress.GzipCodec
org.apache.hadoop.io.compress.DefaultCodec
org.apache.hadoop.io.compress.BZip2Codec
org.apache.hadoop.io.compress.SnappyCodec

set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
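Note that mapred.output.compression.codec applies to text/sequence-file output; for the Parquet tables above, the codec is normally chosen through the Parquet-specific property instead. A minimal sketch (SNAPPY and GZIP are the usual values):

-- session-wide, picked up by subsequent inserts into Parquet tables
set parquet.compression=SNAPPY;
-- or per table, as a table property
ALTER TABLE org_userbehavior_all_yunzhi_parquet SET TBLPROPERTIES ('parquet.compression'='SNAPPY');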


INSERT OVERWRITE TABLE org_userbehavior_all_yunzhi_parquet PARTITION(record_date) SELECT user_id, cast (event_time/1000 as Int) as event_time,behivior_pop,behivior_id,record_date from org_userbehavior_all_yunzhi;
-- total size 2908248821 bytes: /user/hive/warehouse/org_userbehavior_funnel_dd2
-- hadoop fs -ls -R /user/hive/warehouse/org_userbehavior_funnel_dd2 | wc -l  => 2960 files

-- TODO: partitioning without behivior_id gives a better compression ratio (related to dictionary encoding), but it produces far too many output files; the shuffle parallelism has to be lowered to reduce the file count.
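One common way to cut the file count without touching the shuffle settings is to distribute rows by the dynamic partition key, so each record_date is written by a single reducer. A minimal sketch using the tables above:

set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE org_userbehavior_all_yunzhi_parquet PARTITION(record_date)
SELECT user_id, cast(event_time/1000 as Int) as event_time, behivior_pop, behivior_id, record_date
FROM org_userbehavior_all_yunzhi
DISTRIBUTE BY record_date;  -- one reducer, and roughly one output file, per record_date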


-- UDFs:
-- funnel_merge   -- tuple-based merge
-- funnel_merge2  -- encodes time*10 + eventid
-- funnel_sum     -- sums the merged funnel data
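Before the funnel queries further down can run, the UDF/UDAF jar has to be registered in the session. A sketch with a hypothetical jar path and class names (the real ones depend on how the funnel UDFs are packaged):

ADD JAR /sparklib/funnel-udf.jar;                                                       -- hypothetical path
CREATE TEMPORARY FUNCTION funnelmergecount AS 'com.example.funnel.FunnelMergeCount';    -- hypothetical class
CREATE TEMPORARY FUNCTION funnelsum AS 'com.example.funnel.FunnelSum';                  -- hypothetical class
CREATE TEMPORARY FUNCTION jsoninfogetstring AS 'com.example.funnel.JsonInfoGetString';  -- hypothetical class
CREATE TEMPORARY FUNCTION jsoninfogetdouble AS 'com.example.funnel.JsonInfoGetDouble';  -- hypothetical class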


CREATE TABLE org_userbehavior_all_yunzhi_behiviorid_parquet
(
user_id Int
,event_time Int
,behivior_pop String
)
partitioned by (record_date String,behivior_id SMALLINT)
STORED AS PARQUET
location '/user/hive/warehouse/org_userbehavior_all_yunzhi_behiviorid_parquet';
INSERT OVERWRITE TABLE org_userbehavior_all_yunzhi_behiviorid_parquet PARTITION(record_date,behivior_id) SELECT user_id, cast (event_time/1000 as Int) as event_time ,behivior_pop,record_date,behivior_id from org_userbehavior_all_yunzhi;

set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
INSERT OVERWRITE TABLE org_userbehavior_all_yunzhi_parquet PARTITION(record_date) SELECT user_id, cast (event_time/1000 as Int) as event_time,behivior_pop,behivior_id,record_date from org_userbehavior_all_yunzhi;


set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task=128000000;
set hive.merge.smallfiles.avgsize=15000000;

set hive.merge.mapfiles=true;                -- merge small files at the end of a map-only job
set hive.merge.mapredfiles=true;             -- merge small files at the end of a map-reduce job
set hive.merge.size.per.task=256000000;      -- target size of the merged files
set hive.merge.smallfiles.avgsize=16000000;  -- if the average output file size is below this value, launch a separate map-reduce job to merge the files


set hive.exec.dynamic.partition.mode=nonstrict;
set mapred.max.split.size=256000000;
set mapred.min.split.size.per.node=256000000;
set mapred.min.split.size.per.rack=256000000;
set mapred.reduce.tasks=30;
set hive.exec.reducers.bytes.per.reducer=1073741824;
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
set spark.sql.shuffle.partitions=40;
set spark.sql.reduce.partitions=10;
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task=128000000;
set hive.merge.smallfiles.avgsize=50000000;




-- Funnel queries: funnelmergecount(event_id, event_time, 'funnel step list', window_in_seconds, start_timestamp)
-- 1451577600 is presumably the funnel start timestamp (2016-01-01 00:00:00 +08:00).
select funnelsum(funneldata) from ( select user_id,funnelmergecount(behivior_id,event_time,'10004,10007',1000000,1451577600) as funneldata from org_userbehavior_all_yunzhi_parquet where behivior_id in (10004,10007) and (behivior_id != 10004 or jsoninfogetstring(behivior_pop,'brand') = 'Apple') group by user_id ) temp2;
select funnelsum(funneldata) from ( select user_id,funnelmergecount(behivior_id,event_time,'10002,10003,10004,10007,10008',86400,1451577600) as funneldata from org_userbehavior_all_yunzhi_parquet where record_date in ('20170101','20170102','20170103','20170104','20170105','20170106','20170107') and behivior_id in (10002,10003,10004,10007,10008) and (behivior_id != 10003 or jsoninfogetstring(behivior_pop,'brand') = 'Apple') and (behivior_id != 10004 or jsoninfogetdouble(behivior_pop,'price') > 5000) group by user_id ) temp2;
select funnelsum(funneldata) from ( select user_id,funnelmergecount(behivior_id,event_time,'10002,10003,10004,10007,10008',86400,1451577600) as funneldata from org_userbehavior_all_yunzhi_parquet where record_date in ('20170101','20170102','20170103','20170104','20170105','20170106','20170107') and behivior_id in (10002,10003,10004,10007,10008) group by user_id ) temp2 ;
select funnelsum(funneldata) from ( select user_id,funnelmergecount(behivior_id,event_time,'10002,10003,10004,10007,10008',86400,1451577600) as funneldata from org_userbehavior_all_yunzhi_parquet where behivior_id in (10002,10003,10004,10007,10008) group by user_id ) temp2 ;


CREATE TABLE org_userbehavior_all_yunzhi_parquet_userslice
(
user_id Int
,event_time Int
,behivior_pop String
,behivior_id SMALLINT
)
partitioned by (record_date String,user_id_slice Int )
STORED AS PARQUET
location '/user/hive/warehouse/org_userbehavior_all_yunzhi_parquet_userslice';


How do we turn the per-day output into a date range instead of processing it one day at a time? Partition the utdid (user id) by hash.


set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE org_userbehavior_all_yunzhi_parquet_userslice PARTITION(record_date,user_id_slice) SELECT user_id, cast (event_time/1000 as Int) as event_time,behivior_pop,behivior_id,record_date,user_id%8 as user_id_slice from org_userbehavior_all_yunzhi where record_date = '20170101';
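If user_id can be negative, or the slicing should use a real hash of the ID rather than the raw value, pmod over hash keeps the slice in 0..7. A minimal sketch of the same insert:

set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE org_userbehavior_all_yunzhi_parquet_userslice PARTITION(record_date,user_id_slice)
SELECT user_id, cast(event_time/1000 as Int) as event_time, behivior_pop, behivior_id, record_date,
       pmod(hash(user_id), 8) as user_id_slice  -- hash-based slice, always in 0..7
FROM org_userbehavior_all_yunzhi
WHERE record_date = '20170101';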


Improving small-output-file merging (adds automatic merging of result files)

When spark.sql.shuffle.partitions is set high but the result data set is small, a large number of small files is produced (the file count equals spark.sql.shuffle.partitions).
Fix:
Add a repartition transformation at the end of the execution plan; a parameter controls the final number of partitions without changing the number of shuffle partitions.
Usage:
set spark.sql.result.partitions=10;

Source: Albert陈凯, 简书 (Jianshu), http://www.jianshu.com/p/00328171b8a6. Copyright belongs to the author; non-commercial reuse should credit the source, commercial reuse requires the author's permission.
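In context, the parameter would be set right before the final insert. Note that spark.sql.result.partitions only exists on a build carrying the patch described above, so this is a sketch under that assumption:

set hive.exec.dynamic.partition.mode=nonstrict;
set spark.sql.shuffle.partitions=200;  -- keep the shuffle parallelism high
set spark.sql.result.partitions=10;    -- but write only ~10 result files (patched builds only)
INSERT OVERWRITE TABLE org_userbehavior_all_yunzhi_parquet PARTITION(record_date)
SELECT user_id, cast(event_time/1000 as Int) as event_time, behivior_pop, behivior_id, record_date
FROM org_userbehavior_all_yunzhi;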

Configuration properties
set hive.merge.sparkfiles=true;
set hive.merge.smallfiles.avgsize=50000000;


set spark.sql.shuffle.partitions=1;
set spark.sql.reduce.partitions=1;


# see HIVE-9153
mapreduce.input.fileinputformat.split.maxsize=750000000
hive.vectorized.execution.enabled=true

hive.cbo.enable=true
hive.optimize.reducededuplication.min.reducer=4
hive.optimize.reducededuplication=true
hive.orc.splits.include.file.footer=false
set hive.merge.mapfiles=true;
set hive.merge.sparkfiles=true;
set hive.merge.smallfiles.avgsize=16000000;
set hive.merge.size.per.task=256000000;
hive.merge.orcfile.stripe.level=true
hive.auto.convert.join=true
hive.auto.convert.join.noconditionaltask=true
hive.auto.convert.join.noconditionaltask.size=894435328
hive.optimize.bucketmapjoin.sortedmerge=false
hive.map.aggr.hash.percentmemory=0.5
hive.map.aggr=true
hive.optimize.sort.dynamic.partition=false
hive.stats.autogather=true
hive.stats.fetch.column.stats=true
hive.vectorized.execution.reduce.enabled=false
hive.vectorized.groupby.checkinterval=4096
hive.vectorized.groupby.flush.percent=0.1
hive.compute.query.using.stats=true
hive.limit.pushdown.memory.usage=0.4
hive.optimize.index.filter=true
hive.exec.reducers.bytes.per.reducer=67108864
hive.smbjoin.cache.rows=10000
hive.exec.orc.default.stripe.size=67108864
hive.fetch.task.conversion=more
hive.fetch.task.conversion.threshold=1073741824
hive.fetch.task.aggr=false
mapreduce.input.fileinputformat.list-status.num-threads=5
spark.kryo.referenceTracking=false
spark.kryo.classesToRegister=org.apache.hadoop.hive.ql.io.HiveKey,org.apache.hadoop.io.BytesWritable,org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.merge.mapfiles=true;
set hive.merge.sparkfiles=true;
set hive.merge.smallfiles.avgsize=128000000;
set hive.merge.size.per.task=256000000;

INSERT OVERWRITE TABLE org_userbehavior_all_yunzhi_parquet_userslice PARTITION(record_date,user_id_slice) SELECT user_id, cast (event_time/1000 as Int) as event_time,behivior_pop,behivior_id,record_date,user_id%8 as user_id_slice from org_userbehavior_all_yunzhi;