Flume to HDFS: LZO compression support


-- References:
--   http://wzktravel.github.io/2015/12/11/flume-ng/
--   https://blog.csdn.net/sw562810770/article/details/79611318
--   https://www.quora.com/Whats-the-difference-between-the-LzoCodec-and-the-LzopCodec-in-Hadoop-LZO

------------------------------------------------------------------------------------------------------------
-- ###################################################################################################### --

-- Flume version: 1.8.0

-- If you run into problems during setup, feel free to join the QQ group for discussion: 661945126

cd /usr/local/flume/

vi gs_serveraccess_flume-client.conf
------------------------------------------------------------------------------------------------------------
#agent name
a1.sources = r1
a1.sinks = k1
a1.channels = c1

#--------------------------------------------------------------------------------------
#Setting Channel: c1
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000000
a1.channels.c1.transactionCapacity=10000
a1.channels.c1.keep-alive=3

#--------------------------------------------------------------------------------------
#Setting Source: r1 (nginx access log; only r1 is used in this config)
a1.sources.r1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /data/nginx/access.log
# fileHeader / deserializer below are spooldir-style settings; the exec source ignores them
a1.sources.r1.fileHeader = true
a1.sources.r1.deserializer = org.apache.flume.sink.solr.morphline.BlobDeserializer$Builder
a1.sources.r1.batchSize = 500
a1.sources.r1.interceptors = i1 i2 i3
a1.sources.r1.interceptors.i1.type = static
a1.sources.r1.interceptors.i1.preserveExisting = false
a1.sources.r1.interceptors.i1.key = datacenter
a1.sources.r1.interceptors.i1.value = gs_serverac_access
a1.sources.r1.interceptors.i2.type = timestamp
a1.sources.r1.interceptors.i3.type = tv.yilan.FlumeETL.AppendIPInterceptor$Builder

#--------------------------------------------------------------------------------------
# Setting Sink
a1.sinks.k1.channel=c1
a1.sinks.k1.type=hdfs
a1.sinks.k1.hdfs.path=hdfs://emr-cluster/user/flume/serveraccess/gs_accesslog/%Y/%m/%d/%H
a1.sinks.k1.hdfs.fileType=CompressedStream
# Note: use lzop here; otherwise (e.g. with LzoCodec), a Hive external partitioned table
# mapped to this path cannot read the data (see the check sketched at the end of this block)
a1.sinks.k1.hdfs.codeC=lzop
a1.sinks.k1.hdfs.writeFormat=TEXT
a1.sinks.k1.hdfs.rollInterval=300
a1.sinks.k1.hdfs.rollSize=307200000
a1.sinks.k1.hdfs.callTimeout=120000
a1.sinks.k1.hdfs.appendTimeout=80000
a1.sinks.k1.hdfs.txnEventMax=10000
a1.sinks.k1.hdfs.idleTimeout=0
a1.sinks.k1.hdfs.rollCount=0
a1.sinks.k1.hdfs.filePrefix=accesslog_%Y%m%d%H_%[IP]
a1.sinks.k1.hdfs.fileSuffix=.lzo
a1.sinks.k1.hdfs.round=true
a1.sinks.k1.hdfs.roundValue=1
a1.sinks.k1.hdfs.roundUnit=minute
a1.sinks.k1.hdfs.useLocalTimeStamp=true
a1.sinks.k1.hdfs.threadsPoolSize=15
a1.sinks.k1.hdfs.minBlockReplicas=1
a1.sinks.k1.hdfs.batchSize=10000
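
-- Before relying on hdfs.codeC=lzop, check that the LZO codec is actually visible to both
-- Hadoop and the Flume JVM. A minimal pre-flight sketch; the paths follow the $HADOOP_HOME/lib
-- layout used further down and are assumptions for other cluster layouts:
# LzopCodec must be registered in io.compression.codecs
grep -A 3 'io.compression.codecs' $HADOOP_HOME/etc/hadoop/core-site.xml
# hadoop-lzo jar and the native libraries (libgplcompression / liblzo2)
ls $HADOOP_HOME/lib/ | grep -i hadoop-lzo
ls $HADOOP_HOME/lib/native/ | grep -iE 'gplcompression|lzo'
# if Flume does not pick the jar up from the Hadoop classpath, copy it into Flume's lib dir:
# cp $HADOOP_HOME/lib/hadoop-lzo-*.jar /usr/local/flume/lib/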

------------------------------------------------------------------------------------------------------------
-- ###################################################################################################### --

-- Start the Flume agent
./bin/flume-ng agent --conf conf --conf-file conf/gs_serveraccess_flume-client.conf \
--name a1 -Dflume.root.logger=INFO,console > gs_serveraccess_flume-client.log 2>&1 &
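
-- Once the agent is up, confirm the sink is really writing lzop output: list the target
-- directory and test one finished file (no .tmp suffix) with the lzop CLI. The file name
-- below is hypothetical; real names follow the accesslog_%Y%m%d%H_%[IP] prefix:
hdfs dfs -ls /user/flume/serveraccess/gs_accesslog/2018/07/31/21
hdfs dfs -get /user/flume/serveraccess/gs_accesslog/2018/07/31/21/accesslog_2018073121_192.168.1.10.lzo .
lzop -t accesslog_2018073121_192.168.1.10.lzo && echo "valid lzop file"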

------------------------------------------------------------------------------------------------------------
-- ###################################################################################################### --
-- Create a Hive external table for testing:
drop table if exists default.lzo_test;
CREATE EXTERNAL TABLE IF NOT EXISTS default.lzo_test (
        logcontent STRING
    )
    PARTITIONED BY (year STRING, month STRING, day STRING, hour STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS INPUTFORMAT  'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    location 'hdfs://emr-cluster/user/flume/serveraccess/gs_accesslog';

-- Add a Hive table partition:
alter table default.lzo_test add partition(year='2018',month='07',day='31',hour='21')
location 'hdfs://emr-cluster/user/flume/serveraccess/gs_accesslog/2018/07/31/21';
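
-- The HDFS layout is %Y/%m/%d/%H rather than year=.../month=.../..., so MSCK REPAIR TABLE
-- cannot discover these partitions; each one must be added explicitly. A minimal sketch that
-- registers all 24 hours of one day (assumes the hive CLI is on the PATH):
for h in $(seq -w 0 23); do
  hive -e "alter table default.lzo_test add if not exists partition(year='2018',month='07',day='31',hour='$h')
           location 'hdfs://emr-cluster/user/flume/serveraccess/gs_accesslog/2018/07/31/$h';"
done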

-- Generate LZO index files so the .lzo files become splittable and queries run faster:
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/lib/hadoop-lzo-0.4.21-SNAPSHOT.jar \
com.hadoop.compression.lzo.DistributedLzoIndexer \
/user/flume/serveraccess/gs_accesslog/2018/07/31/21;
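
-- After the indexer job finishes, each .lzo file should have a companion .lzo.index file,
-- which is what makes it splittable. (DistributedLzoIndexer runs as a MapReduce job; for a
-- single file, com.hadoop.compression.lzo.LzoIndexer does the same work locally.)
hdfs dfs -ls /user/flume/serveraccess/gs_accesslog/2018/07/31/21 | grep '\.index'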

-- Query test:
select * from default.lzo_test limit 10;
