-- 参考: http://wzktravel.github.io/2015/12/11/flume-ng/
-- https://blog.csdn.net/sw562810770/article/details/79611318
-- https://www.quora.com/Whats-the-difference-between-the-LzoCodec-and-the-LzopCodec-in-Hadoop-LZO
------------------------------------------------------------------------------------------------------------
-- ###################################################################################################### --
-- Flume 版本: 1.8.0
-- 若安装中遇到问题,欢迎加QQ群交流:661945126
cd /usr/local/flume/
-- Create the agent config under conf/: the start command loads it with
-- --conf-file conf/gs_serveraccess_flume-client.conf, relative to this directory.
vi conf/gs_serveraccess_flume-client.conf
------------------------------------------------------------------------------------------------------------
# Agent a1: declares one source (r1), one channel (c1) and one sink (k1).
a1.sources  = r1
a1.channels = c1
a1.sinks    = k1
#--------------------------------------------------------------------------------------
# Channel c1: in-memory channel that r1 writes to and k1 reads from.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000000
a1.channels.c1.transactionCapacity = 10000
a1.channels.c1.keep-alive = 3
#--------------------------------------------------------------------------------------
# Source r1: exec source that tails the nginx access log into channel c1.
# (Only r1 is defined in this file.)
a1.sources.r1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /data/nginx/access.log
# NOTE(review): fileHeader and deserializer are documented for the spooling-directory
# source, not for exec; they appear to have no effect here -- confirm before removing.
a1.sources.r1.fileHeader = true
a1.sources.r1.deserializer = org.apache.flume.sink.solr.morphline.BlobDeserializer$Builder
# Property keys are case-sensitive: the exec source reads "batchSize", so the previous
# all-lowercase "batchsize" key was never applied.
a1.sources.r1.batchSize = 500
# Interceptors run in the listed order: i1, i2, i3.
a1.sources.r1.interceptors = i1 i2 i3
# i1: sets the header datacenter=gs_serverac_access, overwriting any existing value.
a1.sources.r1.interceptors.i1.type = static
a1.sources.r1.interceptors.i1.preserveExisting = false
a1.sources.r1.interceptors.i1.key = datacenter
a1.sources.r1.interceptors.i1.value = gs_serverac_access
# i2: adds a timestamp header to each event.
a1.sources.r1.interceptors.i2.type = timestamp
# i3: custom project interceptor; its behaviour is not visible from this file.
a1.sources.r1.interceptors.i3.type = tv.yilan.FlumeETL.AppendIPInterceptor$Builder
#--------------------------------------------------------------------------------------
# Sink k1: HDFS sink that drains channel c1 into hourly directories as compressed text.
a1.sinks.k1.channel=c1
a1.sinks.k1.type=hdfs
a1.sinks.k1.hdfs.path=hdfs://emr-cluster/user/flume/serveraccess/gs_accesslog/%Y/%m/%d/%H
a1.sinks.k1.hdfs.fileType=CompressedStream
# NOTE: use lzop here; otherwise a Hive external partitioned table mapped onto this path
# cannot query the data.
# Keep this note on its own lines: the properties format has no inline comments, so any
# text after the value (including "# ...") becomes part of the codec name.
a1.sinks.k1.hdfs.codeC=lzop
a1.sinks.k1.hdfs.writeFormat=TEXT
# Roll a new file every 300 seconds or at 307200000 bytes, whichever comes first.
a1.sinks.k1.hdfs.rollInterval=300
a1.sinks.k1.hdfs.rollSize=307200000
# Timeout in milliseconds for HDFS operations.
a1.sinks.k1.hdfs.callTimeout=120000
# NOTE(review): appendTimeout and txnEventMax are not listed in the Flume 1.8 HDFS sink
# documentation -- confirm they still have an effect in this version.
a1.sinks.k1.hdfs.appendTimeout=80000
a1.sinks.k1.hdfs.txnEventMax=10000
# 0 disables closing of idle files and event-count based rolling, respectively.
a1.sinks.k1.hdfs.idleTimeout=0
a1.sinks.k1.hdfs.rollCount=0
# %[IP] expands to the IP address of the host the agent runs on.
a1.sinks.k1.hdfs.filePrefix=accesslog_%Y%m%d%H_%[IP]
a1.sinks.k1.hdfs.fileSuffix=.lzo
# Round the timestamp used in path/prefix escapes down to 1 minute.
a1.sinks.k1.hdfs.round=true
a1.sinks.k1.hdfs.roundValue=1
a1.sinks.k1.hdfs.roundUnit=minute
# Use the agent's local time, not the event's timestamp header, for the %Y/%m/%d/%H escapes.
a1.sinks.k1.hdfs.useLocalTimeStamp=true
a1.sinks.k1.hdfs.threadsPoolSize=15
a1.sinks.k1.hdfs.minBlockReplicas=1
# Number of events written to a file before it is flushed to HDFS.
a1.sinks.k1.hdfs.batchSize=10000
------------------------------------------------------------------------------------------------------------
-- ###################################################################################################### --
-- 启动 flume
./bin/flume-ng agent --conf conf --conf-file conf/gs_serveraccess_flume-client.conf \
--name a1 -Dflume.root.logger=INFO,console > gs_serveraccess_flume-client.log 2>&1 &
------------------------------------------------------------------------------------------------------------
-- ###################################################################################################### --
-- Recreate the Hive test table over the Flume output directory.
-- IF EXISTS keeps the script re-runnable when the table has not been created yet.
-- NOTE(review): with a single column and a tab field delimiter, anything after the first
-- tab in a log line is dropped from logcontent -- confirm the access log contains no tabs.
DROP TABLE IF EXISTS default.lzo_test;
CREATE EXTERNAL TABLE IF NOT EXISTS default.lzo_test (
    logcontent STRING
)
PARTITIONED BY (year STRING, month STRING, day STRING, hour STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://emr-cluster/user/flume/serveraccess/gs_accesslog';
-- Register one hourly directory as a partition of the test table.
-- IF NOT EXISTS makes the statement safe to re-run when the partition is already present.
ALTER TABLE default.lzo_test ADD IF NOT EXISTS
    PARTITION (year = '2018', month = '07', day = '31', hour = '21')
    LOCATION 'hdfs://emr-cluster/user/flume/serveraccess/gs_accesslog/2018/07/31/21';
-- Build LZO index files for the hourly directory to improve query efficiency.
${HADOOP_HOME}/bin/hadoop jar \
    ${HADOOP_HOME}/lib/hadoop-lzo-0.4.21-SNAPSHOT.jar \
    com.hadoop.compression.lzo.DistributedLzoIndexer \
    /user/flume/serveraccess/gs_accesslog/2018/07/31/21
-- Smoke-test query: read a few rows from the partition added above.
-- Columns are listed explicitly (same columns and order as SELECT * on this table), and
-- the partition predicate limits the scan to that one hourly directory.
SELECT
    logcontent,
    year,
    month,
    day,
    hour
FROM default.lzo_test
WHERE year = '2018'
    AND month = '07'
    AND day = '31'
    AND hour = '21'
LIMIT 10;