Flume collects the data, and Kafka serves as both the channel and the sink (with a Kafka channel, no separate sink needs to be configured), which guarantees high fault tolerance for the data.
Configuration file:
a1.sources=r1
a1.channels=c1 c2
# configure source
# The TAILDIR source monitors changes in the configured file groups (read offsets are tracked in the position file below)
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /home/hadoop/zlwhouse/position/log_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /home/hadoop/zlwhouse/logs/app.+
a1.sources.r1.fileHeader = true
a1.sources.r1.channels = c1 c2
# Interceptor: the interceptor filters events and, in this setup, sets the topic header used by the selector below (a Java sketch follows this config)
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = CustomIntercepter$Builder
# Flume has two built-in ChannelSelector types: Replicating and Multiplexing.
# The default is Replicating, which copies every event to all of the channels.
# Multiplexing routes each event to a channel based on a property in the event header.
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = topic
a1.sources.r1.selector.mapping.topic_start = c1
a1.sources.r1.selector.mapping.topic_event = c2
# configure channel
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = hxb01:9092,hxb02:9092,hxb03:9092
a1.channels.c1.kafka.topic = topic_start
# With this property set to false, only the event body (not the Flume headers) is written to the channel
a1.channels.c1.parseAsFlumeEvent = false
a1.channels.c1.kafka.consumer.group.id = flume-consumer
a1.channels.c2.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c2.kafka.bootstrap.servers = hxb01:9092,hxb02:9092,hxb03:9092
a1.channels.c2.kafka.topic = topic_event
a1.channels.c2.parseAsFlumeEvent = false
a1.channels.c2.kafka.consumer.group.id = flume-consumer
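
The multiplexing selector above relies on every event carrying a topic header, and that is the job of the custom interceptor referenced as CustomIntercepter$Builder (for a custom class, the config normally needs the fully qualified class name). Below is a minimal Java sketch, not the actual implementation: the package name and the rule that classifies events into topic_start / topic_event are assumptions for illustration.

package com.example.flume.interceptor;   // hypothetical package name

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

public class CustomIntercepter implements Interceptor {

    @Override
    public void initialize() { }

    @Override
    public Event intercept(Event event) {
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        Map<String, String> headers = event.getHeaders();
        // Assumed routing rule: start-up logs go to topic_start, everything else to topic_event.
        if (body.contains("start")) {
            headers.put("topic", "topic_start");
        } else {
            headers.put("topic", "topic_event");
        }
        return event;   // returning null here would drop (filter out) the event
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        List<Event> out = new ArrayList<>();
        for (Event e : events) {
            Event intercepted = intercept(e);
            if (intercepted != null) {
                out.add(intercepted);
            }
        }
        return out;
    }

    @Override
    public void close() { }

    // Flume instantiates the interceptor through this nested class, hence "$Builder" in the config.
    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new CustomIntercepter();
        }

        @Override
        public void configure(Context context) { }
    }
}
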
Flume consumes the Kafka topics and writes them to HDFS. Configuration file:
a1.sources=r1 r2
a1.channels=c1 c2
a1.sinks=k1 k2
## source1
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.batchSize = 5000
a1.sources.r1.batchDurationMillis = 2000
a1.sources.r1.kafka.bootstrap.servers = hxb01:9092,hxb02:9092,hxb03:9092
a1.sources.r1.kafka.topics=topic_start
## source2
a1.sources.r2.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r2.batchSize = 5000
a1.sources.r2.batchDurationMillis = 2000
a1.sources.r2.kafka.bootstrap.servers = hxb01:9092,hxb02:9092,hxb03:9092
a1.sources.r2.kafka.topics=topic_event
## channel1
a1.channels.c1.type = file
a1.channels.c1.checkpointDir = /home/hadoop/zlwhouse/checkpoint/behavior1
a1.channels.c1.dataDirs = /home/hadoop/zlwhouse/checkpoint/data/behavior1/
a1.channels.c1.maxFileSize = 2146435071
a1.channels.c1.capacity = 1000000
a1.channels.c1.keep-alive = 6
## channel2
a1.channels.c2.type = file
a1.channels.c2.checkpointDir = /home/hadoop/zlwhouse/checkpoint/behavior2
a1.channels.c2.dataDirs = /home/hadoop/zlwhouse/checkpoint/data/behavior2/
a1.channels.c2.maxFileSize = 2146435071
a1.channels.c2.capacity = 1000000
a1.channels.c2.keep-alive = 6
## sink1
a1.sinks.k1.type = hdfs
# a1.sinks.k1.type = logger
# Note: if the line above were uncommented, this property would be assigned twice, and the later assignment would override the earlier one.
a1.sinks.k1.hdfs.path = hdfs://myCluster/origin_data/gmall/log/topic_start/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = logstart-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = second
a1.sinks.k1.hdfs.fileType = DataStream
## sink2
a1.sinks.k2.type = hdfs
a1.sinks.k2.hdfs.path = hdfs://myCluster/origin_data/gmall/log/topic_event/%Y-%m-%d
a1.sinks.k2.hdfs.filePrefix = logevent-
a1.sinks.k2.hdfs.round = true
a1.sinks.k2.hdfs.roundValue = 10
a1.sinks.k2.hdfs.roundUnit = second
a1.sinks.k2.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k2.hdfs.rollInterval = 10
a1.sinks.k2.hdfs.rollSize = 134217728
a1.sinks.k2.hdfs.rollCount = 0
a1.sources.r1.channels = c1
a1.sinks.k1.channel= c1
a1.sources.r2.channels = c2
a1.sinks.k2.channel= c2
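
Both agents are started with the standard flume-ng command. A minimal sketch, assuming the first config is saved as taildir-kafka.conf and the second as kafka-hdfs.conf (hypothetical file names) under the agent's conf directory:

# agent 1: Taildir source -> Kafka channels (run on the log-collection host)
bin/flume-ng agent --conf conf --conf-file conf/taildir-kafka.conf --name a1 -Dflume.root.logger=INFO,console

# agent 2: Kafka sources -> file channels -> HDFS sinks (run on the consumer host)
bin/flume-ng agent --conf conf --conf-file conf/kafka-hdfs.conf --name a1 -Dflume.root.logger=INFO,console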