flume数据 一路到hdfs 供离线分析,一路经kafka消息中间件 配置案例
flume-hdfs-kafka.sources = r1
# Two channels are required for true fan-out: with a single shared channel the
# two sinks COMPETE for events and each record reaches only one destination.
# c1 feeds the HDFS sink, c2 feeds the Kafka sink.
flume-hdfs-kafka.channels = c1 c2
flume-hdfs-kafka.sinks = hdfs-sink kafka-sink

# --- Source: tail files and replicate every event to both channels ---
flume-hdfs-kafka.sources.r1.type = TAILDIR
flume-hdfs-kafka.sources.r1.channels = c1 c2
# "replicating" is the default channel selector; stated explicitly so the
# fan-out intent is visible in the config.
flume-hdfs-kafka.sources.r1.selector.type = replicating
flume-hdfs-kafka.sources.r1.filegroups = f1
flume-hdfs-kafka.sources.r1.filegroups.f1 = /home/bigdata/flume/data/*.txt
# Position file enables resume-after-restart (breakpoint continuation) and
# correct pickup of data appended to already-tracked files.
flume-hdfs-kafka.sources.r1.positionFile = /var/log/flume/taildir_position.json

# --- Channels (memory: fast, but buffered events are lost if the agent dies) ---
flume-hdfs-kafka.channels.c1.type = memory
flume-hdfs-kafka.channels.c2.type = memory
# NOTE(review): capacity/transactionCapacity default to 100 each; consider
# raising them (e.g. capacity = 10000) for production throughput.

# --- HDFS sink: time-bucketed directories for offline analysis ---
flume-hdfs-kafka.sinks.hdfs-sink.type = hdfs
flume-hdfs-kafka.sinks.hdfs-sink.channel = c1
flume-hdfs-kafka.sinks.hdfs-sink.hdfs.path = hdfs://spark001:8020/flume/events/%y%m%d%H%M
flume-hdfs-kafka.sinks.hdfs-sink.hdfs.filePrefix = events-
# NOTE(review): rollInterval (default 30 s) and rollCount (default 10 events)
# are still active alongside rollSize; set them to 0 if you want size-only rolling.
flume-hdfs-kafka.sinks.hdfs-sink.hdfs.rollSize = 10000000
# Round the timestamp down to 10-minute buckets when resolving %-escapes in hdfs.path.
flume-hdfs-kafka.sinks.hdfs-sink.hdfs.round = true
flume-hdfs-kafka.sinks.hdfs-sink.hdfs.roundValue = 10
flume-hdfs-kafka.sinks.hdfs-sink.hdfs.roundUnit = minute
# Required because hdfs.path uses time escapes and no timestamp interceptor is configured.
flume-hdfs-kafka.sinks.hdfs-sink.hdfs.useLocalTimeStamp = true

# --- Kafka sink: real-time pipeline through the message broker ---
flume-hdfs-kafka.sinks.kafka-sink.channel = c2
flume-hdfs-kafka.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
flume-hdfs-kafka.sinks.kafka-sink.kafka.topic = flumetopic
flume-hdfs-kafka.sinks.kafka-sink.kafka.bootstrap.servers = spark003:9092,spark004:9092,spark005:9092
flume-hdfs-kafka.sinks.kafka-sink.kafka.flumeBatchSize = 20
# acks=1: leader acknowledgement only — a balance of latency vs. durability.
flume-hdfs-kafka.sinks.kafka-sink.kafka.producer.acks = 1
flume-hdfs-kafka.sinks.kafka-sink.kafka.producer.linger.ms = 1
# FIX: was "producer.compression.type" without the required "kafka." prefix,
# so the compression setting was silently ignored by the Kafka sink.
flume-hdfs-kafka.sinks.kafka-sink.kafka.producer.compression.type = snappy
启动命令
flume-ng agent --name flume-hdfs-kafka \
--conf /home/bigdata/flume/conf \
--conf-file /home/bigdata/flume/conf/flume-hdfs-kafka.properties \
-Dflume.root.logger=INFO,console
Taildir 可以记录 文件消费的位置
1:断点续传
2:追加数据可正确收集
taildir.TaildirSource: Closed file: /home/bigdata/flume/data/cost.txt, inode: 33582977, pos: 228