Flume

conf文件下
vim **.conf
a1.sources = r1
a1.channels = c1
a1.sinks = s1

# Watch this directory for newly arrived files (Spooling Directory Source)
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /data/zebradata
# Interceptor chain applied to events as they are read
a1.sources.r1.interceptors = i1
# Stamp each event with the current time in its "timestamp" header
a1.sources.r1.interceptors.i1.type = timestamp
# Skip files matching this pattern (in-progress *.tmp files)
a1.sources.r1.ignorePattern = ^(.)*\.tmp$

# Forward events to the downstream agent over Avro RPC
# (lowercase "avro" for consistency with the avro source alias used below)
a1.sinks.s1.type = avro
a1.sinks.s1.hostname = SX02
a1.sinks.s1.port = 44444

a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

a1.sources.r1.channels = c1
a1.sinks.s1.channel = c1

启动命令:
./flume-ng agent --conf ../conf --conf-file ../conf/flume-zebra-server.conf --name a1 -Dflume.root.logger=INFO,console

目的:在数据到达flume所在服务器时拦截 根据正则将数据中的一部分获取
# Extract one regex capture group from the event body into an event header
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_extractor
# NOTE(review): "(?:[\|]\|){14}" matches 14 literal "||" pairs; this was
# probably "(?:[^|]*\|){14}" (skip 14 pipe-delimited fields) before the
# text was copy-mangled -- verify against a sample input record.
a1.sources.r1.interceptors.i1.regex = (?:[\|]\|){14}\d\d*(\d*)\|[^\|]*$
a1.sources.r1.interceptors.i1.serializers = s1
# Header name must be exactly "timestamp" so the HDFS sink can expand
# %Y-%m-%d in its path; the original "timestmap" typo breaks that.
a1.sources.r1.interceptors.i1.serializers.s1.name = timestamp

目的:将数据存入hdfs 并且存储时以时间命名
# Sink k1: store events in HDFS, one directory per calendar day.
# Flume properties files only support '#' comments; the original "//" lines
# were not comments and would be read as (junk) property keys.
a1.sinks.k1.type = hdfs
# NOTE(review): expanding %Y-%m-%d requires a "timestamp" event header
# (or hdfs.useLocalTimeStamp = true); confirm the interceptor chain
# actually sets a header named exactly "timestamp".
a1.sinks.k1.hdfs.path = hdfs://hadoop01:9000/flux/reportTime=%Y-%m-%d
# Write plain text instead of the default SequenceFile
a1.sinks.k1.hdfs.fileType = DataStream
# rollInterval: seconds before a temporary file is rolled into a final file
a1.sinks.k1.hdfs.rollInterval = 30
# rollSize: size (bytes) that triggers a roll; 0 disables size-based rolling
a1.sinks.k1.hdfs.rollSize = 0
# rollCount: event count that triggers a roll; 0 disables count-based rolling
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.timeZone = GMT+8

数据流向hdfs和kafka两个方向

# Agent: one Avro source fanned out to two channels -> HDFS (k1/c1) and Kafka (k2/c2)
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1 c2

# Source: accept Avro RPC events from upstream agents on all interfaces
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444

# Copy a regex capture group from the event body into the "timestamp" header
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_extractor
# NOTE(review): "(?:[\|]\|){14}" matches 14 literal "||" pairs; this was
# likely "(?:[^|]*\|){14}" (skip 14 pipe-delimited fields) before the text
# was copy-mangled -- verify against a sample record.
a1.sources.r1.interceptors.i1.regex = (?:[\|]\|){14}\d\d*(\d*)\|[^\|]*$
a1.sources.r1.interceptors.i1.serializers = s1
a1.sources.r1.interceptors.i1.serializers.s1.name = timestamp

# Sink k1: write events to HDFS, one directory per calendar day
a1.sinks.k1.type = hdfs
# Use the sink host's clock to expand %Y-%m-%d (no event header required)
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.path = hdfs://SX03:8020/flux/reportTime=%Y-%m-%d
# Plain text output rather than SequenceFile
a1.sinks.k1.hdfs.fileType = DataStream
# Roll temp file into a final file every 30 s; size/count rolling disabled
a1.sinks.k1.hdfs.rollInterval = 30
a1.sinks.k1.hdfs.rollSize = 0
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.timeZone = GMT+8

# Sink k2: publish the same events to Kafka topic "enbook"
a1.sinks.k2.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k2.brokerList = SX01:9092,SX02:9092,SX03:9092
a1.sinks.k2.topic = enbook
a1.sinks.k2.batchSize = 20

# Channels: in-memory buffering, one channel per sink
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100

# Wiring: the source feeds both channels; each sink drains its own channel
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值