Flume

conf文件下
vim **.conf
a1.sources = r1
a1.channels = c1
a1.sinks = s1

# Watch this directory for newly arrived files (Spooling Directory Source)
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /data/zebradata
# Interceptor chain applied to events as they are read
a1.sources.r1.interceptors = i1
# Stamp each event with the current time in its "timestamp" header
a1.sources.r1.interceptors.i1.type = timestamp
# Skip files matching this pattern (in-progress *.tmp files)
a1.sources.r1.ignorePattern = ^(.)*\.tmp$

# Forward events to the downstream agent over Avro RPC
# (lowercase "avro" for consistency with the avro source alias used below)
a1.sinks.s1.type = avro
a1.sinks.s1.hostname = SX02
a1.sinks.s1.port = 44444

a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

a1.sources.r1.channels = c1
a1.sinks.s1.channel = c1

启动命令:
./flume-ng agent --conf ../conf --conf-file ../conf/flume-zebra-server.conf --name a1 -Dflume.root.logger=INFO,console

目的:在数据到达flume所在服务器时拦截 根据正则将数据中的一部分获取
# Extract one regex capture group from the event body into an event header
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_extractor
# NOTE(review): "(?:[\|]\|){14}" matches 14 literal "||" pairs; this was
# probably "(?:[^|]*\|){14}" (skip 14 pipe-delimited fields) before the
# text was copy-mangled -- verify against a sample input record.
a1.sources.r1.interceptors.i1.regex = (?:[\|]\|){14}\d\d*(\d*)\|[^\|]*$
a1.sources.r1.interceptors.i1.serializers = s1
# Header name must be exactly "timestamp" so the HDFS sink can expand
# %Y-%m-%d in its path; the original "timestmap" typo breaks that.
a1.sources.r1.interceptors.i1.serializers.s1.name = timestamp

目的:将数据存入hdfs 并且存储时以时间命名
# Sink k1: store events in HDFS, one directory per calendar day.
# Flume properties files only support '#' comments; the original "//" lines
# were not comments and would be read as (junk) property keys.
a1.sinks.k1.type = hdfs
# NOTE(review): expanding %Y-%m-%d requires a "timestamp" event header
# (or hdfs.useLocalTimeStamp = true); confirm the interceptor chain
# actually sets a header named exactly "timestamp".
a1.sinks.k1.hdfs.path = hdfs://hadoop01:9000/flux/reportTime=%Y-%m-%d
# Write plain text instead of the default SequenceFile
a1.sinks.k1.hdfs.fileType = DataStream
# rollInterval: seconds before a temporary file is rolled into a final file
a1.sinks.k1.hdfs.rollInterval = 30
# rollSize: size (bytes) that triggers a roll; 0 disables size-based rolling
a1.sinks.k1.hdfs.rollSize = 0
# rollCount: event count that triggers a roll; 0 disables count-based rolling
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.timeZone = GMT+8

数据流向hdfs和kafka两个方向

# Agent: one Avro source fanned out to two channels -> HDFS (k1/c1) and Kafka (k2/c2)
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1 c2

# Source: accept Avro RPC events from upstream agents on all interfaces
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444

# Copy a regex capture group from the event body into the "timestamp" header
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_extractor
# NOTE(review): "(?:[\|]\|){14}" matches 14 literal "||" pairs; this was
# likely "(?:[^|]*\|){14}" (skip 14 pipe-delimited fields) before the text
# was copy-mangled -- verify against a sample record.
a1.sources.r1.interceptors.i1.regex = (?:[\|]\|){14}\d\d*(\d*)\|[^\|]*$
a1.sources.r1.interceptors.i1.serializers = s1
a1.sources.r1.interceptors.i1.serializers.s1.name = timestamp

# Sink k1: write events to HDFS, one directory per calendar day
a1.sinks.k1.type = hdfs
# Use the sink host's clock to expand %Y-%m-%d (no event header required)
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.path = hdfs://SX03:8020/flux/reportTime=%Y-%m-%d
# Plain text output rather than SequenceFile
a1.sinks.k1.hdfs.fileType = DataStream
# Roll temp file into a final file every 30 s; size/count rolling disabled
a1.sinks.k1.hdfs.rollInterval = 30
a1.sinks.k1.hdfs.rollSize = 0
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.timeZone = GMT+8

# Sink k2: publish the same events to Kafka topic "enbook"
a1.sinks.k2.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k2.brokerList = SX01:9092,SX02:9092,SX03:9092
a1.sinks.k2.topic = enbook
a1.sinks.k2.batchSize = 20

# Channels: in-memory buffering, one channel per sink
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100

# Wiring: the source feeds both channels; each sink drains its own channel
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值