2.1 配置采集方案
通过上游 Flume 采集日志,下游 Flume 直接写入到 HDFS 中。上游配置两个 avro sink(指向两台下游主机)组成 sink group,实现 sink 的高可用(failover)。
上游配置:
a1.sources = r1
a1.channels = c1
a1.sinks = k1 k2

# Source: TAILDIR records per-file offsets, so after a crash or restart
# collection resumes from the last recorded position (no data re-read).
a1.sources.r1.channels = c1
a1.sources.r1.type = TAILDIR
# One file group per log origin; each group tails its own log directory.
a1.sources.r1.filegroups = g1 g2
a1.sources.r1.filegroups.g1 = /opt/data/logdata/app/event.*
a1.sources.r1.filegroups.g2 = /opt/data/logdata/wx/event.*
# Tag every event with a "datatype" header identifying its file group.
a1.sources.r1.headers.g1.datatype = app
a1.sources.r1.headers.g2.datatype = wx
a1.sources.r1.batchSize = 100

# Interceptor: value is the fully-qualified name of the Builder inner class.
# NOTE: this value MUST stay on a single line — in Java properties format a
# line break without a trailing backslash terminates the value, which would
# leave "type" empty and break interceptor loading.
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = cn.doitedu.flume.interceptor.FieldEncryptInterceptor$FieldEncryptInterceptorBuilder
a1.sources.r1.interceptors.i1.headerName = timestamp
# Event-body fields the interceptor reads / encrypts.
a1.sources.r1.interceptors.i1.timestamp_field = timeStamp
a1.sources.r1.interceptors.i1.to_encrypt_field = account

# Channel: file channel for durability across agent restarts.
a1.channels.c1.type = file
# Directory for checkpoint metadata.
a1.channels.c1.checkpointDir = /opt/data/flumedata/file-channel/checkpoint
# Directory for buffered event data.
a1.channels.c1.dataDirs = /opt/data/flumedata/file-channel/data

# Sinks: two avro sinks, one per downstream host, for failover.
a1.sinks.k1.channel = c1
a1.sinks.k1.type = avro
# Downstream hostname
a1.sinks.k1.hostname = linux02
a1.sinks.k1.port = 41414
a1.sinks.k1.batch-size = 100
a1.sinks.k2.channel = c1
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = linux03
a1.sinks.k2.port = 41414
a1.sinks.k2.batch-size = 100

# Sink group + processor. Processor type "failover" sends to the highest-
# priority live sink; the alternative is "load_balance" (round_robin/random).
a1.sinkgroups = g1
a1.sinkgroups.g1.sinks = k1 k2
a1.sinkgroups.g1.processor.type = failover
# Higher priority wins, so k1 is the primary sink.
a1.sinkgroups.g1.processor.priority.k1 = 5
a1.sinkgroups.g1.processor.priority.k2 = 1
# Max cool-down penalty (ms) before a failed sink is retried.
a1.sinkgroups.g1.processor.maxpenalty = 10000
下游配置:
a1.sources = r1
a1.channels = c1
a1.sinks = k1
# Avro source paired with the upstream agents' avro sinks: binds all
# interfaces on the same port (41414) that the upstream sinks target.
# NOTE(review): sink k1 is declared above but not configured in this
# excerpt — presumably the HDFS sink mentioned in the prose; confirm the
# rest of the downstream config exists elsewhere.
a1.sources.r1.channels = c1
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 41414
a1.sources.r1.batchSize = 100