sources
# exec source: runs the given command and turns each output line into an event
agent.sources.s1.type = exec
agent.sources.s1.command = tail -f /export/datas/shells/access.log
channel
memory类型存到内存中
agent.channels.c1.type = memory
#channel的容量最多能存多少个event
agent.channels.c1.capacity = 1000
#单次读取event的个数
agent.channels.c1.transactionCapacity = 100
sink
#avro类型将数据发送到配置的主机名/端口
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = kak01
agent.sinks.k1.port = 45454
拦截器
#################################define the agent###########################
# First tier: tails two log files and forwards events to the collector via avro
agent.sources = s1 s2
agent.channels = c1
agent.sinks = k1
#define source s1: exec source tailing the hive log
agent.sources.s1.type = exec
agent.sources.s1.command = tail -f /export/servers/hive-1.1.0-cdh5.14.0/logs/hive.log
#define source s2: exec source tailing the flume test file
agent.sources.s2.type = exec
agent.sources.s2.command = tail -f /export/datas/flume.txt
#add interceptor: each static interceptor stamps its source's events with a
#"filename" header so the downstream collector can route by origin
agent.sources.s1.interceptors = i1
agent.sources.s1.interceptors.i1.type = static
agent.sources.s1.interceptors.i1.key = filename
agent.sources.s1.interceptors.i1.value = hive
agent.sources.s2.interceptors = i1
agent.sources.s2.interceptors.i1.type = static
agent.sources.s2.interceptors.i1.key = filename
agent.sources.s2.interceptors.i1.value = flume
#define channel: in-memory buffer
#capacity = max events held; transactionCapacity = events per transaction
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1000
agent.channels.c1.transactionCapacity = 100
#define sink: avro sink pushes events to the collector's avro source
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = hpsk.bigdata01.com
agent.sinks.k1.port = 45454
#bond both sources and the sink to the same channel
agent.sources.s1.channels = c1
agent.sources.s2.channels = c1
agent.sinks.k1.channel = c1
#################################define the collect###########################
# Second tier: receives avro events from the agent and writes them to HDFS
collect.sources = s1
collect.channels = c1
collect.sinks = k1
#define source s1: avro source matching the upstream agent's avro sink
collect.sources.s1.type = avro
collect.sources.s1.bind = hpsk.bigdata01.com
collect.sources.s1.port = 45454
#define channel
collect.channels.c1.type = memory
collect.channels.c1.capacity = 1000
collect.channels.c1.transactionCapacity = 100
#define sink
collect.sinks.k1.type = hdfs
# %{filename} expands to the value of the "filename" event header, which the
# upstream static interceptors set to "hive" or "flume" — events from each
# source land in their own HDFS directory
collect.sinks.k1.hdfs.path = /flume/interceptors/%{filename}
#bond
collect.sources.s1.channels = c1
collect.sinks.k1.channel = c1
-》source拦截器:对采集的数据实现过滤、在event的头部封装对应keyvalue
Flume Interceptors:
Timestamp Interceptor:时间戳拦截器
在event头部添加一个keyvalue
key:timestamp
value:该event的生成的时间
a1.sources.s1.interceptors = i1 #拦截器的名字叫i1
a1.sources.s1.interceptors.i1.type = timestamp #拦截器的类型
Host Interceptor:主机名拦截器
在event头部添加一个keyvalue
key:host
value:该event生成的机器的主机名
Static Interceptor:自定义拦截器
在event头部添加一个keyvalue
key和value都自定义
Regex Filtering Interceptor:对数据实现过滤
a1.sources.s1.interceptors = i1
a1.sources.s1.interceptors.i1.type = regex_filter
a1.sources.s1.interceptors.i1.regex = (\\d):(\\d):(\\d)
如果该行数据符合正则,就会被封装成event
-》channel选择器
Replicating Channel Selector (default)
将source的数据给每个Channel发一份
Multiplexing Channel Selector:按照规则将数据给不同的channel
a1.sources = r1
a1.channels = c1 c2 c3 c4
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = key
a1.sources.r1.selector.mapping.value1 = c1
a1.sources.r1.selector.mapping.value2 = c2 c3
a1.sources.r1.selector.default = c4
-》sink处理器(必用):构建sinkGroup ,将多个sink放入统一group
-》故障转移:Failover
sink1:正常工作的
sink2:备份(standby)
a1.sinkgroups = g1 #定义一个sinkgroups
a1.sinkgroups.g1.sinks = k1 k2 #把定义好的两个sink 放到sinkgroups中
a1.sinkgroups.g1.processor.type = failover # 故障转移
a1.sinkgroups.g1.processor.priority.k1 = 10 # 设置权重:权重最高的优先执行,k1是正常工作的sink
a1.sinkgroups.g1.processor.priority.k2 = 5 # 设置权重:k2作为备份(standby)
a1.sinkgroups.g1.processor.maxpenalty = 10000
权重最高的优先执行
-》负载均衡:load_balance
sink1 sink2:两个一起工作
a1.sinkgroups = g1
a1.sinkgroups.g1.sinks = k1 k2
a1.sinkgroups.g1.processor.type = load_balance
# 负载均衡包含了故障转移,常用这种配置
案例1
#################################define the agent###########################
# Case 1: one exec source fanned out to two avro sinks in a failover group
agent.sources = s1
agent.channels = c1
agent.sinks = k1 k2
#define source s1
agent.sources.s1.type = exec
agent.sources.s1.command = tail -f /export/servers/hive-1.1.0-cdh5.14.0/logs/hive.log
#define channel
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1000
agent.channels.c1.transactionCapacity = 100
#define sink: primary (k1) and standby (k2) avro sinks on two collector hosts
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = hpsk.bigdata01.com
agent.sinks.k1.port = 45454
agent.sinks.k2.type = avro
agent.sinks.k2.hostname = hpsk.bigdata02.com
agent.sinks.k2.port = 45454
#define sink group: keys must use this agent's name ("agent", not "a1"),
#otherwise Flume silently ignores the whole failover configuration
agent.sinkgroups = g1
agent.sinkgroups.g1.sinks = k1 k2
agent.sinkgroups.g1.processor.type = failover
#highest priority is activated first: k1 active, k2 standby
agent.sinkgroups.g1.processor.priority.k1 = 10
agent.sinkgroups.g1.processor.priority.k2 = 5
#milliseconds a failed sink is penalized before being retried
agent.sinkgroups.g1.processor.maxpenalty = 100
#bond
agent.sources.s1.channels = c1
agent.sinks.k1.channel = c1
agent.sinks.k2.channel = c1
#################################define the collect###########################
# Case 1 collector: receives avro events from the agent tier, writes to HDFS
collect.sources = s1
collect.channels = c1
collect.sinks = k1
#define source s1
collect.sources.s1.type = avro
#when starting this collector on the second machine, change the bind host
collect.sources.s1.bind = hpsk.bigdata01.com
collect.sources.s1.port = 45454
#define channel
collect.channels.c1.type = memory
collect.channels.c1.capacity = 1000
collect.channels.c1.transactionCapacity = 100
#define sink
collect.sinks.k1.type = hdfs
collect.sinks.k1.hdfs.path = /flume/failover
#bond
collect.sources.s1.channels = c1
collect.sinks.k1.channel = c1