sources
# exec source: runs the given command and turns each output line into an event
agent.sources.s1.type = exec
agent.sources.s1.command = tail -f /export/datas/shells/access.log
channel
memory类型存到内存中
agent.channels.c1.type = memory
#channel的容量最多能存多少个event
agent.channels.c1.capacity = 1000
#单次读取event的个数
agent.channels.c1.transactionCapacity = 100
sink
#avro类型将数据发送到配置的主机名/端口
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = kak01
agent.sinks.k1.port = 45454
拦截器
#################################define the agent###########################
# First tier: tails two log files and forwards events to the collector via avro
agent.sources = s1 s2
agent.channels = c1
agent.sinks = k1
#define source s1: exec source tailing the hive log
agent.sources.s1.type = exec
agent.sources.s1.command = tail -f /export/servers/hive-1.1.0-cdh5.14.0/logs/hive.log
#define source s2: exec source tailing the flume test file
agent.sources.s2.type = exec
agent.sources.s2.command = tail -f /export/datas/flume.txt
#add interceptor: each static interceptor stamps its source's events with a
#"filename" header so the downstream collector can route by origin
agent.sources.s1.interceptors = i1
agent.sources.s1.interceptors.i1.type = static
agent.sources.s1.interceptors.i1.key = filename
agent.sources.s1.interceptors.i1.value = hive
agent.sources.s2.interceptors = i1
agent.sources.s2.interceptors.i1.type = static
agent.sources.s2.interceptors.i1.key = filename
agent.sources.s2.interceptors.i1.value = flume
#define channel: in-memory buffer
#capacity = max events held; transactionCapacity = events per transaction
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1000
agent.channels.c1.transactionCapacity = 100
#define sink: avro sink pushes events to the collector's avro source
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = hpsk.bigdata01.com
agent.sinks.k1.port = 45454
#bond both sources and the sink to the same channel
agent.sources.s1.channels = c1
agent.sources.s2.channels = c1
agent.sinks.k1.channel = c1
#################################define the collect###########################
# Second tier: receives avro events from the agent and writes them to HDFS
collect.sources = s1
collect.channels = c1
collect.sinks = k1
#define source s1: avro source matching the upstream agent's avro sink
collect.sources.s1.type = avro
collect.sources.s1.bind = hpsk.bigdata01.com
collect.sources.s1.port = 45454
#define channel
collect.channels.c1.type = memory
collect.channels.c1.capacity = 1000
collect.channels.c1.transactionCapacity = 100
#define sink
collect.sinks.k1.type = hdfs
# %{filename} expands to the value of the "filename" event header, which the
# upstream static interceptors set to "hive" or "flume" — events from each
# source land in their own HDFS directory
collect.sinks.k1.hdfs.path = /flume/interceptors/%{filename}
#bond
collect.sources.s1.channels = c1
collect.sinks.k1.channel = c1
-》source拦截器:对采集的数据实现过滤、在event的头部封装对应keyvalue
Flume Interceptors:
Timestamp Interceptor:时间戳拦截器
在event头部添加一个keyvalue
key:timestamp
value:该event的生成的时间
a1.sources.s1.interceptors = i1 #拦截器的名字叫i1
a1.sources.s1.interceptors.i1.type = timestamp #拦截器的类型
Host Interceptor:主机名拦截器
在event头部添加一个keyvalue
key:host
value:该event生成的机器的主机名
Static Interceptor:自定义拦截器
在event头部添加一个keyvalue
key和value都自定义
Regex Filtering Interceptor:对数据实现过滤
a1.sources.s1.interceptors = i1
a1.sources.s1.interceptors.i1.type = regex_filter
a1.sources.s1.interceptors.i1.regex = (\\d):(\\d):(\\d)
如果该行数据符合正则,就会被封装成event
-》channel选择器
Replicating Channel Selector (default)
将source的数据给每个Channel发一份
Multiplexing Channel Selector:按照规则将数据给不同的channel
a1.sources = r1
a1.channels = c1 c2 c3 c4
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = key
a1.sources.r1.selector.mapping.value1 = c1
a1.sources.r1.selector.mapping.value2 = c2 c3
a1.sources.r1.selector.default = c4
-》sink处理器(必用):构建sinkGroup ,将多个sink放入统一group
-》故障转移:Failover
sink1:正常工作的
sink2:备份(standby)
a1.sinkgroups = g1 #定义一个sinkgroups
a1.sinkgroups.g1.sinks = k1 k2 #把定义好的两个sink 放到sinkgroups中
a1.sinkgroups.g1.processor.type = failover # 故障转移
a1.sinkgroups.g1.processor.priority.k1 = 10 # 设置权重:权重最高的优先执行,k1是正常工作的sink
a1.sinkgroups.g1.processor.priority.k2 = 5 # 设置权重:k2作为备份(standby)
a1.sinkgroups.g1.processor.maxpenalty = 10000
权重最高的优先执行
-》负载均衡:load_balance
sink1 sink2:两个一起工作
a1.sinkgroups = g1
a1.sinkgroups.g1.sinks = k1 k2
a1.sinkgroups.g1.processor.type = load_balance
# 负载均衡包含了故障转移,常用这种配置
案例1
#################################define the agent###########################
# Case 1: one exec source fanned out to two avro sinks in a failover group
agent.sources = s1
agent.channels = c1
agent.sinks = k1 k2
#define source s1
agent.sources.s1.type = exec
agent.sources.s1.command = tail -f /export/servers/hive-1.1.0-cdh5.14.0/logs/hive.log
#define channel
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1000
agent.channels.c1.transactionCapacity = 100
#define sink: primary (k1) and standby (k2) avro sinks on two collector hosts
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = hpsk.bigdata01.com
agent.sinks.k1.port = 45454
agent.sinks.k2.type = avro
agent.sinks.k2.hostname = hpsk.bigdata02.com
agent.sinks.k2.port = 45454
#define sink group: keys must use this agent's name ("agent", not "a1"),
#otherwise Flume silently ignores the whole failover configuration
agent.sinkgroups = g1
agent.sinkgroups.g1.sinks = k1 k2
agent.sinkgroups.g1.processor.type = failover
#highest priority is activated first: k1 active, k2 standby
agent.sinkgroups.g1.processor.priority.k1 = 10
agent.sinkgroups.g1.processor.priority.k2 = 5
#milliseconds a failed sink is penalized before being retried
agent.sinkgroups.g1.processor.maxpenalty = 100
#bond
agent.sources.s1.channels = c1
agent.sinks.k1.channel = c1
agent.sinks.k2.channel = c1
#################################define the collect###########################
# Case 1 collector: receives avro events from the agent tier, writes to HDFS
collect.sources = s1
collect.channels = c1
collect.sinks = k1
#define source s1
collect.sources.s1.type = avro
#when starting this collector on the second machine, change the bind host
collect.sources.s1.bind = hpsk.bigdata01.com
collect.sources.s1.port = 45454
#define channel
collect.channels.c1.type = memory
collect.channels.c1.capacity = 1000
collect.channels.c1.transactionCapacity = 100
#define sink
collect.sinks.k1.type = hdfs
collect.sinks.k1.hdfs.path = /flume/failover
#bond
collect.sources.s1.channels = c1
collect.sinks.k1.channel = c1