
本文详细介绍了Flume在处理日志时的配置和使用,包括使用memory channel读取Hive日志,file channel的配置,HDFS sink的设置以及文件名、大小和分区的管理。同时,讲解了spooling dir source和taildir的监控功能,展示了如何实现扇出(fan out)操作,将数据发送到多个sink。参考了相关技术博客进行深入探讨。
2、选择mem channel
# Components of agent a1: one source (s1), one sink (k1), one channel (c1).
a1.sources = s1
a1.sinks = k1
a1.channels = c1

需求:source:读hive日志  channel:mem  sink:log

【memory channel】


# Agent a1: follow the Hive log with an exec source, buffer the events in a
# memory channel and hand them to a logger sink.

# Components that make up agent a1.
a1.sources = s1
a1.channels = c1
a1.sinks = k1

# Source s1: runs a shell command and reads its output.
a1.sources.s1.type = exec
# tail -F (instead of -f) re-opens the file by name, so the source keeps
# receiving new lines if hive.log is rotated or recreated.
a1.sources.s1.command = tail -F /opt/cdh5/hive-0.13.1-cdh5.3.6/logs/hive.log
# Channel the source writes into.
a1.sources.s1.channels = c1

# Sink k1: logger sink, draining channel c1.
a1.sinks.k1.type = logger
a1.sinks.k1.channel = c1

# Channel c1: in-memory buffer between s1 and k1.
a1.channels.c1.type = memory
# capacity: maximum number of events held in the channel.
a1.channels.c1.capacity = 100
# transactionCapacity: maximum number of events per transaction.
a1.channels.c1.transactionCapacity = 100

执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/hive-mem-log.properties -Dflume.root.logger=INFO,console


【file channel】


# Agent a1: follow the Hive log with an exec source, buffer the events in a
# file channel and hand them to a logger sink.

# Components that make up agent a1.
a1.sources = s1
a1.channels = c1
a1.sinks = k1

# Source s1: runs a shell command and reads its output.
a1.sources.s1.type = exec
# tail -F (instead of -f) re-opens the file by name, so the source keeps
# receiving new lines if hive.log is rotated or recreated.
a1.sources.s1.command = tail -F /opt/cdh5/hive-0.13.1-cdh5.3.6/logs/hive.log
# Channel the source writes into.
a1.sources.s1.channels = c1

# Sink k1: logger sink, draining channel c1.
a1.sinks.k1.type = logger
a1.sinks.k1.channel = c1

# Channel c1: file channel (events are kept on local disk).
a1.channels.c1.type = file
# Directory for the channel's checkpoint.
a1.channels.c1.checkpointDir = /opt/datas/flume/file/check
# Directory (or comma-separated directories) for the channel's data files.
a1.channels.c1.dataDirs = /opt/datas/flume/file/data

执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/hive-file-log.properties -Dflume.root.logger=INFO,console

【hdfs sink】

# Agent a1: follow the Hive log with an exec source, buffer the events in a
# memory channel and write them to HDFS.

# Components that make up agent a1.
a1.sources = s1
a1.channels = c1
a1.sinks = k1

# Source s1: runs a shell command and reads its output.
a1.sources.s1.type = exec
# tail -F (instead of -f) re-opens the file by name, so the source keeps
# receiving new lines if hive.log is rotated or recreated.
a1.sources.s1.command = tail -F /opt/cdh5/hive-0.13.1-cdh5.3.6/logs/hive.log
# Channel the source writes into.
a1.sources.s1.channels = c1

# Sink k1: HDFS sink, draining channel c1.
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
# Target directory on HDFS.
a1.sinks.k1.hdfs.path = /flume/event/hdfs
# Prefix of the file names created by the sink.
a1.sinks.k1.hdfs.filePrefix = hive-log

# Channel c1: in-memory buffer between s1 and k1.
a1.channels.c1.type = memory
# capacity: maximum number of events held in the channel.
a1.channels.c1.capacity = 1000
# transactionCapacity: maximum number of events per transaction.
a1.channels.c1.transactionCapacity = 1000
执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/hive-hdfs-mem-log.properties -Dflume.root.logger=INFO,console


 hive-size-mem-log.properties 文件内容如下:

# Agent a1: follow the Hive log and write it to HDFS, rolling output files
# by size only.

# Components that make up agent a1.
a1.sources = s1
a1.channels = c1
a1.sinks = k1

# Source s1: runs a shell command and reads its output.
a1.sources.s1.type = exec
# tail -F (instead of -f) re-opens the file by name, so the source keeps
# receiving new lines if hive.log is rotated or recreated.
a1.sources.s1.command = tail -F /opt/cdh5/hive-0.13.1-cdh5.3.6/logs/hive.log
# Channel the source writes into.
a1.sources.s1.channels = c1

# Sink k1: HDFS sink, draining channel c1.
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/event/size01
a1.sinks.k1.hdfs.filePrefix = hive-log
# Roll to a new file once the current one reaches 10 KB (10240 bytes).
a1.sinks.k1.hdfs.rollSize = 10240
# 0 disables rolling by elapsed time.
a1.sinks.k1.hdfs.rollInterval = 0
# Number of events per file; the default is 10 events per file, 0 disables
# rolling by event count.
a1.sinks.k1.hdfs.rollCount = 0

# Channel c1: in-memory buffer between s1 and k1.
a1.channels.c1.type = memory
# capacity: maximum number of events held in the channel.
a1.channels.c1.capacity = 1000
# transactionCapacity: maximum number of events per transaction.
a1.channels.c1.transactionCapacity = 1000

执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/hive-size-mem-log.properties -Dflume.root.logger=INFO,console


# Agent a1: follow the Hive log and write it to HDFS into one directory per
# day (date=YYYY-MM-DD), rolling output files by size only.

# Components that make up agent a1.
a1.sources = s1
a1.channels = c1
a1.sinks = k1

# Source s1: runs a shell command and reads its output.
a1.sources.s1.type = exec
# tail -F (instead of -f) re-opens the file by name, so the source keeps
# receiving new lines if hive.log is rotated or recreated.
a1.sources.s1.command = tail -F /opt/cdh5/hive-0.13.1-cdh5.3.6/logs/hive.log
# Channel the source writes into.
a1.sources.s1.channels = c1

# Sink k1: HDFS sink, draining channel c1.
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
# %Y-%m-%d is expanded from the event timestamp, giving a date partition.
a1.sinks.k1.hdfs.path = /flume/event/date=%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = hive-log
# Roll by size (10240 bytes) only; 0 disables time- and count-based rolling.
a1.sinks.k1.hdfs.rollSize = 10240
a1.sinks.k1.hdfs.rollInterval = 0
a1.sinks.k1.hdfs.rollCount = 0
# The time escapes in hdfs.path need a timestamp in the event header; use the
# local (Linux) time of the agent as that timestamp.
a1.sinks.k1.hdfs.useLocalTimeStamp = true

# Channel c1: in-memory buffer between s1 and k1.
a1.channels.c1.type = memory
# capacity: maximum number of events held in the channel.
a1.channels.c1.capacity = 1000
# transactionCapacity: maximum number of events per transaction.
a1.channels.c1.transactionCapacity = 1000
执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/hive-part-mem-log.properties -Dflume.root.logger=INFO,console

【spooling dir source 】监控文件夹

# Agent a1: watch a spooling directory for new files, buffer the events in a
# memory channel and write them to HDFS.

# Components that make up agent a1.
a1.sources = s1
a1.channels = c1
a1.sinks = k1

# Source s1: spooling-directory source.
a1.sources.s1.type = spooldir
# Directory that is watched for files to ingest.
a1.sources.s1.spoolDir = /opt/datas/flume/spooling/
# Skip files whose name ends in ".tmp". The backslash is doubled because the
# properties loader consumes a single backslash, which would turn "\." into
# "." (any character) before the regex reaches Flume.
a1.sources.s1.ignorePattern = ([^ ]*\\.tmp$)
# Channel the source writes into.
a1.sources.s1.channels = c1

# Sink k1: HDFS sink, draining channel c1.
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/event/dirs
a1.sinks.k1.hdfs.filePrefix = hive-log
# Roll by size (10240 bytes) only; 0 disables time- and count-based rolling.
a1.sinks.k1.hdfs.rollSize = 10240
a1.sinks.k1.hdfs.rollInterval = 0
a1.sinks.k1.hdfs.rollCount = 0

# Channel c1: in-memory buffer between s1 and k1.
a1.channels.c1.type = memory
# capacity: maximum number of events held in the channel.
a1.channels.c1.capacity = 1000
# transactionCapacity: maximum number of events per transaction.
a1.channels.c1.transactionCapacity = 1000
执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/dir-mem-log.properties -Dflume.root.logger=INFO,console

【taildir 】既可以监控文件夹 也可以监控文件(按行读取文件夹文件中的内容可以实现递归读取,并提供数据恢复)


# Agent a1: tail a single file and a directory of files with the Taildir
# source, buffer the events in a memory channel and write them to HDFS.

# Components that make up agent a1.
a1.sources = s1
a1.sinks = k1
a1.channels = c1

# Source s1. Per the original author, this Flume build is based on 1.5 and the
# taildir source type only ships with 1.7, so the implementing class is named
# explicitly instead of using a short type alias.
a1.sources.s1.type = org.apache.flume.source.taildir.TaildirSource
# JSON file recording how far each tailed file has been read; it is loaded at
# start-up so data that was already read is not read again after a restart.
# NOTE(review): the original comment states the file name
# taildir_position.json must not be changed - confirm against the Flume docs.
a1.sources.s1.positionFile = /opt/datas/flume/taildir/position/taildir_position.json
# Two file groups: f1 is a single file, f2 is every file in a directory.
a1.sources.s1.filegroups = f1 f2
a1.sources.s1.filegroups.f1 = /opt/datas/flume/taildir/hadoop14.txt
a1.sources.s1.filegroups.f2 = /opt/datas/flume/taildir/hadoop/.*
# Headers configured per file group.
a1.sources.s1.headers.f1.headerKey1 = value1
a1.sources.s1.headers.f2.headerKey1 = value2
a1.sources.s1.headers.f2.headerKey2 = value2-2
# Channel the source writes into.
a1.sources.s1.channels = c1

# Sink k1: HDFS sink, draining channel c1.
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/event/taildir
a1.sinks.k1.hdfs.filePrefix = hive-log

# Channel c1: in-memory buffer between s1 and k1.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 1000
执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/taildir.properties -Dflume.root.logger=INFO,console

在 Flume 中,有时需要把一个源(source)的数据发送到多个目的地(sink),这在 Flume 中称为扇出(fan out):一个 source 把事件写入多个 channel,每个 channel 再由各自的 sink 消费,即一个 sink 对应一个 channel。

sink.properties 文件内容如下:

# Agent a1 (fan-out): one exec source writes into two memory channels, and
# each channel is drained by its own HDFS sink.

# Components that make up agent a1.
a1.sources = s1
a1.channels = c1 c2
a1.sinks = k1 k2

# Source s1: runs a shell command and reads its output.
a1.sources.s1.type = exec
# tail -F (instead of -f) re-opens the file by name, so the source keeps
# receiving new lines if hive.log is rotated or recreated.
a1.sources.s1.command = tail -F /opt/cdh5/hive-0.13.1-cdh5.3.6/logs/hive.log
# The source writes into both channels.
a1.sources.s1.channels = c1 c2

# Sink k1: HDFS sink, draining channel c1.
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/event/hdfs1
a1.sinks.k1.hdfs.filePrefix = hive-log1

# Sink k2: HDFS sink, draining channel c2.
a1.sinks.k2.type = hdfs
a1.sinks.k2.channel = c2
a1.sinks.k2.hdfs.path = /flume/event/hdfs2
a1.sinks.k2.hdfs.filePrefix = hive-log2

# Channel c1: in-memory buffer between s1 and k1.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 1000

# Channel c2: in-memory buffer between s1 and k2.
a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 1000
执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/sink.properties -Dflume.root.logger=INFO,console

先运行collect再运行client 防止数据丢失


# Collector agent a1: receive events over Avro, buffer them in a memory
# channel and write them to HDFS.

# Components that make up agent a1.
a1.sources = s1
a1.channels = c1
a1.sinks = k1

# Source s1: the collector's source type must match the client's sink type
# (avro on both sides).
a1.sources.s1.type = avro
# Address to listen on. The original left this value empty, which leaves the
# required bind property unset; 0.0.0.0 listens on all interfaces. Replace it
# with a specific host name or IP to restrict the listener.
a1.sources.s1.bind = 0.0.0.0
# Port to listen on; the client's avro sink must use the same port.
a1.sources.s1.port = 45454
# Channel the source writes into.
a1.sources.s1.channels = c1

# Sink k1: HDFS sink, draining channel c1.
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/event/avro
a1.sinks.k1.hdfs.filePrefix = hive-log
# Roll by size (10240 bytes) only; 0 disables time- and count-based rolling.
a1.sinks.k1.hdfs.rollSize = 10240
a1.sinks.k1.hdfs.rollInterval = 0
a1.sinks.k1.hdfs.rollCount = 0
# Use the agent's local time as the event timestamp.
a1.sinks.k1.hdfs.useLocalTimeStamp = true

# Channel c1: in-memory buffer between s1 and k1.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 1000
执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/avro-collect.properties -Dflume.root.logger=INFO,console


# Client agent a1: follow the Hive log with an exec source, buffer the events
# in a memory channel and forward them to a collector agent over Avro.

# Components that make up agent a1.
a1.sources = s1
a1.channels = c1
a1.sinks = k1

# Source s1: runs a shell command and reads its output.
a1.sources.s1.type = exec
# tail -F (instead of -f) re-opens the file by name, so the source keeps
# receiving new lines if hive.log is rotated or recreated.
a1.sources.s1.command = tail -F /opt/cdh5/hive-0.13.1-cdh5.3.6/logs/hive.log
# Channel the source writes into.
a1.sources.s1.channels = c1

# Sink k1: Avro sink, draining channel c1.
a1.sinks.k1.type = avro
a1.sinks.k1.channel = c1
# Host of the collector's Avro source. The original left this value empty,
# which leaves the required hostname property unset. localhost assumes the
# collector runs on the same machine - replace it with the collector's host
# name or IP otherwise.
a1.sinks.k1.hostname = localhost
# Must match the port of the collector's Avro source.
a1.sinks.k1.port = 45454

# Channel c1: in-memory buffer between s1 and k1.
a1.channels.c1.type = memory
# capacity: maximum number of events held in the channel.
a1.channels.c1.capacity = 100
# transactionCapacity: maximum number of events per transaction.
a1.channels.c1.transactionCapacity = 100
执行命令:bin/flume-ng agent --conf conf/ --name a1 --conf-file conf/avro-client.properties -Dflume.root.logger=INFO,console






当前余额3.43前往充值 >
领取后你会自动成为博主和红包主的粉丝 规则
钱包余额 0


