Basic usage of Flume

1 Basic ways to run a Flume agent

1.1 Collecting a stream from a network port
flume-ng agent \
--conf $FLUME_HOME/conf/ \
--name NetCat \
--conf-file $FLUME_HOME/jobconf/flume-telnet.conf \
-Dflume.root.logger=INFO,console

Input type: netcat (listens on a network port)

Output type: logger, printed to the console (the commented-out lines marked with # show how to write to a local directory with file_roll instead)

sources	--->	type=netcat		bind=bigdata112	port=44445
sinks	--->	type=logger(# file_roll)
channels--->	type = memory	capacity = 1000		transactionCapacity = 100
#1. Define the agent => NetCat
NetCat.sources=netcatSources
NetCat.sinks = netcatSinks
NetCat.channels = netcatChannels

#2. Define the source
NetCat.sources.netcatSources.type = netcat
NetCat.sources.netcatSources.bind = bigdata112
NetCat.sources.netcatSources.port = 44445

#3. Define the sink
NetCat.sinks.netcatSinks.type = logger

# Alternative: write to a local directory with the file_roll sink
# NetCat.sinks.netcatSinks.type = file_roll
# Note: this directory must be created in advance
# NetCat.sinks.netcatSinks.sink.directory = /opt/test/flume3

#4. Define the channel
NetCat.channels.netcatChannels.type = memory
NetCat.channels.netcatChannels.capacity = 1000
NetCat.channels.netcatChannels.transactionCapacity = 100

#5. Bind the source and sink to the channel
NetCat.sources.netcatSources.channels = netcatChannels
NetCat.sinks.netcatSinks.channel = netcatChannels
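
To verify the agent, send a few lines to the monitored port from another terminal (a quick sketch; it assumes the host name bigdata112 resolves and that nc or telnet is installed):

nc bigdata112 44445
# every line typed in this session is sent as one event and printed by the logger sink
# (telnet bigdata112 44445 works the same way)
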
1.2 Collecting a stream by monitoring a file
flume-ng agent \
--conf $FLUME_HOME/conf/ \
--name Exec \
--conf-file $FLUME_HOME/jobconf/flume-hdfs.conf \
-Dflume.root.logger=INFO,console

Input type: exec (tails a file)

Output type: HDFS

sources	--->	type=exec	command = tail -F /opt/Andy		shell = /bin/bash -c
sinks	--->	type=hdfs	path = hdfs://bigdata111:9000/flume/%H	...
channels--->	type = memory	capacity = 1000		transactionCapacity = 1000
#1. Define the agent => Exec
Exec.sources=execSources
Exec.sinks = execSinks
Exec.channels = execChannels

#2. Define the source
Exec.sources.execSources.type = exec
Exec.sources.execSources.command = tail -F /opt/Andy
Exec.sources.execSources.shell = /bin/bash -c

#3. Define the sink
Exec.sinks.execSinks.type = hdfs
Exec.sinks.execSinks.hdfs.path = hdfs://bigdata111:9000/flume/%H
# Prefix for uploaded files
Exec.sinks.execSinks.hdfs.filePrefix = logs-
# Whether to roll to a new folder based on time
Exec.sinks.execSinks.hdfs.round = true
# How many time units before a new folder is created
Exec.sinks.execSinks.hdfs.roundValue = 1
# The time unit for the setting above
Exec.sinks.execSinks.hdfs.roundUnit = hour
# Whether to use the local timestamp
Exec.sinks.execSinks.hdfs.useLocalTimeStamp = true
# How many events to accumulate before one flush to HDFS
Exec.sinks.execSinks.hdfs.batchSize = 100
# File type; compression is also supported
Exec.sinks.execSinks.hdfs.fileType = DataStream
# How often (in seconds) to roll to a new file
Exec.sinks.execSinks.hdfs.rollInterval = 600
# Roll to a new file when it reaches this size (in bytes)
Exec.sinks.execSinks.hdfs.rollSize = 134217728
# 0 means rolling does not depend on the number of events
Exec.sinks.execSinks.hdfs.rollCount = 0
# Minimum number of block replicas
Exec.sinks.execSinks.hdfs.minBlockReplicas = 1

#4. Define the channel
Exec.channels.execChannels.type = memory
Exec.channels.execChannels.capacity = 1000
Exec.channels.execChannels.transactionCapacity = 1000

#5. Bind the source and sink to the channel
Exec.sources.execSources.channels = execChannels
Exec.sinks.execSinks.channel = execChannels
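
Once the agent is running, appending to the monitored file produces events, which land in an hour-named directory on HDFS (a quick sketch; it assumes /opt/Andy exists and HDFS is reachable at bigdata111:9000):

echo "exec source test" >> /opt/Andy
hdfs dfs -ls /flume/
# one sub-directory per hour, containing files prefixed with logs-
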
1.3 Collecting a stream by monitoring a directory
flume-ng agent \
--conf $FLUME_HOME/conf/ \
--name Spooldir \
--conf-file $FLUME_HOME/jobconf/flume-spooldir.conf \
-Dflume.root.logger=INFO,console
sources	--->	type=spooldir
sinks	--->	type=hdfs	path = hdfs://bigdata111:9000/flume/%H	...
channels--->	type = memory	capacity = 1000		transactionCapacity = 1000

Input type: spooldir (monitors a directory)

Output type: HDFS

#1. Define the agent => Spooldir
Spooldir.sources=spooldirSources
Spooldir.sinks = spooldirSinks
Spooldir.channels = spooldirChannels

#2. Define the source
Spooldir.sources.spooldirSources.type = spooldir
Spooldir.sources.spooldirSources.spoolDir = /opt/module/apache-flume-1.9.0-bin/flume_logs
Spooldir.sources.spooldirSources.fileSuffix = .COMPLETED
Spooldir.sources.spooldirSources.fileHeader = true
# Ignore (do not upload) any file ending in .tmp
Spooldir.sources.spooldirSources.ignorePattern = ([^ ]*\.tmp)

#3. Define the sink
Spooldir.sinks.spooldirSinks.type = hdfs
Spooldir.sinks.spooldirSinks.hdfs.path = hdfs://bigdata111:9000/flume/%H
# Prefix for uploaded files
Spooldir.sinks.spooldirSinks.hdfs.filePrefix = upload-
# Whether to roll to a new folder based on time
Spooldir.sinks.spooldirSinks.hdfs.round = true
# How many time units before a new folder is created
Spooldir.sinks.spooldirSinks.hdfs.roundValue = 1
# The time unit for the setting above
Spooldir.sinks.spooldirSinks.hdfs.roundUnit = hour
# Whether to use the local timestamp
Spooldir.sinks.spooldirSinks.hdfs.useLocalTimeStamp = true
# How many events to accumulate before one flush to HDFS
Spooldir.sinks.spooldirSinks.hdfs.batchSize = 1000
# File type; compression is also supported
Spooldir.sinks.spooldirSinks.hdfs.fileType = DataStream
# How often (in seconds) to roll to a new file
Spooldir.sinks.spooldirSinks.hdfs.rollInterval = 600
# Roll to a new file when it reaches this size (in bytes)
Spooldir.sinks.spooldirSinks.hdfs.rollSize = 134217728
# 0 means rolling does not depend on the number of events
Spooldir.sinks.spooldirSinks.hdfs.rollCount = 0
# Minimum number of block replicas
Spooldir.sinks.spooldirSinks.hdfs.minBlockReplicas = 1

#4. Define the channel
Spooldir.channels.spooldirChannels.type = memory
Spooldir.channels.spooldirChannels.capacity = 1000
Spooldir.channels.spooldirChannels.transactionCapacity = 1000

#5. Bind the source and sink to the channel
Spooldir.sources.spooldirSources.channels = spooldirChannels
Spooldir.sinks.spooldirSinks.channel = spooldirChannels
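
To feed this agent, copy a finished file into the spooling directory; once Flume has ingested it the file is renamed with the .COMPLETED suffix (a quick sketch with a hypothetical sample file /opt/test.log; the directory is the spoolDir configured above):

cp /opt/test.log /opt/module/apache-flume-1.9.0-bin/flume_logs/
ls /opt/module/apache-flume-1.9.0-bin/flume_logs/
# test.log.COMPLETED appears after ingestion; .tmp files are skipped per ignorePattern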

2 Flume many-to-one and one-to-many

2.1 Many-to-one (ManyToOne)

flume1 (monitors a file) and flume2 (listens on a port) send their data over avro to a common port -> flume3 (merges the two streams and writes to HDFS)

flume1

Monitors a file

flume-ng agent \
--conf $FLUME_HOME/conf/ \
--name ManyToOne1 \
--conf-file $FLUME_HOME/jobconf/ManyToOne/flume1.conf \
-Dflume.root.logger=INFO,console

Input type: file monitoring (exec)

Output type: avro (the specified host and port match the avro source of the next Flume agent)

# agent==>ManyToOne1
ManyToOne1.sources =execSources
ManyToOne1.sinks = execSinks
ManyToOne1.channels = execChannels

# Describe/configure the source
ManyToOne1.sources.execSources.type = exec
ManyToOne1.sources.execSources.command = tail -F /opt/Andy
ManyToOne1.sources.execSources.shell = /bin/bash -c

# Describe the sink
ManyToOne1.sinks.execSinks.type = avro
ManyToOne1.sinks.execSinks.hostname = bigdata111
ManyToOne1.sinks.execSinks.port = 4141

# Describe the channel
ManyToOne1.channels.execChannels.type = memory
ManyToOne1.channels.execChannels.capacity = 1000
ManyToOne1.channels.execChannels.transactionCapacity = 100

# Bind the source and sink to the channel
ManyToOne1.sources.execSources.channels = execChannels
ManyToOne1.sinks.execSinks.channel = execChannels

flume2

Listens on a port

flume-ng agent \
--conf $FLUME_HOME/conf/ \
--name ManyToOne2 \
--conf-file $FLUME_HOME/jobconf/ManyToOne/flume2.conf \
-Dflume.root.logger=INFO,console

Input type: netcat (port)

Output type: avro (the specified host and port match the avro source of the next Flume agent)

#1. Define the agent => ManyToOne2
ManyToOne2.sources=netcatSources
ManyToOne2.sinks = netcatSinks
ManyToOne2.channels = netcatChannels

#2. Define the source
ManyToOne2.sources.netcatSources.type = netcat
ManyToOne2.sources.netcatSources.bind = bigdata111
ManyToOne2.sources.netcatSources.port = 44445

# Output the stream via avro
ManyToOne2.sinks.netcatSinks.type = avro
ManyToOne2.sinks.netcatSinks.hostname = bigdata111
ManyToOne2.sinks.netcatSinks.port = 4141

# Describe the channel
ManyToOne2.channels.netcatChannels.type = memory
ManyToOne2.channels.netcatChannels.capacity = 1000
ManyToOne2.channels.netcatChannels.transactionCapacity = 100

# Bind the source and sink to the channel
ManyToOne2.sources.netcatSources.channels = netcatChannels
ManyToOne2.sinks.netcatSinks.channel = netcatChannels
flume3

Aggregation

flume-ng agent \
--conf $FLUME_HOME/conf/ \
--name ManyToOne3 \
--conf-file $FLUME_HOME/jobconf/ManyToOne/flume3.conf \
-Dflume.root.logger=INFO,console

Input type: avro (port)

Output type: HDFS

# 1. Define the agent => ManyToOne3
ManyToOne3.sources = many
ManyToOne3.sinks = one
ManyToOne3.channels = to

# Describe/configure the source
ManyToOne3.sources.many.type = avro
ManyToOne3.sources.many.bind = bigdata111
ManyToOne3.sources.many.port = 4141

# Describe the sink
ManyToOne3.sinks.one.type = hdfs
ManyToOne3.sinks.one.hdfs.path = hdfs://bigdata111:9000/flume3/%H
# Prefix for uploaded files
ManyToOne3.sinks.one.hdfs.filePrefix = flume3-
# Whether to roll to a new folder based on time
ManyToOne3.sinks.one.hdfs.round = true
# How many time units before a new folder is created
ManyToOne3.sinks.one.hdfs.roundValue = 1
# The time unit for the setting above
ManyToOne3.sinks.one.hdfs.roundUnit = hour
# Whether to use the local timestamp
ManyToOne3.sinks.one.hdfs.useLocalTimeStamp = true
# How many events to accumulate before one flush to HDFS
ManyToOne3.sinks.one.hdfs.batchSize = 100
# File type; compression is also supported
ManyToOne3.sinks.one.hdfs.fileType = DataStream
# How often (in seconds) to roll to a new file
ManyToOne3.sinks.one.hdfs.rollInterval = 600
# Roll to a new file at roughly 128 MB
ManyToOne3.sinks.one.hdfs.rollSize = 134217728
# 0 means rolling does not depend on the number of events
ManyToOne3.sinks.one.hdfs.rollCount = 0
# Minimum number of block replicas
ManyToOne3.sinks.one.hdfs.minBlockReplicas = 1

# Describe the channel
ManyToOne3.channels.to.type = memory
ManyToOne3.channels.to.capacity = 1000
ManyToOne3.channels.to.transactionCapacity = 100

# Bind the source and sink to the channel
ManyToOne3.sources.many.channels = to
ManyToOne3.sinks.one.channel = to
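
In practice it is easiest to start ManyToOne3 (the avro source) first so the avro sinks of the other two agents have an endpoint to connect to, then start ManyToOne1 and ManyToOne2. A quick test sketch using the sources configured above:

echo "from the exec source" >> /opt/Andy      # picked up by ManyToOne1
nc bigdata111 44445                           # lines typed here go through ManyToOne2
hdfs dfs -ls /flume3/                         # merged output written by ManyToOne3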

2.2 One-to-many (OneToMany)

flume1 sends the same stream over avro to two ports -> flume2 (avro port 1) and flume3 (avro port 2)

flume1
flume-ng agent \
--conf $FLUME_HOME/conf/ \
--name OneToMany1 \
--conf-file $FLUME_HOME/jobconf/OneToMany/flume1.conf \
-Dflume.root.logger=INFO,console

Input type: exec (monitors a file)

Output type: multiple avro ports

# agent ==> OneToMany1
OneToMany1.sources = one
OneToMany1.sinks = many1 many2
OneToMany1.channels = to1 to2
# Replicate the data stream to every channel
OneToMany1.sources.one.selector.type = replicating

# Describe/configure the source
OneToMany1.sources.one.type = exec
OneToMany1.sources.one.command = tail -F /opt/Andy
OneToMany1.sources.one.shell = /bin/bash -c

# Output to a port via avro serialization
OneToMany1.sinks.many1.type = avro
OneToMany1.sinks.many1.hostname = bigdata111
OneToMany1.sinks.many1.port = 4141

OneToMany1.sinks.many2.type = avro
OneToMany1.sinks.many2.hostname = bigdata111
OneToMany1.sinks.many2.port = 4142

# Describe the channel
OneToMany1.channels.to1.type = memory
OneToMany1.channels.to1.capacity = 1000
OneToMany1.channels.to1.transactionCapacity = 100

OneToMany1.channels.to2.type = memory
OneToMany1.channels.to2.capacity = 1000
OneToMany1.channels.to2.transactionCapacity = 100

# Connect the sinks and the source through the channels
OneToMany1.sources.one.channels = to1 to2
OneToMany1.sinks.many1.channel = to1
OneToMany1.sinks.many2.channel = to2

flume2
flume-ng agent \
--conf $FLUME_HOME/conf/ \
--name OneToMany2 \
--conf-file $FLUME_HOME/jobconf/OneToMany/flume2.conf \
-Dflume.root.logger=INFO,console

Input type: avro (port)

Output type: HDFS

# agent==>OneToMany2
OneToMany2.sources = avroSource
OneToMany2.sinks = avroSinks
OneToMany2.channels = avroChannels

# avro source listening on port 4141
OneToMany2.sources.avroSource.type = avro
OneToMany2.sources.avroSource.bind = bigdata111
OneToMany2.sources.avroSource.port = 4141

# Describe the sink
OneToMany2.sinks.avroSinks.type = hdfs
OneToMany2.sinks.avroSinks.hdfs.path = hdfs://bigdata111:9000/flume2/%H
# Prefix for uploaded files
OneToMany2.sinks.avroSinks.hdfs.filePrefix = flume2-
# Whether to roll to a new folder based on time
OneToMany2.sinks.avroSinks.hdfs.round = true
# How many time units before a new folder is created
OneToMany2.sinks.avroSinks.hdfs.roundValue = 1
# The time unit for the setting above
OneToMany2.sinks.avroSinks.hdfs.roundUnit = hour
# Whether to use the local timestamp
OneToMany2.sinks.avroSinks.hdfs.useLocalTimeStamp = true
# How many events to accumulate before one flush to HDFS
OneToMany2.sinks.avroSinks.hdfs.batchSize = 100
# File type; compression is also supported
OneToMany2.sinks.avroSinks.hdfs.fileType = DataStream
# How often (in seconds) to roll to a new file
OneToMany2.sinks.avroSinks.hdfs.rollInterval = 600
# Roll to a new file at roughly 128 MB
OneToMany2.sinks.avroSinks.hdfs.rollSize = 134217700
# 0 means rolling does not depend on the number of events
OneToMany2.sinks.avroSinks.hdfs.rollCount = 0
# Minimum number of block replicas
OneToMany2.sinks.avroSinks.hdfs.minBlockReplicas = 1


# Describe the channel
OneToMany2.channels.avroChannels.type = memory
OneToMany2.channels.avroChannels.capacity = 1000
OneToMany2.channels.avroChannels.transactionCapacity = 100

# Bind the source and sink to the channel
OneToMany2.sources.avroSource.channels = avroChannels
OneToMany2.sinks.avroSinks.channel = avroChannels
flume3
flume-ng agent \
--conf $FLUME_HOME/conf/ \
--name OneToMany3 \
--conf-file $FLUME_HOME/jobconf/OneToMany/flume3.conf \
-Dflume.root.logger=INFO,console

Input type: avro (port)

Output type: local files (file_roll)

# agent==>OneToMany3
OneToMany3.sources = avroSource
OneToMany3.sinks = avroSinks
OneToMany3.channels = avroChannels

# avro source listening on port 4142
OneToMany3.sources.avroSource.type = avro
OneToMany3.sources.avroSource.bind = bigdata111
OneToMany3.sources.avroSource.port = 4142

# Write to a local directory (file_roll sink)
OneToMany3.sinks.avroSinks.type = file_roll
# Note: this directory must be created in advance
OneToMany3.sinks.avroSinks.sink.directory = /opt/test/flume3

# Describe the channel
OneToMany3.channels.avroChannels.type = memory
OneToMany3.channels.avroChannels.capacity = 1000
OneToMany3.channels.avroChannels.transactionCapacity = 100

# Bind the source and sink to the channel
OneToMany3.sources.avroSource.channels = avroChannels
OneToMany3.sinks.avroSinks.channel = avroChannels
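
The file_roll sink does not create its output directory, so create /opt/test/flume3 before starting OneToMany3. Once all three agents are up, one appended line should show up both on HDFS (via OneToMany2) and in the local directory (via OneToMany3). A quick sketch:

mkdir -p /opt/test/flume3
echo "fan-out test" >> /opt/Andy
hdfs dfs -ls /flume2/
ls /opt/test/flume3/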

3 Flume interceptors

  1. Interceptors operate on the event data: besides filtering and removing events, they can also add information, and they can even manipulate things such as file names (via headers).

  2. Interceptors run between the source and the channel.

3.1 Adding a timestamp to the file path

flume-ng agent \
-n Timestamp \
-f $FLUME_HOME/jobconf/interceptors/flume-Timestamp.conf \
-c $FLUME_HOME/conf \
-Dflume.root.logger=INFO,console

Input: spooldir (monitors a directory)

Interceptor: adds a timestamp, which is used by the %H in the HDFS path

Output: HDFS

# Define the agent name and the names of the source, channel, and sink
Timestamp.sources = spooldirSources
Timestamp.channels = spooldirChannels
Timestamp.sinks = spooldirSinks

# Define the source
Timestamp.sources.spooldirSources.type = spooldir
Timestamp.sources.spooldirSources.spoolDir = /opt/module/apache-flume-1.9.0-bin/flume_logs

# Define the interceptor: add a timestamp header to each event
Timestamp.sources.spooldirSources.interceptors = i1
Timestamp.sources.spooldirSources.interceptors.i1.type = org.apache.flume.interceptor.TimestampInterceptor$Builder

# Define the channel
Timestamp.channels.spooldirChannels.type = memory
Timestamp.channels.spooldirChannels.capacity = 10000
Timestamp.channels.spooldirChannels.transactionCapacity = 100

# Define the sink
Timestamp.sinks.spooldirSinks.type = hdfs
Timestamp.sinks.spooldirSinks.hdfs.path = hdfs://bigdata111:9000/flume-interceptors/%H
Timestamp.sinks.spooldirSinks.hdfs.filePrefix = events-
Timestamp.sinks.spooldirSinks.hdfs.fileType = DataStream

# Do not roll files based on the number of events
Timestamp.sinks.spooldirSinks.hdfs.rollCount = 0
# Roll to a new HDFS file once it reaches 128 MB
Timestamp.sinks.spooldirSinks.hdfs.rollSize = 134217728
# Roll to a new HDFS file every 60 seconds
Timestamp.sinks.spooldirSinks.hdfs.rollInterval = 60

# Wire up the source, channel, and sink
Timestamp.sources.spooldirSources.channels = spooldirChannels
Timestamp.sinks.spooldirSinks.channel = spooldirChannels

3.2 Adding the host name to the file name

flume-ng agent \
-n Host \
-f $FLUME_HOME/jobconf/interceptors/flume-HostName.conf \
-c $FLUME_HOME/conf \
-Dflume.root.logger=INFO,console

Input: spooldir (monitors a directory)

Interceptor: adds the host name, used in the file name

Output: HDFS

# Define the agent name and the names of the source, channel, and sink
Host.sources = spooldirSources
Host.channels = spooldirChannels
Host.sinks = spooldirSinks

# Define the source
Host.sources.spooldirSources.type = spooldir
Host.sources.spooldirSources.spoolDir = /opt/module/apache-flume-1.9.0-bin/flume_logs

# Define the interceptor: add the agent's host name to the event headers
Host.sources.spooldirSources.interceptors = i1
Host.sources.spooldirSources.interceptors.i1.type = host
Host.sources.spooldirSources.interceptors.i1.useIP = false
Host.sources.spooldirSources.interceptors.i1.hostHeader = agentHost

# Define the channel
Host.channels.spooldirChannels.type = memory
Host.channels.spooldirChannels.capacity = 1000
Host.channels.spooldirChannels.transactionCapacity = 100

# Define the sink
Host.sinks.spooldirSinks.type = hdfs
Host.sinks.spooldirSinks.hdfs.path = hdfs://bigdata111:9000/flume-interceptors/%H
# File prefix (carries the host name from the interceptor header)
Host.sinks.spooldirSinks.hdfs.filePrefix = events-%{agentHost}
# File suffix
Host.sinks.spooldirSinks.hdfs.fileSuffix = .log
Host.sinks.spooldirSinks.hdfs.fileType = DataStream
Host.sinks.spooldirSinks.hdfs.writeFormat = Text
Host.sinks.spooldirSinks.hdfs.useLocalTimeStamp = true
# Do not roll files based on the number of events
Host.sinks.spooldirSinks.hdfs.rollCount = 0
# Roll to a new HDFS file once it reaches 128 MB
Host.sinks.spooldirSinks.hdfs.rollSize = 134217728
# Roll to a new HDFS file every 60 seconds
Host.sinks.spooldirSinks.hdfs.rollInterval = 60

# Wire up the source, channel, and sink
Host.sources.spooldirSources.channels = spooldirChannels
Host.sinks.spooldirSinks.channel = spooldirChannels
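
A quick way to see the interceptor at work (a sketch with a hypothetical sample file): drop a file into the spooling directory and check that the HDFS file names carry the agent's host name, since filePrefix references the %{agentHost} header set by the interceptor.

cp /opt/test.log /opt/module/apache-flume-1.9.0-bin/flume_logs/
hdfs dfs -ls /flume-interceptors/
# file names start with events-<hostname> and end with .log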

3.3 Adding a UUID to the event headers

flume-ng agent \
-n UUID \
-f $FLUME_HOME/jobconf/interceptors/flume-UUID.conf \
-c $FLUME_HOME/conf \
-Dflume.root.logger=INFO,console

Input: exec (monitors a file)

Interceptor: adds a UUID header to each event (useful, for example, to guard against data skew)

Output: logger (the header contents can be inspected in the log output)

# Define the agent name and the names of the source, channel, and sink
UUID.sources = execSources
UUID.channels = execChannels
UUID.sinks = execSinks

# Define the source
UUID.sources.execSources.type = exec
UUID.sources.execSources.command = tail -F /opt/Andy
UUID.sources.execSources.shell = /bin/bash -c
# Define the interceptor: generate a UUID header for each event
UUID.sources.execSources.interceptors = i1
UUID.sources.execSources.interceptors.i1.type = org.apache.flume.sink.solr.morphline.UUIDInterceptor$Builder
UUID.sources.execSources.interceptors.i1.preserveExisting = true
UUID.sources.execSources.interceptors.i1.prefix = UUID_

# Define the sink
UUID.sinks.execSinks.type = logger

# Define the channel
UUID.channels.execChannels.type = memory
UUID.channels.execChannels.capacity = 1000
UUID.channels.execChannels.transactionCapacity = 100

# Wire up the source, channel, and sink
UUID.sources.execSources.channels = execChannels
UUID.sinks.execSinks.channel = execChannels
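
To check the interceptor, append a line to the monitored file and watch the agent's console: the logger sink prints each event together with its headers, and the UUID header value carries the UUID_ prefix configured above (a quick sketch):

echo "uuid test" >> /opt/Andy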

3.4 Regex search-and-replace interceptor

flume-ng agent \
-n Search \
-f $FLUME_HOME/jobconf/interceptors/flume-Search.conf \
-c $FLUME_HOME/conf \
-Dflume.root.logger=INFO,console

Input: exec (monitors a file)

Interceptor: matches text with a regex and replaces it

Output: logger (for easy inspection)

# Define the agent name and the names of the source, channel, and sink
Search.sources = execSources
Search.channels = execChannels
Search.sinks = execSinks

# Define the source
Search.sources.execSources.type = exec
Search.sources.execSources.command = tail -F /opt/Andy
Search.sources.execSources.shell = /bin/bash -c
# Define the interceptor: search_replace
Search.sources.execSources.interceptors = i1
Search.sources.execSources.interceptors.i1.type = search_replace
# Regex: replace every run of digits with itstar
Search.sources.execSources.interceptors.i1.searchPattern = [0-9]+
Search.sources.execSources.interceptors.i1.replaceString = itstar
Search.sources.execSources.interceptors.i1.charset = UTF-8

# Define the sink
Search.sinks.execSinks.type = logger

# Define the channel
Search.channels.execChannels.type = memory
Search.channels.execChannels.capacity = 1000
Search.channels.execChannels.transactionCapacity = 100

# Wire up the source, channel, and sink
Search.sources.execSources.channels = execChannels
Search.sinks.execSinks.channel = execChannels
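
Quick test: every run of digits in the event body should be replaced by itstar before the event reaches the logger sink (a sketch; keep the line short, since the logger sink only previews the first bytes of the body):

echo "id 42 ok" >> /opt/Andy
# the logged body reads: id itstar ok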

3.5 Regex interceptors (regex_filter and regex_extractor)

flume-ng agent \
-n Regex \
-f $FLUME_HOME/jobconf/interceptors/flume-RegexFilter.conf \
-c $FLUME_HOME/conf \
-Dflume.root.logger=INFO,console

(Use flume-RegexFilter.conf for the regex_filter example and flume-RegexExtractor.conf for the regex_extractor example below.)

Input: exec (monitors a file)

Interceptor: regex_filter (with excludeEvents = true, events whose body matches the regex are dropped; with excludeEvents = false, only matching events are kept)

Output: logger (for easy inspection)

# Define the agent name and the names of the source, channel, and sink
Regex.sources = execSources
Regex.channels = execChannels
Regex.sinks = execSinks

# Define the source
Regex.sources.execSources.type = exec
Regex.sources.execSources.command = tail -F /opt/Andy
Regex.sources.execSources.shell = /bin/bash -c

# Define the interceptor: regex_filter
Regex.sources.execSources.interceptors = i1
Regex.sources.execSources.interceptors.i1.type = regex_filter
# Regex rule: with excludeEvents = true, events whose body starts with A are dropped
Regex.sources.execSources.interceptors.i1.regex = ^A.*
Regex.sources.execSources.interceptors.i1.excludeEvents = true

# Define the sink
Regex.sinks.execSinks.type = logger

# Define the channel
Regex.channels.execChannels.type = memory
Regex.channels.execChannels.capacity = 1000
Regex.channels.execChannels.transactionCapacity = 100

# Wire up the source, channel, and sink
Regex.sources.execSources.channels = execChannels
Regex.sinks.execSinks.channel = execChannels
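
Quick test for regex_filter with excludeEvents = true:

echo "Apple starts with A" >> /opt/Andy      # matches ^A.* and is dropped
echo "banana does not" >> /opt/Andy          # does not match and is printed by the logger sink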

Input: exec (monitors a file)

Interceptor: regex_extractor (does not drop events; it applies the regex and writes the capture groups into the event headers)

Output: logger (for easy inspection)

# Define the agent name and the names of the source, channel, and sink
Regex.sources = execSources
Regex.channels = execChannels
Regex.sinks = execSinks

# Define the source
Regex.sources.execSources.type = exec
Regex.sources.execSources.command = tail -F /opt/Andy
Regex.sources.execSources.shell = /bin/bash -c
# Define the interceptor: regex_extractor
Regex.sources.execSources.interceptors = i1
Regex.sources.execSources.interceptors.i1.type = regex_extractor
# Define the regex; the two capture groups become headers named cookieid and ip
Regex.sources.execSources.interceptors.i1.regex = hostname is (.*?) ip is (.*)
Regex.sources.execSources.interceptors.i1.serializers = s1 s2
Regex.sources.execSources.interceptors.i1.serializers.s1.name = cookieid
Regex.sources.execSources.interceptors.i1.serializers.s2.name = ip
# Define the sink
Regex.sinks.execSinks.type = logger

# Define the channel
Regex.channels.execChannels.type = memory
Regex.channels.execChannels.capacity = 1000
Regex.channels.execChannels.transactionCapacity = 100

# Wire up the source, channel, and sink
Regex.sources.execSources.channels = execChannels
Regex.sinks.execSinks.channel = execChannels
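
Quick test for regex_extractor (any line matching the pattern works; the values here are only an example):

echo "hostname is bigdata111 ip is 192.168.1.111" >> /opt/Andy
# the two capture groups end up in the headers as cookieid=bigdata111 and ip=192.168.1.111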

3.6 Custom interceptor

Run

$FLUME_HOME/bin/flume-ng agent \
-c conf/ \
-n myInterceptor \
-f $FLUME_HOME/jobconf/interceptors/flume-myInterceptor.conf \
-C /opt/test/sparkStudy-1.0-SNAPSHOT.jar \
-Dflume.root.logger=DEBUG,console

The jar (interceptor code)

package bigdata12.homework11.flume;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.util.ArrayList;
import java.util.List;

/**
 * Flume custom interceptor:
 * converts the event body to upper case.
 * @date 2019/7/3
 * @author Fantome
 */
public class FlumeInterceptor implements Interceptor {

    @Override
    public void initialize() {

    }

    /**
     * Rebuild a single event.
     * @param event the incoming event
     * @return the event with its body replaced
     */
    @Override
    public Event intercept(Event event) {
        // headers are available here as a Map<String, String> (not used in this example)
        event.getHeaders();
        // the event body is a byte array
        byte[] bodyBytes = event.getBody();
        // convert the bytes to a String, then to upper case
        String strBody = new String(bodyBytes).toUpperCase();
        // convert the String back to bytes
        byte[] outBytes = strBody.getBytes();
        // put the new body back into the event
        event.setBody(outBytes);
        // pass the rebuilt event downstream
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        // list to hold the rebuilt events
        List<Event> list1 = new ArrayList<>();
        // rebuild each event in turn
        for (Event e : list) {
            list1.add(intercept(e));
        }
        return list1;
    }

    /**
     * The class to reference in the conf file:
     * bigdata12.homework11.flume.FlumeInterceptor$Builder
     */
    public static class Builder implements Interceptor.Builder{
        /**
         * Create the custom interceptor instance and return it.
         * @return the interceptor
         */
        @Override
        public Interceptor build() {
            FlumeInterceptor flumeInterceptor = new FlumeInterceptor();
            return flumeInterceptor;
        }
        @Override
        public void configure(Context context) {

        }
    }
    @Override
    public void close() {

    }
}
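
To get the class onto the agent's classpath, package it and copy the jar to the path used by the -C option in the run command above (a sketch assuming the class lives in a Maven project; the build tool and target path are assumptions, the jar name matches the one in the command):

mvn clean package
scp target/sparkStudy-1.0-SNAPSHOT.jar bigdata111:/opt/test/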

The conf file

Input: exec (monitors a file)

Interceptor: loaded from the jar; (important) the type is the fully-qualified class name plus $Builder, i.e. the nested class that implements Interceptor.Builder

Output: HDFS

#1.agent =>myInterceptor
myInterceptor.sources = execSources
myInterceptor.sinks =execSinks
myInterceptor.channels = execChannels
 
# Describe/configure the source
myInterceptor.sources.execSources.type = exec
myInterceptor.sources.execSources.command = tail -F /opt/Andy
myInterceptor.sources.execSources.interceptors = i1

# (Important) fully-qualified class name plus $Builder
myInterceptor.sources.execSources.interceptors.i1.type = bigdata12.homework11.flume.FlumeInterceptor$Builder
 
# Describe the sink
myInterceptor.sinks.execSinks.type = hdfs
myInterceptor.sinks.execSinks.hdfs.path = /ToUpCase1
myInterceptor.sinks.execSinks.hdfs.filePrefix = events-
myInterceptor.sinks.execSinks.hdfs.round = true
myInterceptor.sinks.execSinks.hdfs.roundValue = 10
myInterceptor.sinks.execSinks.hdfs.roundUnit = minute
myInterceptor.sinks.execSinks.hdfs.rollInterval = 3
myInterceptor.sinks.execSinks.hdfs.rollSize = 20
myInterceptor.sinks.execSinks.hdfs.rollCount = 5
myInterceptor.sinks.execSinks.hdfs.batchSize = 1
myInterceptor.sinks.execSinks.hdfs.useLocalTimeStamp = true
# Output file type: the default is SequenceFile; DataStream writes plain text
myInterceptor.sinks.execSinks.hdfs.fileType = DataStream
 
# Use a channel which buffers events in memory
myInterceptor.channels.execChannels.type = memory
myInterceptor.channels.execChannels.capacity = 1000
myInterceptor.channels.execChannels.transactionCapacity = 100
 
# Bind the source and sink to the channel
myInterceptor.sources.execSources.channels = execChannels
myInterceptor.sinks.execSinks.channel = execChannels
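
Quick test: lines appended to /opt/Andy should appear upper-cased in the files under /ToUpCase1 on HDFS (a sketch; the exact file names depend on the roll settings above):

echo "hello interceptor" >> /opt/Andy
hdfs dfs -cat /ToUpCase1/events-*
# HELLO INTERCEPTOR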