Flume-----八种采集方案

最新推荐文章于 2024-07-17 03:20:51 发布

夜未央，温柔乡

最新推荐文章于 2024-07-17 03:20:51 发布

阅读量954

点赞数 2

分类专栏： Hadoop 文章标签： flume hadoop big data

本文链接：https://blog.csdn.net/GTmustang/article/details/122139179

版权

Hadoop 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

flume八种采集方案

案例1）avro+memory+logger

logger通常用于测试，数据流中的event最终显示在屏幕上
1）采集方案的配置
[root@hadoop01 ~]# mkdir flumeconf
[root@hadoop01 ~]# vim ./flumeconf/avro-mem-logger.properties
#定义三大组件的名称  和关联
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

#定义Source的相关属性
a1.sources.r1.type = avro
#绑定本机的ip或者是hostname
a1.sources.r1.bind = hadoop01 
#要监听的本机上的某一个端口号，  当程序启动时，该端口号就会被使用
a1.sources.r1.port = 10086

#定义channel的相关属性
a1.channels.c1.type = memory
#内存存储容量 event的最大数量
a1.channels.c1.capacity=1000
#每个事务中，channel从source接收或提供给sink的event的最大数量
a1.channels.c1.transactionCapacity=100

#定义Sink的相关属性
a1.sinks.k1.type=logger
a1.sinks.k1.maxBytesToLog = 16
2）启动方案
flume-ng agent -c /usr/local/flume/conf -f ./flumeconf/avro-mem-logger.properties -n a1 -Dflume.root.logger=INFO,console
3）测试：因为用的是avro的source，所以需要使用avro-client进行测试
[root@hadoop01 ~]# mkdir flumedata
[root@hadoop01 ~]# echo "hellworld" > flumedata/data.txt
[root@hadoop01 ~]# flume-ng avro-client -c /usr/local/flume/conf/ -H hadoop01 -p 10086 -F ./flumedata/data.txt

案例2）exec+memory+logger

注意：使用exec源时，被监听的文件要提前创建
1）采集方案的编写
[root@hadoop01 ~]# vim ./flumeconf/exec-mem-logger.properties
#定义三大组件的名称  和关联
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

#定义Source的相关属性
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F ./flumedata/data.txt

#定义channel的相关属性
a1.channels.c1.type = memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100

#定义Sink的相关属性
a1.sinks.k1.type=logger
a1.sinks.k1.maxBytesToLog = 16
2）启动采集方案
flume-ng agent -f ./flumeconf/exec-mem-logger.properties -n a1 -Dflume.root.logger=INFO,console
3）测试
echo "helloworld" >> ./flumedata/data.txt

案例3）exec+memory+hdfs

1）采集方案的编写
[root@hadoop01 ~]# vim ./flumeconf/exec-mem-hdfs.conf
#定义三大组件的名称  和关联
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

#定义Source的相关属性
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F ./flumedata/data.txt

#定义channel的相关属性
a1.channels.c1.type = memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100

#定义Sink的相关属性
a1.sinks.k1.type=hdfs
a1.sinks.k1.hdfs.path = /flume/%Y-%m-%d/%H-%M
a1.sinks.k1.hdfs.filePrefix = wcm
a1.sinks.k1.hdfs.fileSuffix = .wsy
#下面三个条件满足其一，就会产生新文件
#按时间滚动生成新文件的周期，单位是秒；如果设置为0，表示不按时间滚动（并不是不会产生新文件）。
a1.sinks.k1.hdfs.rollInterval = 60    
#当前文件达到1000字节，就会产生新文件
a1.sinks.k1.hdfs.rollSize = 1000
#当前文件的event数量达到10条，就会产生新文件
a1.sinks.k1.hdfs.rollCount = 10
#如果writeFormat指定了Text,那么fileType必须是DataStream
a1.sinks.k1.hdfs.writeFormat = Text
a1.sinks.k1.hdfs.fileType = DataStream

#round用于指定是否对路径中的时间戳向下取整，从而控制多久生成一个新的文件夹；false表示不取整
a1.sinks.k1.hdfs.round = true      
#设置文件夹滚动的时间单位
a1.sinks.k1.hdfs.roundUnit = minute
#设置时间取整的数值，与roundUnit配合使用：这里表示按每2分钟向下取整，即每2分钟生成一个新文件夹
a1.sinks.k1.hdfs.roundValue = 2
#如果目录上设置了时间格式字符串，比如%Y等，那么下面的属性应该设置为true，除非event的header里已经有一个叫timestamp的消息头
a1.sinks.k1.hdfs.useLocalTimeStamp = true
2）启动方案
flume-ng agent -f ./flumeconf/exec-mem-hdfs.conf -n a1 -Dflume.root.logger=INFO,console
3）测试
[root@hadoop01 ~]# echo "aaa " >> flumedata/data.txt
[root@hadoop01 ~]# echo "aaa " >> flumedata/data.txt
[root@hadoop01 ~]# echo "aaa " >> flumedata/data.txt
[root@hadoop01 ~]# echo "aaa " >> flumedata/data.txt
[root@hadoop01 ~]# echo "aaa " >> flumedata/data.txt
[root@hadoop01 ~]# echo "aaa " >> flumedata/data.txt

案例4）spool+memory+logger

spool源用来监听目录下的新文件，并通过重命名（添加后缀）的方式来标记该文件已经采集完。注意，监听的目录必须提前存在。它是一个可靠源。

相比之下，exec源不可靠：当agent异常退出或event写入channel失败时，数据可能丢失。
1）采集方案的编写
[root@hadoop01 ~]# vim flumeconf/spool-mem-logger.properties
#列出每个组件的名称
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1

#设置source组件的属性
a1.sources.r1.type=spooldir
#要监听的目录必须提前创建
a1.sources.r1.spoolDir=/root/data/subdir
a1.sources.r1.fileSuffix=.gyy
a1.sources.r1.deletePolicy=never
a1.sources.r1.fileHeader=false
a1.sources.r1.fileHeaderKey=file
a1.sources.r1.basenameHeader=false
a1.sources.r1.basenameHeaderKey=basename
a1.sources.r1.batchSize=100
a1.sources.r1.inputCharset=UTF-8
a1.sources.r1.bufferMaxLines=1000

#设置channel组件的属性
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100

#设置sink组件的属性
a1.sinks.s1.type=logger
a1.sinks.s1.maxBytesToLog=16
2）启动方案
flume-ng agent -f ./flumeconf/spool-mem-logger.properties -n a1 -Dflume.root.logger=INFO,console
3）测试
[root@hadoop01 ~]# echo "helloworld" >>data/subdir/a.txt
[root@hadoop01 ~]# cd data/subdir/
[root@hadoop01 subdir]# ll
总用量 4
-rw-r--r-- 1 root root 11 12月 23 16:25 a.txt.gyy
[root@hadoop01 subdir]# echo "helloworld" >>b.txt
[root@hadoop01 subdir]# echo "helloworld" >>c.txt
[root@hadoop01 subdir]# echo "helloworld" >>d.txt
[root@hadoop01 subdir]# ll
总用量 16
-rw-r--r-- 1 root root 11 12月 23 16:25 a.txt.gyy
-rw-r--r-- 1 root root 11 12月 23 16:25 b.txt.gyy
-rw-r--r-- 1 root root 11 12月 23 16:25 c.txt.gyy
-rw-r--r-- 1 root root 11 12月 23 16:25 d.txt.gyy
注意：文件采集完成后会被重命名（加上.gyy后缀），因此之后放入该目录的新文件不能与已经采集过的文件同名。

案例5）spool+file+hdfs

1）方案的编写
[root@hadoop01 ~]# vim flumeconf/spool-file-hdfs.properties
#命名，并关联
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1

#设置spool源
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/root/data/subdir
a1.sources.r1.fileSuffix=.gyy
a1.sources.r1.deletePolicy=never
a1.sources.r1.fileHeader=false
a1.sources.r1.fileHeaderKey=file
a1.sources.r1.basenameHeader=false
a1.sources.r1.basenameHeaderKey=basename
a1.sources.r1.batchSize=100
a1.sources.r1.inputCharset=UTF-8
a1.sources.r1.bufferMaxLines=1000

#设置file的channel
a1.channels.c1.type=file

#设置hdfs的sink
a1.sinks.s1.type=hdfs
a1.sinks.s1.hdfs.path=hdfs://hadoop01:8020/flume/hdfs/%Y
a1.sinks.s1.hdfs.useLocalTimeStamp=true
a1.sinks.s1.hdfs.filePrefix=michael
a1.sinks.s1.hdfs.fileSuffix=.gyy
a1.sinks.s1.hdfs.rollInterval=60
a1.sinks.s1.hdfs.rollSize=1024
a1.sinks.s1.hdfs.rollCount=10
a1.sinks.s1.hdfs.batchSize=100
a1.sinks.s1.hdfs.writeFormat=Text
a1.sinks.s1.hdfs.fileType=DataStream
a1.sinks.s1.hdfs.round=false
a1.sinks.s1.hdfs.roundValue=2      
a1.sinks.s1.hdfs.roundUnit=minute
2）启动采集方案
[root@hadoop01 ~]# flume-ng agent  -f ./flumeconf/spool-file-hdfs.properties -n a1  -Dflume.root.logger=INFO,console
3）测试
[root@hadoop01 subdir]# echo "helloworld" >>e.txt
[root@hadoop01 subdir]# echo "helloworld" >>f.txt
[root@hadoop01 subdir]# echo "helloworld" >>g.txt

案例6）http+memory+logger

1）采集方案的编写
[root@hadoop01 ~]# vim flumeconf/http-mem-logger.properties
#list the names of the three core components
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1

#设置每个组件的接口以及属性
a1.sources.r1.type=http
#该源要监听的host或者是ip
a1.sources.r1.bind=hadoop01
#该源要监听的port
a1.sources.r1.port=10086
a1.sources.r1.handler=org.apache.flume.source.http.JSONHandler


a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100

a1.sinks.s1.type=logger
a1.sinks.s1.maxBytesToLog=32
2）启动方案
[root@hadoop01 ~]# flume-ng agent  -f ./flumeconf/http-mem-logger.properties -n a1  -Dflume.root.logger=INFO,console
3）使用curl指令发送POST请求进行测试
[root@hadoop03 ~]# curl -X POST -d '[{"headers":{"girlfriend1":"zhangjunning","girlfriend":"nazha"},"body":"they are my girlfriends"}]' http://hadoop01:10086


解析：
-X  用来指定HTTP的请求方法，如POST或者GET
-d  用来指定要发送的请求体数据
最后一个参数表示要将数据发送到的地址（URL）。

案例7）syslogtcp+memory+logger

1）采集方案的编写
[root@hadoop01 ~]# vim flumeconf/syslogtcp-mem-logger.properties
#list the names of the three core components
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1

#设置每个组件的接口以及属性
a1.sources.r1.type=syslogtcp
a1.sources.r1.host=hadoop01
a1.sources.r1.port=10086
a1.sources.r1.eventSize=2500


a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100

a1.sinks.s1.type=logger
a1.sinks.s1.maxBytesToLog=32
2）启动方案
[root@hadoop01 ~]# flume-ng agent  -f ./flumeconf/syslogtcp-mem-logger.properties -n a1  -Dflume.root.logger=INFO,console
3）使用nc指令通过TCP协议发送数据，进行测试
先安装nc指令：yum -y install nmap-ncat
[root@hadoop03 ~]#  echo "helloworld" | nc hadoop01 10086
nc的语法:   
  nc  host  port

案例8）taildir+memory+hdfs
taildir与spooling这两个源的比较

相同点：
1. 都是可靠源
2. 监听的都是目录
3. 该目录一定要提前创建
不同点：
1. spooling监听完的文件会被重命名
2. spooling监听的目录里的文件不能重名
3. spooling监听的是目录里的新文件
4. taildir监听的文件不会被重命名，可以一直监听文件里的新行。

1）采集方案的编写
[root@hadoop01 ~]# vim flumeconf/taildir-mem-hdfs.properties
#命名，并关联
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1

#设置taildir源
a1.sources.r1.type=TAILDIR
#设置要监听的文件所属组，可以设置多个
a1.sources.r1.filegroups=g1 g2
#规定每一个组要监听的文件的绝对路径，可以使用正则表达式来表示一批文件
a1.sources.r1.filegroups.g1=/root/data/dir2/.*.txt
a1.sources.r1.filegroups.g2=/root/data/dir2/.*.csv
#a1.sources.r1.positionFile=/root/taildir_position.json

#设置channel组件的属性
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100

#设置hdfs的sink
a1.sinks.s1.type=hdfs
a1.sinks.s1.hdfs.path=hdfs://hadoop01:8020/flume/hdfs/%Y-%m
a1.sinks.s1.hdfs.useLocalTimeStamp=true
a1.sinks.s1.hdfs.filePrefix=michael
a1.sinks.s1.hdfs.fileSuffix=.gyy
a1.sinks.s1.hdfs.rollInterval=60
a1.sinks.s1.hdfs.rollSize=1024
a1.sinks.s1.hdfs.rollCount=10
a1.sinks.s1.hdfs.batchSize=100
a1.sinks.s1.hdfs.writeFormat=Text
a1.sinks.s1.hdfs.fileType=DataStream
a1.sinks.s1.hdfs.round=true
a1.sinks.s1.hdfs.roundValue=2      
a1.sinks.s1.hdfs.roundUnit=minute
2）启动方案
[root@hadoop01 ~]# flume-ng agent  -f ./flumeconf/taildir-mem-hdfs.properties -n a1  -Dflume.root.logger=INFO,console
3）测试
目录要提前创建出来
[root@hadoop01 dir2]# echo "chenyun" > a.txt
[root@hadoop01 dir2]# echo "chenyun" >> a.txt
[root@hadoop01 dir2]# echo "chenyun" >> a.csv
[root@hadoop01 dir2]# echo "chenyun" >> a.csv
[root@hadoop01 dir2]# echo "chenyun" >> a.csv
[root@hadoop01 dir2]# echo "chenyun" >> a.csv
[root@hadoop01 dir2]# echo "chenyun" >> a.json   # 不会被采集的