工具准备
- 一部安装好的linux服务器
- flume压缩包 apache-flume-1.8.0-bin.tar.gz
安装步骤
- 上传flume压缩包 /opt/software/flume
- 解压
tar -xvf apache-flume-1.8.0-bin.tar.gz
- 删除压缩包节省空间
rm apache-flume-1.8.0-bin.tar.gz
- 修改flume的配置文件
将flume-env.sh.template重命名为flume-env.sh
mv flume-env.sh.template flume-env.sh
修改flume-env.sh配置文件
解注释JAVA_HOME,并修改jdk路径
- 配置环境变量
export FLUME_HOME=/opt/software/flume/apache-flume-1.8.0-bin
export PATH=$PATH:$FLUME_HOME/bin
使用flume
- 在根目录下创建一个文件夹叫flume,创建一个.conf文件,用来定义一个agent. 所谓agent就是flume的执行单元。
访问flume官网
http://flume.apache.org/
复制这个例子于.conf文件中
做出一番修改:
监听文件夹
#example.conf: A single-node Flume configuration
#为agent上的组件sources,sinks,channels取别名
#sources:需要监听的对象
#channels:指通道,用于连接sources和sinks
#sinks:指输出
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#指定监听的类型
a1.sources.r1.type = spooldir
#指定监听的对象
a1.sources.r1.spoolDir=/logs
#指定监听的消息输出到哪里logger为控制台
a1.sinks.k1.type = logger
#设置通道
#内存
a1.channels.c1.type = memory
#内存容量为1000kb
a1.channels.c1.capacity = 1000
#内存事务回滚
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
监听文件
#为agent上的组件sources,sinks,channels取别名
#sources:需要监听的对象
#channels:指通道,用于连接sources和sinks
#sinks:指输出
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#指定监听的类型
a1.sources.r1.type = exec
#指定监听的对象
a1.sources.r1.command=tail -F /a.txt
#指定监听的消息输出到哪里logger为控制台
a1.sinks.k1.type = logger
#设置通道
#内存
a1.channels.c1.type = memory
#内存容量为1000kb
a1.channels.c1.capacity = 1000
#内存事务回滚
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
执行flume
flume-ng agent --conf /opt/software/flume/apache-flume-1.8.0-bin/conf --conf-file /flume/flume2.conf --name a1 -Dflume.root.logger=INFO,console
采集一个flume数据到达HDFS
在/flume下创建一个.conf文件
##channels:指通道,用于连接sources和sinkssinks:指输出
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#指定监听的类型
a1.sources.r1.type = exec
#指定监听的对象
a1.sources.r1.command=tail -F /a.txt
#定义拦截器,为消息添加时间戳
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = org.apache.flume.interceptor.TimestampInterceptor$Builder
#指定监听的消息输出到hdfs
a1.sinks.k1.type = hdfs
#指定上传的hdfs路径
a1.sinks.k1.hdfs.path = hdfs://Centos161:9000/%Y%m%d
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.fileType = DataStream
#不按照条数生成文件
a1.sinks.k1.hdfs.rollCount = 0
#HDFS上的文件达到128M时生成一个文件
a1.sinks.k1.hdfs.rollSize = 134217728
#HDFS上的文件达到60秒生成一个文件
a1.sinks.k1.hdfs.rollInterval = 60
#设置通道
#内存
a1.channels.c1.type = memory
#内存容量为1000kb
a1.channels.c1.capacity = 1000
#内存事务回滚
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
采集多个flume数据到hdfs上
在Centos161和Centos162机子上创建agent
#为agent上的组件sources,sinks,channels取别名
#sources:需要监听的对象
#channels:指通道,用于连接sources和sinks
#sinks:指输出
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#指定监听的类型
a1.sources.r1.type = spooldir
#指定监听的对象
a1.sources.r1.spoolDir=/logs
#指定监听的消息输出到哪里logger为控制台
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = Centos163
a1.sinks.k1.port = 4545
#设置通道
#内存
a1.channels.c1.type = memory
#内存容量为1000kb
a1.channels.c1.capacity = 1000
#内存事务回滚
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
在Centos163上创建agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = avro
a1.sources.r1.bind = Centos163
a1.sources.r1.port = 4545
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = org.apache.flume.interceptor.TimestampInterceptor$Builder
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://Centos161:9000/flume/%Y%m%d
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollInterval = 6
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
先运行Centos163,再运行Centos162和Centos161