flume
Official user guide: https://flume.apache.org/FlumeUserGuide.html
Example 1
This example uses the spooling directory source and the logger sink.
spool-to-logger.properties
agent1.sources = source1
agent1.channels = channel1
agent1.sinks = sink1
# For each one of the sources, the type is defined
agent1.sources.source1.type = spooldir
agent1.sources.source1.spoolDir = /tmp/spooldir
# The channel can be defined as follows.
agent1.sources.source1.channels = channel1
# Each sink's type must be defined
agent1.sinks.sink1.type = logger
#Specify the channel the sink should use
agent1.sinks.sink1.channel = channel1
# Each channel's type is defined.
agent1.channels.channel1.type = file
Start the Flume agent:
flume-ng agent --conf-file spool-to-logger.properties --name agent1 --conf $FLUME_HOME/conf -Dflume.root.logger=INFO,console
Create a log file:
echo "hello flume" > /tmp/spooldir/.file1.txt
mv /tmp/spooldir/.file1.txt /tmp/spooldir/file1.txt
Flume output
Note that the source renames the completed file to file1.txt.COMPLETED. This indicates that Flume has finished processing the file and will take no further action on it.
HDFS sink
This example uses the spooling directory source and the HDFS sink.
spool-to-hdfs.properties
agent1.sources = source1
agent1.channels = channel1
agent1.sinks = sink1
# For each one of the sources, the type is defined
agent1.sources.source1.type = spooldir
agent1.sources.source1.spoolDir = /tmp/spooldir
# The channel can be defined as follows.
agent1.sources.source1.channels = channel1
# Each sink's type must be defined
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = /tmp/flume
agent1.sinks.sink1.hdfs.filePrefix = events
agent1.sinks.sink1.hdfs.fileSuffix = .log
agent1.sinks.sink1.hdfs.inUsePrefix = _
agent1.sinks.sink1.hdfs.fileType = DataStream
#Specify the channel the sink should use
agent1.sinks.sink1.channel = channel1
# Each channel's type is defined.
agent1.channels.channel1.type = file
Start the Flume agent:
flume-ng agent --conf-file spool-to-hdfs.properties --name agent1 --conf $FLUME_HOME/conf -Dflume.root.logger=INFO,console
Running this produced an error (a Guava version conflict between Flume and Hadoop).
Fix: copy the guava jar that ships with Hadoop into Flume's lib directory, replacing Flume's older guava jar.
Result:
This time the events are delivered to the HDFS sink and written to a file. A file that is still being written to carries a ".tmp" suffix to show that processing is not yet complete. In this example the hdfs.inUsePrefix property is set to an underscore (the default is empty), which additionally prepends an underscore to the name of any file being written. This is useful because MapReduce ignores files whose names begin with an underscore. A typical in-progress file is therefore named _events.1399295780136.log.tmp, where the number is a timestamp generated by the HDFS sink.
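The naming scheme can be illustrated with a small standalone sketch. The composition used here (in-use prefix + file prefix + counter + suffix + the default in-use suffix ".tmp") is an assumption that matches the file name above, not the HDFS sink's actual source code:

```java
// Hypothetical sketch of how the HDFS sink composes file names from the
// properties above; the composition is inferred from the observed name
// _events.1399295780136.log.tmp, not taken from Flume's source code.
public class HdfsSinkNaming {
    // Name while the file is still being written (in-use suffix defaults to .tmp)
    static String inProgressName(String inUsePrefix, String filePrefix,
                                 long counter, String fileSuffix, String inUseSuffix) {
        return inUsePrefix + filePrefix + "." + counter + fileSuffix + inUseSuffix;
    }

    // Name after the file is closed and renamed
    static String finalName(String filePrefix, long counter, String fileSuffix) {
        return filePrefix + "." + counter + fileSuffix;
    }

    public static void main(String[] args) {
        // hdfs.inUsePrefix = _, filePrefix = events, fileSuffix = .log;
        // the counter is a timestamp generated by the sink.
        System.out.println(inProgressName("_", "events", 1399295780136L, ".log", ".tmp"));
        // prints _events.1399295780136.log.tmp
        System.out.println(finalName("events", 1399295780136L, ".log"));
        // prints events.1399295780136.log
    }
}
```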
Partitioning and interceptors
To store the output of the HDFS sink above in partitions, only the hdfs.path property needs to change, so that it uses subdirectories with time-format escape sequences:
agent1.sinks.sink1.hdfs.path = /tmp/flume/%Y-%m-%d
Which partition a Flume event is written to is determined by the timestamp in the event's header. By default, event headers carry no timestamp, but one can be added by a Flume interceptor.
Interceptors are simple plug-in components that sit between a source and its channels. Before events received by the source are written to a channel, an interceptor can transform or drop them. Each interceptor processes only the events received by its own source, and custom interceptors can be written. To add a timestamp interceptor:
agent1.sources.source1.interceptors = interceptor1
agent1.sources.source1.interceptors.interceptor1.type = timestamp
Note:
With multiple tiers of Flume agents there can be a noticeable gap between an event's creation time and its write time. In that case, set the HDFS sink's hdfs.useLocalTimeStamp property so that partitioning uses a timestamp generated by the Flume agent that runs the HDFS sink:
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
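As a minimal illustration of how a timestamp resolves the %Y-%m-%d escape into a daily partition directory, the sketch below formats an epoch-millisecond timestamp with java.text.SimpleDateFormat. Flume performs its own escape substitution internally; this only models the mapping, and the UTC time zone is fixed here purely to make the example reproducible:

```java
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;

public class PartitionPath {
    // Resolve a %Y-%m-%d style path from an event's timestamp header (epoch millis)
    static String resolve(String basePath, long timestampMillis) {
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd");
        fmt.setTimeZone(TimeZone.getTimeZone("UTC")); // fixed zone for reproducibility
        return basePath + "/" + fmt.format(new Date(timestampMillis));
    }

    public static void main(String[] args) {
        System.out.println(resolve("/tmp/flume", 1399295780136L));
        // prints /tmp/flume/2014-05-05
    }
}
```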
File formats
hdfs.fileType defaults to SequenceFile.
The following configuration writes Avro files.
spool-to-hdfs-avro.properties
agent1.sources = source1
agent1.channels = channel1
agent1.sinks = sink1
# For each one of the sources, the type is defined
agent1.sources.source1.type = spooldir
agent1.sources.source1.spoolDir = /tmp/spooldir
# The channel can be defined as follows.
agent1.sources.source1.channels = channel1
agent1.sources.source1.interceptors = interceptor1
agent1.sources.source1.interceptors.interceptor1.type = timestamp
# Each sink's type must be defined
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = /tmp/flume/%Y-%m-%d
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
agent1.sinks.sink1.hdfs.filePrefix = events
agent1.sinks.sink1.hdfs.fileSuffix = .avro
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.serializer = avro_event
agent1.sinks.sink1.serializer.compressionCodec = snappy
#Specify the channel the sink should use
agent1.sinks.sink1.channel = channel1
# Each channel's type is defined.
agent1.channels.channel1.type = file
Start the Flume agent:
flume-ng agent --conf-file spool-to-hdfs-avro.properties --name agent1 --conf $FLUME_HOME/conf -Dflume.root.logger=INFO,console
Result:
Chaining multiple agents
flume-agent1.properties
#tail-avro-avro-logger.conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
# Tail this file
a1.sources.r1.command = tail -F /tmp/spooldir/access_log
# Describe the sink
## The sink-side avro component is a data sender
a1.sinks.k1.type = avro
# Push to this host (user-defined)
a1.sinks.k1.hostname = master
# Port
a1.sinks.k1.port = 10000
# Batch size
a1.sinks.k1.batch-size = 10
a1.sinks.k1.connect-timeout = 30000
# Use a channel which buffers events in memory
# In-memory channel
a1.channels.c1.type = memory
# Channel capacity: the maximum number of events held in the channel (events, not bytes)
a1.channels.c1.capacity = 1000
# Transaction capacity: the maximum number of events per transaction between source and channel, or channel and sink
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
# Wire them together
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
flume-agent2.properties
a2.sources = r2
a2.sinks = s2
a2.channels = c2
## The source-side avro component is a receiving server
a2.sources.r2.type = avro
# Bind to an IP and port
a2.sources.r2.bind = master
a2.sources.r2.port = 10000
a2.sources.r2.interceptors = interceptor1
a2.sources.r2.interceptors.interceptor1.type = timestamp
a2.sinks.s2.type = hdfs
# HDFS directory
a2.sinks.s2.hdfs.path = /tmp/flume/%Y-%m-%d
# File name prefix on HDFS
a2.sinks.s2.hdfs.filePrefix = access_log
# Batch size: number of events written to the file per flush to HDFS
a2.sinks.s2.hdfs.batchSize= 100
# File type of the stored data (a compressed format could also be used)
a2.sinks.s2.hdfs.fileType = DataStream
# Write format
a2.sinks.s2.hdfs.writeFormat = Text
a2.sinks.s2.hdfs.useLocalTimeStamp = true
# A new file is rolled as soon as any one of the three thresholds below is reached
# Roll by file size (bytes)
a2.sinks.s2.hdfs.rollSize = 10240
# Roll by number of events
a2.sinks.s2.hdfs.rollCount = 10
# Roll by elapsed time (seconds)
a2.sinks.s2.hdfs.rollInterval = 60
# Together, the settings below would roll a new directory every 10 minutes
# Enable rounded-down (bucketed) directory generation
#a2.sinks.s2.hdfs.round = true
# Round down in steps of 10 (unit set below)
#a2.sinks.s2.hdfs.roundValue = 10
# Unit: minutes
#a2.sinks.s2.hdfs.roundUnit = minute
# Channel type
a2.channels.c2.type = memory
# Channel capacity (maximum number of events, not bytes)
a2.channels.c2.capacity = 1000
# Transaction capacity: maximum number of events per transaction
a2.channels.c2.transactionCapacity = 100
a2.sources.r2.channels = c2
a2.sinks.s2.channel = c2
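The "first threshold wins" roll behaviour configured above (rollSize = 10240 bytes, rollCount = 10 events, rollInterval = 60 s) can be sketched as follows. This models the documented semantics of the three roll properties, not the HDFS sink's actual implementation:

```java
// Sketch of the roll policy: a new file is started as soon as any one
// of the size, count, or time thresholds is reached.
public class RollPolicy {
    final long rollSize, rollCount, rollIntervalMillis, openedAtMillis;
    long bytesWritten, eventsWritten;

    RollPolicy(long rollSize, long rollCount, long rollIntervalSeconds, long nowMillis) {
        this.rollSize = rollSize;
        this.rollCount = rollCount;
        this.rollIntervalMillis = rollIntervalSeconds * 1000;
        this.openedAtMillis = nowMillis;
    }

    // Record one event appended to the current file
    void append(long eventBytes) { bytesWritten += eventBytes; eventsWritten++; }

    // True once any configured threshold has been reached
    boolean shouldRoll(long nowMillis) {
        return bytesWritten >= rollSize
                || eventsWritten >= rollCount
                || nowMillis - openedAtMillis >= rollIntervalMillis;
    }

    public static void main(String[] args) {
        RollPolicy p = new RollPolicy(10240, 10, 60, 0);
        for (int i = 0; i < 9; i++) p.append(100); // 9 events, 900 bytes
        System.out.println(p.shouldRoll(30_000));  // prints false: no threshold reached yet
        p.append(100);                             // 10th event
        System.out.println(p.shouldRoll(30_000));  // prints true: rollCount reached
    }
}
```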
Start the agents (start a2 first, since a1's avro sink connects to a2's avro source):
flume-ng agent --conf-file flume-agent1.properties --name a1 --conf $FLUME_HOME/conf -Dflume.root.logger=INFO,console
flume-ng agent --conf-file flume-agent2.properties --name a2 --conf $FLUME_HOME/conf -Dflume.root.logger=INFO,console
Log collection and aggregation
Case scenario
Custom interceptor
Requirements analysis
Use Flume to collect local server logs; logs of different types must be routed to different analysis systems.
In real development, one server can produce many kinds of logs, and different kinds may need to be sent to different analysis systems.
This calls for the Multiplexing structure in Flume's topology. Multiplexing routes each event to a channel chosen by the value of a particular key in the event's header, so we need a custom Interceptor that assigns different values to that header key for different event types.
Here, data sent to a port simulates the logs: single digits and single letters stand in for different log types. A custom interceptor distinguishes digits from letters and sends them to different analysis systems (channels).
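The routing that the multiplexing selector performs can be sketched as a header-value lookup. The channel names c1/c2 mirror the flume1 configuration further below; treating unmapped values as dropped is an assumption reflecting that no default mapping is configured in this setup:

```java
import java.util.HashMap;
import java.util.Map;

// Illustrative stand-in for the multiplexing selector: look up the value
// of the "type" header in a mapping table and route the event to the
// mapped channel. Not Flume's actual MultiplexingChannelSelector.
public class MultiplexingSketch {
    static final Map<String, String> MAPPING = new HashMap<>();
    static {
        MAPPING.put("letter", "c1"); // selector.mapping.letter = c1
        MAPPING.put("number", "c2"); // selector.mapping.number = c2
    }

    static String route(Map<String, String> headers) {
        // With no default mapping configured, unmatched events go nowhere
        return MAPPING.getOrDefault(headers.get("type"), "dropped");
    }

    public static void main(String[] args) {
        Map<String, String> headers = new HashMap<>();
        headers.put("type", "letter");
        System.out.println(route(headers)); // prints c1
        headers.put("type", "number");
        System.out.println(route(headers)); // prints c2
    }
}
```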
Create the custom interceptor
Java code:
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>big-data</artifactId>
<groupId>org.example</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>flume-demo</artifactId>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<flume-version>1.9.0</flume-version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>${flume-version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<verbose>true</verbose>
<fork>true</fork>
<executable>${JAVA8_HOME}/bin/javac</executable>
</configuration>
</plugin>
</plugins>
</build>
</project>
编写拦截器:CustomInterceptor
package interceptor;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.util.List;
public class CustomInterceptor implements Interceptor {
@Override
public void initialize() {
}
@Override
public Event intercept(Event event) {
byte[] body = event.getBody();
if (body[0] >= 'a' && body[0] <= 'z') {
// set a custom header entry
event.getHeaders().put("type", "letter");
} else if (body[0] >= '0' && body[0] <= '9') {
// set a custom header entry
event.getHeaders().put("type", "number");
}
return event;
}
@Override
public List<Event> intercept(List<Event> list) {
for (Event event : list) {
intercept(event);
}
return list;
}
@Override
public void close() {
}
public static class Builder implements Interceptor.Builder {
@Override
public Interceptor build() {
return new CustomInterceptor();
}
@Override
public void configure(Context context) {
}
}
}
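The tagging logic can be exercised without any Flume dependency. This standalone check mirrors the intended first-byte classification (inclusive ranges 'a'–'z' and '0'–'9'); anything else gets no tag:

```java
// Standalone re-statement of CustomInterceptor's classification:
// lowercase first byte -> "letter", digit first byte -> "number",
// anything else -> no tag (null).
public class ClassifyCheck {
    static String classify(String body) {
        byte b = body.getBytes()[0];
        if (b >= 'a' && b <= 'z') return "letter";
        if (b >= '0' && b <= '9') return "number";
        return null;
    }

    public static void main(String[] args) {
        System.out.println(classify("hello world")); // prints letter
        System.out.println(classify("12354654"));    // prints number
        System.out.println(classify("Hello"));       // prints null (uppercase is not tagged)
    }
}
```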
Package the interceptor as a jar and upload it to Flume's lib directory
Write the Flume configuration files
1. flume1
# Name the components on this agent
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1 c2
# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = master
a1.sources.r1.port = 4444
# Interceptor
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = interceptor.CustomInterceptor$Builder
# Selector
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = type
# These mappings correspond to the header values set by the custom interceptor
a1.sources.r1.selector.mapping.letter = c1
a1.sources.r1.selector.mapping.number = c2
# Describe the sink
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = master
a1.sinks.k1.port = 4141
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = master
a1.sinks.k2.port = 4242
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
2. flume2
a2.sources = r1
a2.sinks = k1
a2.channels = c1
a2.sources.r1.type = avro
a2.sources.r1.bind = master
a2.sources.r1.port = 4141
a2.sinks.k1.type = logger
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.channel = c1
a2.sources.r1.channels = c1
3. flume3
a3.sources = r1
a3.sinks = k1
a3.channels = c1
a3.sources.r1.type = avro
a3.sources.r1.bind = master
a3.sources.r1.port = 4242
a3.sinks.k1.type = logger
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.channel = c1
a3.sources.r1.channels = c1
4. Testing
flume2 and flume3 must be started before flume1, because flume1 connects to both; if flume1 is started first it reports connection failures (you can also ignore those errors and start it first).
flume-ng agent --conf $FLUME_HOME/conf --name a3 --conf-file flume3 -Dflume.root.logger=INFO,console
flume-ng agent --conf $FLUME_HOME/conf --name a2 --conf-file flume2 -Dflume.root.logger=INFO,console
flume-ng agent --conf $FLUME_HOME/conf --name a1 --conf-file flume1 -Dflume.root.logger=INFO,console
Send data to the monitored port:
[root@node1 opt]# telnet master 4444
Trying 192.168.60.128...
Connected to master.
Escape character is '^]'.
hoo^H^H
OK
12354654
OK
646856563
OK
hello world
OK
You can see that different content is delivered to different Flume agents. The interceptor code only tags single leading digits and lowercase letters, so any other content is not forwarded by flume1.