Common Flume operations

A few common Flume operations.

Reading console input over a local connection

[root@hadoop1 ~]# cd /opt/flume/conf/jobkb09
[root@hadoop1 jobkb09]# vi netcat-flume-logger.conf

netcat-flume-logger.conf contains the following:

tmp.sources=s
tmp.channels=c
tmp.sinks=k

tmp.sources.s.type=netcat
# bind to localhost (local connections only)
tmp.sources.s.bind=localhost
# listen on port 7777
tmp.sources.s.port=7777

tmp.channels.c.type=memory
tmp.channels.c.capacity=1000
# max number of events taken from the source or handed to the sink per transaction
tmp.channels.c.transactionCapacity=1000

tmp.sinks.k.type=logger

tmp.sources.s.channels=c
tmp.sinks.k.channel=c

Run the agent (note that the paths below depend on your current working directory):
-c takes the location of the conf directory
-f takes the path to the configuration file to run (here relative to the Flume home directory)

[root@hadoop1 flume]# flume-ng agent --name tmp -c conf/ -f conf/jobkb09/netcat-flume-logger.conf -Dflume.root.logger=INFO,console

Next, open a second terminal session to this virtual machine and enter:

[root@hadoop1 flume]# telnet localhost 7777
Trying ::1...
telnet: connect to address ::1: Connection refused
Trying 127.0.0.1...
Connected to localhost.
Escape character is '^]'.

You can now type lines into this telnet session; the machine running the agent receives them and prints them to its console.

Reading a local file

[root@hadoop1 jobkb09]# vi file-flume-logger.conf

file-flume-logger.conf contains the following:

# file-flume-logger.conf: read from a local file; an exec source is tied to one fixed file and does not switch to new date-based files over time
tmp.sources=s
tmp.channels=c
tmp.sinks=k

tmp.sources.s.type=exec
tmp.sources.s.command=tail -f /opt/flume/conf/jobkb09/tmp/tmp.txt

tmp.channels.c.type=memory
tmp.channels.c.capacity=1000
tmp.channels.c.transactionCapacity=1000

tmp.sinks.k.type=logger

tmp.sources.s.channels=c
tmp.sinks.k.channel=c

tmp.txt contains:

hello friend
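
If the tmp directory and file do not exist yet, they can be created first (a minimal sketch; the path matches the exec source command above):

[root@hadoop1 jobkb09]# mkdir tmp
[root@hadoop1 jobkb09]# echo hello friend > tmp/tmp.txt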

Run the agent:

[root@hadoop1 flume]# flume-ng agent --name tmp -c conf/ -f conf/jobkb09/file-flume-logger.conf -Dflume.root.logger=INFO,console

Overwrite tmp.txt (a single > replaces the file's contents):

[root@hadoop1 tmp]# echo nice to meet you > tmp.txt

Append a line to tmp.txt (>> adds to the end):

[root@hadoop1 tmp]# echo hello >> tmp.txt


Reading local files (with an interceptor filter), picking up new files in real time into date-based directories (local time), and uploading to HDFS

First, create a few directories:

[root@hadoop1 jobkb09]# mkdir dataSourceFile
[root@hadoop1 jobkb09]# cd dataSourceFile
[root@hadoop1 dataSourceFile]# mkdir events
[root@hadoop1 dataSourceFile]# cd ..
[root@hadoop1 jobkb09]# mkdir dataChannelFile
[root@hadoop1 jobkb09]# cd dataChannelFile
[root@hadoop1 dataChannelFile]# mkdir events
[root@hadoop1 dataChannelFile]# cd ..
[root@hadoop1 jobkb09]# mkdir checkpointFile
[root@hadoop1 jobkb09]# cd checkpointFile
[root@hadoop1 checkpointFile]# mkdir events
[root@hadoop1 checkpointFile]# cd ..
[root@hadoop1 jobkb09]# vi file-flume-hdfs.conf

Place a file named in the events_2020-12-01.csv format into /opt/flume/conf/jobkb09/dataSourceFile/events/.
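
As an optional check (a sketch; the source location of the CSV is a placeholder), copy the file into the spool directory and measure its longest line, which deserializer.maxLineLength in the config below must cover:

[root@hadoop1 jobkb09]# cp /path/to/events_2020-12-01.csv dataSourceFile/events/
[root@hadoop1 jobkb09]# wc -L dataSourceFile/events/events_2020-12-01.csv
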
file-flume-hdfs.conf contains the following:

# file-flume-hdfs.conf: a spooldir source watches a local directory and picks up new files as they arrive
tmp.sources=s
tmp.channels=c
tmp.sinks=k

tmp.sources.s.type=spooldir
tmp.sources.s.spoolDir=/opt/flume/conf/jobkb09/dataSourceFile/events
tmp.sources.s.deserializer=LINE
# maximum length of a single line read from the file; the longest line can be checked with wc -L (see the sketch above)
tmp.sources.s.deserializer.maxLineLength=10000
# body filter: when excludeEvents=true, events matching the regex are dropped (default false)
tmp.sources.s.interceptors=head_filter
tmp.sources.s.interceptors.head_filter.type=regex_filter
# drop lines that start with event_id (the CSV header row)
tmp.sources.s.interceptors.head_filter.regex=^event_id*
tmp.sources.s.interceptors.head_filter.excludeEvents=true
# only pick up files whose names match this pattern (e.g. events_2020-12-01.csv)
tmp.sources.s.includePattern=events_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv

tmp.channels.c.type=file
tmp.channels.c.checkpointDir=/opt/flume/conf/jobkb09/checkpointFile/events
tmp.channels.c.dataDirs=/opt/flume/conf/jobkb09/dataChannelFile/events

tmp.sinks.k.type=hdfs
# file format used when writing to HDFS
tmp.sinks.k.hdfs.fileType=DataStream
# prefix for the file names written to HDFS
tmp.sinks.k.hdfs.filePrefix=events
# suffix for the file names written to HDFS
tmp.sinks.k.hdfs.fileSuffix=.csv
# target HDFS path, with a date-based directory
tmp.sinks.k.hdfs.path=hdfs://192.168.153.10:9000/kb09workspace/events/%Y-%m-%d
# use the local timestamp when resolving %Y-%m-%d
tmp.sinks.k.hdfs.useLocalTimeStamp=true
# number of events flushed to HDFS per batch
tmp.sinks.k.hdfs.batchSize=640
# roll to a new file after this many events (0 disables count-based rolling)
tmp.sinks.k.hdfs.rollCount=0
# roll to a new file once it exceeds this size in bytes
tmp.sinks.k.hdfs.rollSize=120000000
# roll to a new file after this many seconds
tmp.sinks.k.hdfs.rollInterval=20

tmp.sources.s.channels=c
tmp.sinks.k.channel=c

Run the agent:

[root@hadoop1 flume]# flume-ng agent --name tmp -c conf/ -f conf/jobkb09/file-flume-hdfs.conf -Dflume.root.logger=INFO,console

Open 192.168.153.10:50070 in a browser to check the result.
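
The same result can also be checked from the shell (a sketch, assuming the Hadoop client is on the PATH; the date subdirectory follows the local date on which the agent ran):

[root@hadoop1 flume]# hdfs dfs -ls /kb09workspace/events/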

Writing a custom interceptor in Java

Create a Maven project and add the dependency:

    <dependency>
      <groupId>org.apache.flume</groupId>
      <artifactId>flume-ng-core</artifactId>
      <version>1.6.0</version>
    </dependency>

/opt/flume/conf/jobkb09/dataSourceFile/test/test.txt contains the following:

aaaaaaakb09bbbbbbbbbbbbbbbbbbb
bbbbbbbbbkb07cccccccccccccccc
ccccccccckb09cddddddddddddddddd
aaaaaaakb09ddddddddddddddd
bbbbbbbbbkb07rrrrrrrrrrrrrrrrrrr
ccccccccckb092222222222222222
aaaaaaakb0944444444444444444
bbbbbbbbbkb07666666666666666666
ccccccccckb09sssssssssssssssss
aaaaaaakb09wwwwwwwwwwwwww
bbbbbbbbbkb07sssssssssssssss
ccccccccckb09mmmmmmmmmmmmm

Requirement: store lines containing kb09 and lines containing kb07 separately and save them to HDFS.
First, create a few directories:

[root@hadoop1 jobkb09]# cd dataSourceFile
[root@hadoop1 dataSourceFile]# mkdir test
[root@hadoop1 dataSourceFile]# cd test
[root@hadoop1 test]# vi test.txt
[root@hadoop1 test]# cd ../..
[root@hadoop1 jobkb09]# cd dataChannelFile
[root@hadoop1 dataChannelFile]# mkdir kb09
[root@hadoop1 dataChannelFile]# mkdir kb07
[root@hadoop1 dataChannelFile]# cd ../checkpointFile
[root@hadoop1 checkpointFile]# mkdir kb09
[root@hadoop1 checkpointFile]# mkdir kb07

Implementation:

package nj.zb;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * @author ceshi
 * @date 2020/12/1
 */
public class Interceptordemo1 implements Interceptor {
    // reusable list for the batch version of intercept()
    private List<Event> addHeaderEvent;

    @Override
    public void initialize() {
        addHeaderEvent = new ArrayList<>();
    }

    @Override
    public Event intercept(Event event) {
        // tag each event with a "type" header based on its body,
        // so that a multiplexing channel selector can route it
        byte[] body = event.getBody();
        Map<String, String> headers = event.getHeaders();
        String bodyStr = new String(body);
        if (bodyStr.contains("kb09")) {
            headers.put("type", "kb09");
        } else if (bodyStr.contains("kb07")) {
            headers.put("type", "kb07");
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        // apply the single-event intercept to every event in the batch
        addHeaderEvent.clear();
        for (Event event : events) {
            addHeaderEvent.add(intercept(event));
        }
        return addHeaderEvent;
    }

    @Override
    public void close() {

    }
    // Builder is what the agent config references: nj.zb.Interceptordemo1$Builder
    public static class Builder implements Interceptor.Builder{

        @Override
        public Interceptor build() {
            return new Interceptordemo1();
        }

        @Override
        public void configure(Context context) {

        }
    }
}

Build the jar and copy it to /opt/flume/lib.
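
A minimal packaging sketch (the project directory and jar name depend on your pom.xml and are hypothetical here):

[root@hadoop1 ~]# cd /path/to/interceptor-project
[root@hadoop1 interceptor-project]# mvn clean package
[root@hadoop1 interceptor-project]# cp target/interceptor-demo-1.0-SNAPSHOT.jar /opt/flume/lib/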

[root@hadoop1 jobkb09]# vi file1-flume-hdfs.conf

file1-flume-hdfs.conf contains the following:

ict.sources=ictSource
ict.channels=ictChannel1 ictChannel2
ict.sinks=ictSink1 ictSink2

ict.sources.ictSource.type=spooldir
ict.sources.ictSource.spoolDir=/opt/flume/conf/jobkb09/dataSourceFile/test
ict.sources.ictSource.deserializer=LINE
ict.sources.ictSource.deserializer.maxLineLength=1000
ict.sources.ictSource.includePattern=test.txt
ict.sources.ictSource.interceptors=interceptor1
ict.sources.ictSource.interceptors.interceptor1.type=nj.zb.Interceptordemo1$Builder
# multiplexing selector: route each event to a channel based on its "type" header,
# which the custom interceptor sets to kb09 or kb07
ict.sources.ictSource.selector.type=multiplexing
ict.sources.ictSource.selector.header=type
ict.sources.ictSource.selector.mapping.kb09=ictChannel1
ict.sources.ictSource.selector.mapping.kb07=ictChannel2

ict.channels.ictChannel1.type=file
ict.channels.ictChannel1.checkpointDir=/opt/flume/conf/jobkb09/checkpointFile/kb09
ict.channels.ictChannel1.dataDirs=/opt/flume/conf/jobkb09/dataChannelFile/kb09

ict.channels.ictChannel2.type=file
ict.channels.ictChannel2.checkpointDir=/opt/flume/conf/jobkb09/checkpointFile/kb07
ict.channels.ictChannel2.dataDirs=/opt/flume/conf/jobkb09/dataChannelFile/kb07

ict.sinks.ictSink1.type=hdfs
ict.sinks.ictSink1.hdfs.fileType=DataStream
ict.sinks.ictSink1.hdfs.filePrefix=kb09
ict.sinks.ictSink1.hdfs.fileSuffix=.csv
ict.sinks.ictSink1.hdfs.path=hdfs://192.168.153.10:9000/kb09workspace/user/kb09/%Y-%m-%d
ict.sinks.ictSink1.hdfs.useLocalTimeStamp=true
ict.sinks.ictSink1.hdfs.batchSize=640
ict.sinks.ictSink1.hdfs.rollCount=0
ict.sinks.ictSink1.hdfs.rollSize=10000
ict.sinks.ictSink1.hdfs.rollInterval=3

ict.sinks.ictSink2.type=hdfs
ict.sinks.ictSink2.hdfs.fileType=DataStream
ict.sinks.ictSink2.hdfs.filePrefix=kb07
ict.sinks.ictSink2.hdfs.fileSuffix=.csv
ict.sinks.ictSink2.hdfs.path=hdfs://192.168.153.10:9000/kb09workspace/user/kb07/%Y-%m-%d
ict.sinks.ictSink2.hdfs.useLocalTimeStamp=true
ict.sinks.ictSink2.hdfs.batchSize=640
ict.sinks.ictSink2.hdfs.rollCount=0
ict.sinks.ictSink2.hdfs.rollSize=10000
ict.sinks.ictSink2.hdfs.rollInterval=3

ict.sources.ictSource.channels=ictChannel1 ictChannel2
ict.sinks.ictSink1.channel=ictChannel1
ict.sinks.ictSink2.channel=ictChannel2

Run the agent:

[root@hadoop1 flume]# flume-ng agent --name ict -c conf/ -f conf/jobkb09/file1-flume-hdfs.conf -Dflume.root.logger=INFO,console

Open 192.168.153.10:50070 in a browser to check the result.
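
Or list the two output directories from the shell (a sketch):

[root@hadoop1 flume]# hdfs dfs -ls /kb09workspace/user/kb09/
[root@hadoop1 flume]# hdfs dfs -ls /kb09workspace/user/kb07/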
