文章目录
几种flume常用的操作
从控制台输入,本地连接
[root@hadoop1 ~] cd /opt/flume/conf/jobkb09
[root@hadoop1 jobkb09] vi netcat-flume-logger.conf
netcat-flume-logger.conf内容如下
# Agent "tmp": netcat source -> memory channel -> logger sink (console demo).
tmp.sources=s
tmp.channels=c
tmp.sinks=k
# netcat source: listens on a TCP port, one event per line of input
tmp.sources.s.type=netcat
# bind to the local interface only
tmp.sources.s.bind=localhost
# listen on port 7777
tmp.sources.s.port=7777
tmp.channels.c.type=memory
# max number of events buffered in the channel
tmp.channels.c.capacity=1000
# max events taken from the source / handed to the sink per transaction
tmp.channels.c.transactionCapacity=1000
# logger sink: prints each event to the agent's log (console with -Dflume.root.logger=INFO,console)
tmp.sinks.k.type=logger
# wire source and sink to the channel
tmp.sources.s.channels=c
tmp.sinks.k.channel=c
执行命令(注意所在目录不同)
-c 接conf的位置
-f 接从conf目录开始后执行文件位置
[root@hadoop1 flume]# flume-ng agent --name tmp -c conf/ -f conf/jobkb09/netcat-flume-logger.conf -Dflume.root.logger=INFO,console
紧接着把这台虚拟机复制一台,输入
[root@hadoop1 flume]# telnet localhost 7777
Trying ::1...
telnet: connect to address ::1: Connection refused
Trying 127.0.0.1...
Connected to localhost.
Escape character is '^]'.
然后就可以在下面输入内容,在输入执行命令的机器可以接受控制台输入的内容
读取本地文件
[root@hadoop1 jobkb09] vi file-flume-logger.conf
file-flume-logger.conf内容如下
# Agent "tmp": exec source (tail -f) -> memory channel -> logger sink.
# NOTE: tail -f follows a single fixed file; it does not roll over to new
# date-named files — use spooldir/taildir sources for that.
tmp.sources=s
tmp.channels=c
tmp.sinks=k
tmp.sources.s.type=exec
# stream lines appended to this file as events
tmp.sources.s.command=tail -f /opt/flume/conf/jobkb09/tmp/tmp.txt
tmp.channels.c.type=memory
tmp.channels.c.capacity=1000
tmp.channels.c.transactionCapacity=1000
# FIX: the sink is declared as "k" (tmp.sinks=k); the original
# "tmp.sinks.s.type=logger" left sink k without a type, so the agent
# could not start the sink.
tmp.sinks.k.type=logger
tmp.sources.s.channels=c
tmp.sinks.k.channel=c
tmp.txt
hello friend
执行命令
[root@hadoop1 flume]# flume-ng agent --name tmp -c conf/ -f conf/jobkb09/file-flume-logger.conf -Dflume.root.logger=INFO,console
往tmp.txt覆盖内容(> 会清空原内容再写入)
[root@hadoop1 tmp]# echo nice to meet you > tmp.txt
往tmp.txt追加内容(>> 在文件末尾追加)
[root@hadoop1 tmp]# echo hello >> tmp.txt
本地读取文件(带筛选器Interceptors),实时更新(随本地时间更新时间目录),并上传至hdfs
首先需要创建几个文件夹
[root@hadoop1 jobkb09]# mkdir dataSourceFile
[root@hadoop1 jobkb09]# cd dataSourceFile
[root@hadoop1 dataSourceFile]# mkdir events
[root@hadoop1 dataSourceFile]# cd ..
[root@hadoop1 jobkb09]# mkdir dataChannelFile
[root@hadoop1 jobkb09]# cd dataChannelFile
[root@hadoop1 dataChannelFile]# mkdir events
[root@hadoop1 dataChannelFile]# cd ..
[root@hadoop1 jobkb09]# mkdir checkpointFile
[root@hadoop1 jobkb09]# cd checkpointFile
[root@hadoop1 checkpointFile]# mkdir events
[root@hadoop1 checkpointFile]# cd ..
[root@hadoop1 jobkb09]# vi file-flume-hdfs.conf
把一个events_2020-12-01.csv格式的文件放入 /opt/flume/conf/jobkb09/dataSourceFile/events/ 下
file-flume-hdfs.conf内容如下
# Agent "tmp": spooldir source (picks up new files dropped into the directory)
# -> file channel -> HDFS sink with time-bucketed output paths.
tmp.sources=s
tmp.channels=c
tmp.sinks=k
tmp.sources.s.type=spooldir
# directory watched for completed files to ingest
tmp.sources.s.spoolDir=/opt/flume/conf/jobkb09/dataSourceFile/events
tmp.sources.s.deserializer=LINE
# max characters per line/event; check with: wc -L <file>
tmp.sources.s.deserializer.maxLineLength=10000
# interceptor chain: drop the CSV header row
tmp.sources.s.interceptors=head_filter
tmp.sources.s.interceptors.head_filter.type=regex_filter
# FIX: original pattern "^event_id*" only quantified the trailing "d"
# (i.e. it matched "^event_i" followed by zero or more "d"); "^event_id"
# expresses the intent: lines beginning with the header column name.
tmp.sources.s.interceptors.head_filter.regex=^event_id
# excludeEvents=true drops matching events (default false keeps only matches)
tmp.sources.s.interceptors.head_filter.excludeEvents=true
# only ingest files named like events_2020-12-01.csv
# FIX: "[.]" is a literal dot; the original unescaped "." matched any character.
# ([.] is used instead of \. because java.util.Properties strips backslashes.)
tmp.sources.s.includePattern=events_[0-9]{4}-[0-9]{2}-[0-9]{2}[.]csv
# durable file channel: survives agent restarts
tmp.channels.c.type=file
tmp.channels.c.checkpointDir=/opt/flume/conf/jobkb09/checkpointFile/events
tmp.channels.c.dataDirs=/opt/flume/conf/jobkb09/dataChannelFile/events
tmp.sinks.k.type=hdfs
# write plain text (DataStream) rather than SequenceFile
tmp.sinks.k.hdfs.fileType=DataStream
# prefix/suffix of files created on HDFS
tmp.sinks.k.hdfs.filePrefix=events
tmp.sinks.k.hdfs.fileSuffix=.csv
# destination path, bucketed by local date (%Y-%m-%d)
tmp.sinks.k.hdfs.path=hdfs://192.168.153.10:9000/kb09workspace/events/%Y-%m-%d
# use the agent's local clock for the %Y-%m-%d escapes
tmp.sinks.k.hdfs.useLocalTimeStamp=true
# events flushed to HDFS per batch
tmp.sinks.k.hdfs.batchSize=640
# roll triggers: 0 disables count-based rolling; roll on size or interval
tmp.sinks.k.hdfs.rollCount=0
# roll when the open file reaches ~120 MB
tmp.sinks.k.hdfs.rollSize=120000000
# roll every 20 seconds regardless of size
tmp.sinks.k.hdfs.rollInterval=20
tmp.sources.s.channels=c
tmp.sinks.k.channel=c
执行命令
[root@hadoop1 flume]# flume-ng agent --name tmp -c conf/ -f conf/jobkb09/file-flume-hdfs.conf -Dflume.root.logger=INFO,console
输入192.168.153.10:50070进入网页查看
使用java代码 自定义筛选器(Interceptors)
创建MAVEN工程并添加依赖
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.6.0</version>
</dependency>
/opt/flume/conf/jobkb09/dataSourceFile/test/test.txt 内容如下
aaaaaaakb09bbbbbbbbbbbbbbbbbbb
bbbbbbbbbkb07cccccccccccccccc
ccccccccckb09cddddddddddddddddd
aaaaaaakb09ddddddddddddddd
bbbbbbbbbkb07rrrrrrrrrrrrrrrrrrr
ccccccccckb092222222222222222
aaaaaaakb0944444444444444444
bbbbbbbbbkb07666666666666666666
ccccccccckb09sssssssssssssssss
aaaaaaakb09wwwwwwwwwwwwww
bbbbbbbbbkb07sssssssssssssss
ccccccccckb09mmmmmmmmmmmmm
需求:将含有kb09和kb07的行分开存放并保存至hdfs
首先需要创建几个文件夹
[root@hadoop1 jobkb09]# cd dataSourceFile
[root@hadoop1 dataSourceFile]# mkdir test
[root@hadoop1 dataSourceFile]# cd test
[root@hadoop1 test]# vi test.txt
[root@hadoop1 test]# cd ../..
[root@hadoop1 jobkb09]# cd dataChannelFile
[root@hadoop1 dataChannelFile]# mkdir kb09
[root@hadoop1 dataChannelFile]# mkdir kb07
[root@hadoop1 dataChannelFile]# cd ../checkpointFile
[root@hadoop1 checkpointFile]# mkdir kb09
[root@hadoop1 checkpointFile]# mkdir kb07
实现代码
package nj.zb;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* @author ceshi
* @Title: ${file_name}
* @Package ${package_name}
* @Description: ${todo}
* @date 2020/12/1 000117:52
*/
/**
 * Flume interceptor that tags each event with a "type" header
 * ("kb09" or "kb07") based on the event body, so that a multiplexing
 * channel selector can route the two kinds of events to different channels.
 */
public class Interceptordemo1 implements Interceptor {
    @Override
    public void initialize() {
        // No resources to set up.
    }
    /**
     * Tags a single event in place.
     * Adds header type=kb09 when the body contains "kb09", else type=kb07
     * when it contains "kb07"; events matching neither are left untagged
     * (the multiplexing selector will drop or default-route them).
     *
     * @param event the event to inspect; its headers map is mutated
     * @return the same event instance, possibly with a new "type" header
     */
    @Override
    public Event intercept(Event event) {
        // NOTE(review): new String(byte[]) uses the platform default charset;
        // presumably the input is ASCII/UTF-8 — confirm if non-ASCII bodies occur.
        String bodyStr = new String(event.getBody());
        Map<String, String> headers = event.getHeaders();
        if (bodyStr.contains("kb09")) {
            headers.put("type", "kb09");
        } else if (bodyStr.contains("kb07")) {
            headers.put("type", "kb07");
        }
        return event;
    }
    /**
     * Tags a batch of events.
     * FIX: returns a fresh list on every call. The original reused a single
     * shared field that was cleared at the start of each invocation, which
     * aliased (and wiped) the batch returned from the previous call if any
     * caller still held a reference to it.
     *
     * @param events batch to tag
     * @return a new list containing the (mutated) input events in order
     */
    @Override
    public List<Event> intercept(List<Event> events) {
        List<Event> tagged = new ArrayList<>(events.size());
        for (Event event : events) {
            tagged.add(intercept(event));
        }
        return tagged;
    }
    @Override
    public void close() {
        // Nothing to release.
    }
    /** Builder used by Flume (configured via ...interceptors.X.type=...$Builder). */
    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new Interceptordemo1();
        }
        @Override
        public void configure(Context context) {
            // No configurable parameters.
        }
    }
}
打包jar包并放到 /opt/flume/lib 目录下
[root@hadoop1 jobkb09]# vi file1-flume-hdfs.conf
file1-flume-hdfs.conf内容如下
# Agent "ict": spooldir source -> custom interceptor adds a "type" header ->
# multiplexing selector routes kb09/kb07 events into two file channels ->
# two HDFS sinks write them to separate date-bucketed directories.
ict.sources=ictSource
ict.channels=ictChannel1 ictChannel2
ict.sinks=ictSink1 ictSink2
ict.sources.ictSource.type=spooldir
# directory watched for files to ingest
ict.sources.ictSource.spoolDir=/opt/flume/conf/jobkb09/dataSourceFile/test
ict.sources.ictSource.deserializer=LINE
ict.sources.ictSource.deserializer.maxLineLength=1000
# only pick up the demo file
ict.sources.ictSource.includePattern=test.txt
# custom interceptor (packaged jar must be on flume's classpath, e.g. /opt/flume/lib)
ict.sources.ictSource.interceptors=interceptor1
ict.sources.ictSource.interceptors.interceptor1.type=nj.zb.Interceptordemo1$Builder
# route by the "type" header the interceptor sets:
# kb09 -> ictChannel1, kb07 -> ictChannel2
ict.sources.ictSource.selector.type=multiplexing
ict.sources.ictSource.selector.header=type
ict.sources.ictSource.selector.mapping.kb09=ictChannel1
ict.sources.ictSource.selector.mapping.kb07=ictChannel2
# durable file channels, one per route
ict.channels.ictChannel1.type=file
ict.channels.ictChannel1.checkpointDir=/opt/flume/conf/jobkb09/checkpointFile/kb09
ict.channels.ictChannel1.dataDirs=/opt/flume/conf/jobkb09/dataChannelFile/kb09
ict.channels.ictChannel2.type=file
ict.channels.ictChannel2.checkpointDir=/opt/flume/conf/jobkb09/checkpointFile/kb07
ict.channels.ictChannel2.dataDirs=/opt/flume/conf/jobkb09/dataChannelFile/kb07
# sink 1: kb09 events -> HDFS, plain text, date-bucketed path
ict.sinks.ictSink1.type=hdfs
ict.sinks.ictSink1.hdfs.fileType=DataStream
ict.sinks.ictSink1.hdfs.filePrefix=kb09
ict.sinks.ictSink1.hdfs.fileSuffix=.csv
ict.sinks.ictSink1.hdfs.path=hdfs://192.168.153.10:9000/kb09workspace/user/kb09/%Y-%m-%d
ict.sinks.ictSink1.hdfs.useLocalTimeStamp=true
ict.sinks.ictSink1.hdfs.batchSize=640
# rollCount=0 disables count-based rolling; roll on size (~10 KB) or every 3 s
ict.sinks.ictSink1.hdfs.rollCount=0
ict.sinks.ictSink1.hdfs.rollSize=10000
ict.sinks.ictSink1.hdfs.rollInterval=3
# sink 2: kb07 events, same settings with its own prefix/path
ict.sinks.ictSink2.type=hdfs
ict.sinks.ictSink2.hdfs.fileType=DataStream
ict.sinks.ictSink2.hdfs.filePrefix=kb07
ict.sinks.ictSink2.hdfs.fileSuffix=.csv
ict.sinks.ictSink2.hdfs.path=hdfs://192.168.153.10:9000/kb09workspace/user/kb07/%Y-%m-%d
ict.sinks.ictSink2.hdfs.useLocalTimeStamp=true
ict.sinks.ictSink2.hdfs.batchSize=640
ict.sinks.ictSink2.hdfs.rollCount=0
ict.sinks.ictSink2.hdfs.rollSize=10000
ict.sinks.ictSink2.hdfs.rollInterval=3
# wiring: source feeds both channels; each sink drains its own channel
ict.sources.ictSource.channels=ictChannel1 ictChannel2
ict.sinks.ictSink1.channel=ictChannel1
ict.sinks.ictSink2.channel=ictChannel2
执行命令
[root@hadoop1 flume]# flume-ng agent --name ict -c conf/ -f conf/jobkb09/file1-flume-hdfs.conf -Dflume.root.logger=INFO,console
输入192.168.153.10:50070进入网页查看