flume的环境搭建
flume-ng-1.6.0-cdh5.14.0.tar.gz 提取码:juak
(base) [root@lijia1 install]# tar -zxf flume-ng-1.6.0-cdh5.14.0.tar.gz -C ../bigdata/
(base) [root@lijia1 bigdata]# mv apache-flume-1.6.0-cdh5.14.0-bin/ flume160514
(base) [root@lijia1 bigdata]# cd ./flume160514/
(base) [root@lijia1 flume160514]# cd ./conf/
(base) [root@lijia1 conf]# cp flume-env.sh.template flume-env.sh
(base) [root@lijia1 conf]# vi ./flume-env.sh
export JAVA_HOME=/opt/bigdata/jdk180/
export JAVA_OPTS="-Xms2000m -Xmx2000m -Dcom.sun.management.jmxremote"
(base) [root@lijia1 conf]# mkdir job
(base) [root@lijia1 job]# yum install -y nc
(base) [root@lijia1 job]# nc -lk 44444
[root@lijia1 ~]# yum list telnet*            # 列出telnet相关的安装包
[root@lijia1 ~]# yum install telnet-server   # 安装telnet服务
[root@lijia1 ~]# yum install telnet.*        # 安装telnet客户端
[root@lijia1 ~]# telnet localhost 44444
读取电脑指定端口的内容
[root@lijia1 job]# vi netcat-flume-logger.conf
# Flume agent "a1": a netcat source feeding a logger sink through a memory channel.
# Declare the agent's components: one source (r1), one sink (k1), one channel (c1).
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Source r1: netcat type, bound to localhost on port 44444.
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# Sink k1: logger type.
a1.sinks.k1.type = logger
# Channel c1: memory type with capacity 1000 and transactionCapacity 100.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Wiring: r1 writes to c1 and k1 reads from c1.
# Note the plural "channels" key on the source versus the singular "channel" key on the sink.
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
[root@lijia1 flume160514]# ./bin/flume-ng agent --name a1 --conf conf/ --conf-file conf/job/netcat-flume-logger.conf -Dflume.root.logger=INFO,console
读取指定文件内容
[root@lijia1 job]# vi file-flume-logger.conf
# Flume agent "a2": an exec source that follows a file, feeding a logger sink
# through a memory channel.
# Declare the agent's components: one source (r1), one sink (k1), one channel (c1).
a2.sources = r1
a2.sinks = k1
a2.channels = c1
# Source r1: runs the given command and turns each output line into an event.
# Use `tail -F` (capital F) rather than `tail -f`: -F follows the file by name and
# retries, so the source keeps working when tmp.txt is rotated, recreated, or does
# not exist yet at agent start-up. This matches the Flume User Guide's exec example.
a2.sources.r1.type = exec
a2.sources.r1.command = tail -F /opt/bigdata/flume160514/conf/job/tmp.txt
# Sink k1: logger type.
a2.sinks.k1.type = logger
# Channel c1: memory type with capacity 1000 and transactionCapacity 100.
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
# Wiring: r1 writes to c1 and k1 reads from c1.
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
[root@lijia1 flume160514]# ./bin/flume-ng agent --name a2 --conf conf/ --conf-file conf/job/file-flume-logger.conf -Dflume.root.logger=INFO,console
[root@lijia1 prodata]# wc -l events.csv   # 查看文件总行数
读取指定文件夹中符合正则表达式的文件内容
[root@lijia1 job]# vi ./events-flume-logger.conf
# Flume agent "user_friends": a spooldir source feeding an hdfs sink through a file channel.
# The agent name used here ("user_friends") is the value that must be passed to
# `flume-ng agent --name` when starting this configuration.
# Declare the agent's components.
user_friends.sources = userFriendsSource
user_friends.channels = userFriendsChannel
user_friends.sinks = userFriendsSink
# Source: spooldir type.
user_friends.sources.userFriendsSource.type = spooldir
# Directory to monitor for incoming files.
user_friends.sources.userFriendsSource.spoolDir = /opt/kb07file/flumeFile/user_friends
# LINE deserializer with maxLineLength raised to 600000.
user_friends.sources.userFriendsSource.deserializer = LINE
user_friends.sources.userFriendsSource.deserializer.maxLineLength = 600000
# File-name pattern of the files to pick up (e.g. userFriends_2020-08-17.csv).
# NOTE(review): the "." before "csv" is unescaped, so as a regex it matches any character — confirm this is acceptable.
user_friends.sources.userFriendsSource.includePattern = userFriends_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
# Channel: file type, with explicit checkpoint and data directories.
user_friends.channels.userFriendsChannel.type = file
user_friends.channels.userFriendsChannel.checkpointDir = /opt/kb07file/flumeFile/checkpoint/userFriends
user_friends.channels.userFriendsChannel.dataDirs = /opt/kb07file/flumeFile/data/userFriends
# Sink: hdfs type, DataStream file type, files named userFriend*.csv.
user_friends.sinks.userFriendsSink.type = hdfs
user_friends.sinks.userFriendsSink.hdfs.fileType = DataStream
user_friends.sinks.userFriendsSink.hdfs.filePrefix = userFriend
user_friends.sinks.userFriendsSink.hdfs.fileSuffix = .csv
# Destination path on HDFS; the %Y-%m-%d escapes produce a per-date directory.
user_friends.sinks.userFriendsSink.hdfs.path = hdfs://192.168.153.141:9000/user/userFriend/%Y-%m-%d
# useLocalTimeStamp is enabled; presumably this supplies the time for the %Y-%m-%d escapes
# because the events carry no timestamp header — verify against the Flume HDFS sink docs.
user_friends.sinks.userFriendsSink.hdfs.useLocalTimeStamp = true
# Batch and roll settings (see the Flume HDFS sink documentation for units and semantics).
user_friends.sinks.userFriendsSink.hdfs.batchSize = 640
user_friends.sinks.userFriendsSink.hdfs.rollCount = 0
user_friends.sinks.userFriendsSink.hdfs.rollSize = 64000000
user_friends.sinks.userFriendsSink.hdfs.rollInterval = 30
# Wiring: the source writes to userFriendsChannel and the sink reads from it.
user_friends.sources.userFriendsSource.channels = userFriendsChannel
user_friends.sinks.userFriendsSink.channel = userFriendsChannel
[root@lijia1 flume160514]# ./bin/flume-ng agent --name user_friends --conf conf/ --conf-file conf/job/events-flume-logger.conf -Dflume.root.logger=INFO,console
[root@zcy01 events]# install user_friends.csv.COMPLETED /opt/kb07file/flumeFile/user_friends/userFriends_2020-08-17.csv
自定义拦截器
1.编写java程序生成jar包放置在lib目录
/**
 * Flume interceptor that tags every event with a {@code "type"} header.
 *
 * <p>Events whose body starts with {@code "gree"} get {@code type=gree}; all
 * other events get {@code type=lijia}. The header can then be used by a
 * multiplexing channel selector to route events to different channels.
 *
 * <p>The interceptor holds no mutable state, so a batch returned from
 * {@link #intercept(List)} is never modified by a later call.
 */
public class InterceptorDemo implements Interceptor {

    /** No set-up is required: the interceptor keeps no state. */
    @Override
    public void initialize() {
    }

    /**
     * Adds the {@code "type"} header to a single event, in place.
     *
     * @param event the event to tag
     * @return the same event instance, with the header set
     */
    @Override
    public Event intercept(Event event) {
        Map<String, String> headers = event.getHeaders();
        // Decode with an explicit charset so the result does not depend on the
        // JVM's platform default encoding.
        String body = new String(event.getBody(), java.nio.charset.StandardCharsets.UTF_8);
        if (body.startsWith("gree")) {
            headers.put("type", "gree");
        } else {
            headers.put("type", "lijia");
        }
        return event;
    }

    /**
     * Tags every event of a batch.
     *
     * @param list the batch of events to tag
     * @return a new list holding the tagged events in their original order
     */
    @Override
    public List<Event> intercept(List<Event> list) {
        // A fresh list per call: reusing one shared field would clear a batch
        // that was already handed back to the caller and is not thread-safe.
        List<Event> tagged = new ArrayList<>(list.size());
        for (Event event : list) {
            tagged.add(intercept(event));
        }
        return tagged;
    }

    /** Nothing to release. */
    @Override
    public void close() {
    }

    /**
     * Builder referenced from the agent configuration
     * ({@code ...interceptors.<name>.type = <package>.InterceptorDemo$Builder}).
     */
    public static class Builder implements Interceptor.Builder {

        /** @return a new {@link InterceptorDemo} */
        @Override
        public Interceptor build() {
            return new InterceptorDemo();
        }

        /** This interceptor has no configurable properties. */
        @Override
        public void configure(Context context) {
        }
    }
}
# Flume agent "a1": a netcat source whose events are tagged by a custom interceptor
# and then routed by a multiplexing selector to one of two memory channels, each
# drained by its own HDFS sink.
a1.sources = r1
a1.channels = c1 c2
a1.sinks = k1 k2
# Source r1: netcat type, bound to localhost on port 44444.
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# Interceptor i1: the nested Builder class of the custom interceptor.
# NOTE(review): the package prefix "zcy" must match the package the jar's
# InterceptorDemo class was compiled in — confirm.
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = zcy.InterceptorDemo$Builder
# Multiplexing selector: route on the "type" header set by the interceptor.
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = type
a1.sources.r1.selector.mapping.gree = c1
a1.sources.r1.selector.mapping.lijia = c2
# Channels c1 and c2: memory type, capacity 1000, transactionCapacity 100.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100
# Sink k1: HDFS sink for events with type=gree.
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.filePrefix = gree
a1.sinks.k1.hdfs.fileSuffix = .csv
a1.sinks.k1.hdfs.path = hdfs://192.168.174.41:9000/user/greedemo/%Y-%m-%d
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# batchSize must not exceed the channel's transactionCapacity (100): the sink takes
# up to batchSize events in a single channel transaction, and a memory channel
# throws ChannelException ("Take list ... full") once the take list reaches
# transactionCapacity.
a1.sinks.k1.hdfs.batchSize = 100
# Roll settings: rollCount 0 turns off count-based rolling; rollSize (bytes) and
# rollInterval (seconds) are deliberately tiny here so demo files appear quickly.
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.rollSize = 100
a1.sinks.k1.hdfs.rollInterval = 3
# Sink k2: HDFS sink for events with type=lijia; same settings as k1.
a1.sinks.k2.type = hdfs
a1.sinks.k2.hdfs.fileType = DataStream
a1.sinks.k2.hdfs.filePrefix = lijia
a1.sinks.k2.hdfs.fileSuffix = .csv
a1.sinks.k2.hdfs.path = hdfs://192.168.174.41:9000/user/lijiademo/%Y-%m-%d
a1.sinks.k2.hdfs.useLocalTimeStamp = true
a1.sinks.k2.hdfs.batchSize = 100
a1.sinks.k2.hdfs.rollCount = 0
a1.sinks.k2.hdfs.rollSize = 100
a1.sinks.k2.hdfs.rollInterval = 3
# Wiring: r1 writes to both channels; k1 drains c1 and k2 drains c2.
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
作为avro客户端向avro服务端发送avro事件
sinks端
# Flume agent "a2" (sending side): an exec source that follows access.log and
# forwards the events to an Avro server through an avro sink.
a2.sources = r1
a2.sinks = k1
a2.channels = c1
# Source r1: runs the given command and turns each output line into an event.
# Use `tail -F` (capital F) rather than `tail -f`: -F follows the file by name and
# retries, so the source survives log rotation and a file that does not exist yet.
a2.sources.r1.type = exec
a2.sources.r1.command = tail -F /opt/bigdata/flume160514/conf/job/access.log
# Sink k1: avro sink sending to localhost:41414; an agent with an avro source
# must be listening on that host and port.
a2.sinks.k1.type = avro
a2.sinks.k1.hostname = localhost
a2.sinks.k1.port = 41414
# Channel c1: memory type with capacity 1000 and transactionCapacity 100.
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
# Wiring: r1 writes to c1 and k1 reads from c1.
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
sources端
# Flume agent "a1" (receiving side): an avro source feeding a logger sink through a memory channel.
# Declare the agent's components: one source (r1), one sink (k1), one channel (c1).
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Source r1: avro type, bound to localhost on port 41414.
a1.sources.r1.type = avro
a1.sources.r1.bind = localhost
a1.sources.r1.port = 41414
# Sink k1: logger type.
a1.sinks.k1.type = logger
# Channel c1: memory type with capacity 1000 and transactionCapacity 100.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Wiring: r1 writes to c1 and k1 reads from c1.
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
监控文件夹并上传至Kafka中
# Flume agent "test": a spooldir source feeding a Kafka sink through a file channel.
# Declare the agent's components. The source list is required: a source that is
# configured below but not listed here is never created by Flume.
test.sources = s1
test.channels = c1
test.sinks = k1
# Source s1: watch a directory and ingest files whose names match includePattern
# (e.g. test_2020-08-17.csv), one event per line.
test.sources.s1.type = spooldir
test.sources.s1.spoolDir = /opt/kb07file/flumeFile/test
test.sources.s1.deserializer = LINE
test.sources.s1.deserializer.maxLineLength = 60000
test.sources.s1.includePattern = test_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
# Channel c1: file channel. The data-directory property is "dataDirs" (plural);
# a misspelled "dataDir" is ignored and the channel falls back to its default location.
test.channels.c1.type = file
test.channels.c1.checkpointDir = /opt/kb07file/flumeFile/checkpoint/test
test.channels.c1.dataDirs = /opt/kb07file/flumeFile/data/test
# Sink k1: Kafka sink (Flume 1.6 property names) publishing to topic "test".
test.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
test.sinks.k1.batchSize = 640
test.sinks.k1.brokerList = 192.168.174.41:9092
test.sinks.k1.topic = test
# Wiring: s1 writes to c1 and k1 reads from c1.
test.sources.s1.channels = c1
test.sinks.k1.channel = c1