模拟数据
u14519,e1,mall,1614674623000
u11822,e1,waimai,1614674624000
u14539,e1,mall,1614674623000
u11842,e1,waimai,1614674624000
u14559,e1,mall,1614674623000
u11862,e1,mall,1614674624000
u14579,e1,waimai,1614674623000
u11882,e1,mall,1614674624000
u14599,e1,waimai,1614674623000
u11812,e1,waimai,1614674624000
u14529,e1,mall,1614674623000
u11832,e1,waimai,1614674624000
需求 :a.log里有两类log ,waimai的和mall的,我现在需要用flume采集log数据,waimai的采集到hdfs,mall的采集到kafka。
需要定义两个channel,两个sink 一个channel接hdfs,一个接kafka,我们需要配置一个多路选择器,flume采集到一条数据要放到哪个channel 需要根据这个数据的header中的标记信息判断,如果是waimai放到hdfs channel,否则kafka channel
但是数据header中肯定是没有这个flag信息的,所以需要我们写一个拦截器,每条数据采集到之后先进行处理-- 打标记 。
taildir-multiplexing-hdfs-kafka.properties
a1.sources = r1
a1.channels = c1 c2
a1.sinks = k1 k2
# The source feeds both channels; the multiplexing selector picks one per event
a1.sources.r1.channels = c1 c2
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = g1
a1.sources.r1.filegroups.g1 = /logdata/a.*
# One interceptor that stamps each event's headers with flag/timestamp
a1.sources.r1.interceptors = i1
# FIX: the builder class is declared in package com.ws (the old
# cn.ws.yiee.flume.* FQCN would fail with ClassNotFoundException)
a1.sources.r1.interceptors.i1.type = com.ws.MultiplexingInterceptor$MultiplexingInterceptorBuilder
# 0-based positions (after comma-split) of the flag and the timestamp fields;
# parameterized so the interceptor can be reused for other line layouts
a1.sources.r1.interceptors.i1.flagfield = 2
a1.sources.r1.interceptors.i1.timestampfield = 3
# Multiplexing channel selector keyed on the "flag" header set by the interceptor
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = flag
# flag == mall  -> channel c1 (Kafka sink); flag == waimai -> channel c2 (HDFS sink)
a1.sources.r1.selector.mapping.mall = c1
a1.sources.r1.selector.mapping.waimai = c2
# Any other value also goes to c2
a1.sources.r1.selector.default = c2
# Both channels are in-memory
a1.channels.c1.type = memory
a1.channels.c2.type = memory
# Kafka sink for mall events
a1.sinks.k1.channel = c1
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = dream1:9092,dream2:9092,dream3:9092
a1.sinks.k1.kafka.topic = mall
a1.sinks.k1.kafka.producer.acks = 1
# HDFS sink for waimai events; %Y-%m-%d/%H escapes use the "timestamp" header
a1.sinks.k2.channel = c2
a1.sinks.k2.type = hdfs
a1.sinks.k2.hdfs.path = hdfs://dream1:9000/waimai/%Y-%m-%d/%H
a1.sinks.k2.hdfs.filePrefix = doitedu-log-
a1.sinks.k2.hdfs.fileSuffix = .log
a1.sinks.k2.hdfs.rollSize = 268435456
a1.sinks.k2.hdfs.rollInterval = 120
a1.sinks.k2.hdfs.rollCount = 0
# Default is SequenceFile; use DataStream so the files are plain readable text
a1.sinks.k2.hdfs.fileType = DataStream
自定义拦截器
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.9.0</version>
<scope>provided</scope>
</dependency>
package com.ws;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.util.List;
import java.util.Map;
/**
 * Flume interceptor that tags each event with the headers the multiplexing
 * channel selector and the HDFS sink need:
 *   "flag"      - routing value (e.g. mall / waimai) read from a configured field
 *   "timestamp" - epoch-millis string used by the HDFS sink's %Y-%m-%d/%H path escapes
 * Field positions are configurable so the interceptor can be reused for other
 * comma-separated line layouts.
 */
public class MultiplexingInterceptor implements Interceptor {
    // 0-based index (after splitting the line on ',') of the routing-flag field
    private final Integer flagIndex;
    // 0-based index of the epoch-millis timestamp field
    private final Integer timeStampIndex;

    public MultiplexingInterceptor(Integer flagIndex, Integer timeStampIndex) {
        this.flagIndex = flagIndex;
        this.timeStampIndex = timeStampIndex;
    }

    @Override
    public void initialize() {
        // nothing to set up
    }

    /**
     * Stamps the "flag" and "timestamp" headers onto the event.
     * Malformed lines (too few comma-separated fields) are passed through
     * without headers instead of throwing ArrayIndexOutOfBoundsException,
     * which would otherwise fail the whole batch in the channel processor;
     * such events fall to the selector's default channel.
     */
    @Override
    public Event intercept(Event event) {
        byte[] body = event.getBody();
        String line = new String(body);
        String[] fields = line.split(",");
        int maxIndex = Math.max(flagIndex, timeStampIndex);
        if (fields.length > maxIndex) {
            Map<String, String> headers = event.getHeaders();
            // routing value consumed by the multiplexing selector (mall/waimai)
            headers.put("flag", fields[flagIndex]);
            // epoch millis consumed by the HDFS sink to build the target directory
            headers.put("timestamp", fields[timeStampIndex]);
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        for (Event event : list) {
            intercept(event);
        }
        return list;
    }

    @Override
    public void close() {
        // no resources to release
    }

    /** Builder wired in the agent config; reads the field positions from it. */
    public static class MultiplexingInterceptorBuilder implements Interceptor.Builder {
        Integer flagIndex;
        Integer timeStampIndex;

        @Override
        public Interceptor build() {
            return new MultiplexingInterceptor(flagIndex, timeStampIndex);
        }

        @Override
        public void configure(Context context) {
            // keys "flagfield" / "timestampfield" from the .properties file
            flagIndex = context.getInteger(Constants.FLAG);
            timeStampIndex = context.getInteger(Constants.TIMESTAMP);
        }
    }
}
package com.ws;
// Configuration key names shared between the Flume agent .properties file
// and MultiplexingInterceptorBuilder.configure().
public class Constants {
// property key: position of the routing-flag field within a comma-split line
public static final String FLAG = "flagfield";
// property key: position of the timestamp field within a comma-split line
public static final String TIMESTAMP = "timestampfield";
}
创建模拟数据
-- 模拟日志生成的脚本:
# Endless mock-log generator: every 0.2s append one event line to a.log,
# randomly tagged "waimai" or "mall" with a second-resolution epoch padded
# to pseudo-milliseconds.
while true
do
    if [ $((RANDOM % 2)) -ne 0 ]
    then
        category=mall
    else
        category=waimai
    fi
    echo "u$RANDOM,e1,$category,$(date +%s)000" >> a.log
    sleep 0.2
done
启动flume agent
[root@dream1 flume-1.9.0]# bin/flume-ng agent -c conf/ -f agentconf/taildir-multiplexing-hdfs-kafka.properties -n a1 -Dflume.root.logger=DEBUG,console
查看kafka
[root@dream2 ~]# /opt/apps/kafka_2.11-2.1.1/bin/kafka-console-consumer.sh --bootstrap-server dream1:9092,dream2:9092,dream3:9092 --topic mall
u12865,e1,mall,1614674621000
u12532,e1,mall,1614674621000
u28327,e1,mall,1614674622000
u11954,e1,mall,1614674622000
u14599,e1,mall,1614674623000
u11842,e1,mall,1614674624000
查看hdfs
[root@dream2 ~]# hadoop fs -tail /waimai/2021-03-02/16/doitedu-log-.1614674535986.log
21/03/02 16:44:55 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
614674645000
u29646,e1,waimai,1614674645000
u2987,e1,waimai,1614674645000
u19265,e1,waimai,1614674645000
u7311,e1,waimai,1614674646000
u611,e1,waimai,1614674646000
u23499,e1,waimai,1614674647000
u32585,e1,waimai,1614674647000
u8615,e1,waimai,1614674647000
u1546,e1,waimai,1614674647000
u32598,e1,waimai,1614674648000