前面介绍了如何从kafka获取数据,那么当大量数据格式不一致的时候想对某一种类型进行处理该如何实现呢?
Flink 为我们提供了一种处理方式:分流处理。
一、分流场景
原Stream流需要拆分为Stream1和Stream2流
Stream1需要再次拆分为StreamA,StreamB流
Stream2需要再次拆分为StreamC,StreamD流
二、分流方式
flink为我们提供了如下三种分流方式
1. filter分流
2. split分流 (只能一次分流,分流后的流不能继续分流)
3. side output分流
前置配置
// Execution environment: event-time semantics, parallelism fixed to 1.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

// Basic Kafka settings for the Kafka 0.8 consumer.
Properties properties = new Properties();
properties.setProperty("zookeeper.connect", "zk地址");
properties.setProperty("bootstrap.servers", "kafka地址");
properties.setProperty("group.id", "groupid");
properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty("auto.offset.reset", "earliest");

// Kafka consumer that reads the log topic as plain strings.
FlinkKafkaConsumer08<String> kafkaSource =
        new FlinkKafkaConsumer08<>("topic", new SimpleStringSchema(), properties);

// Raw log lines as a Flink data stream.
DataStream<String> logSource = env.addSource(kafkaSource);
// Convert each raw JSON log line into an Object instance.
DataStream<Object> dataStream = logSource.map(new MapFunction<String, Object>() {
    // Built on the first record and reused afterwards, instead of constructing and
    // configuring a new mapper for every element. Marked transient so the mapper is
    // never part of the serialized function and is simply rebuilt where map() runs.
    private transient ObjectMapper objectMapper;

    @Override
    public Object map(String s) throws Exception {
        if (objectMapper == null) {
            objectMapper = new ObjectMapper();
            // Ignore JSON fields that have no matching property on the target type.
            objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
        }
        // Parse the String directly: s.getBytes() encodes with the platform default
        // charset, which can corrupt non-ASCII content before the JSON parser sees it.
        return objectMapper.readValue(s, Object.class);
    }
});
filter分流
// Keep only the records whose type is "type1"; every other record is dropped.
SingleOutputStreamOperator<Object> filterStream = dataStream.filter(new FilterFunction<Object>() {
    @Override
    public boolean filter(Object s) throws Exception {
        // Constant-first equals: a record without a type is filtered out
        // instead of raising a NullPointerException.
        return "type1".equals(s.getType());
    }
});
split分流
// NOTE: split() has been marked as deprecated in Flink.
// Tag every record with the name of the stream it belongs to.
SplitStream<Object> splitStream = dataStream.split(new OutputSelector<Object>() {
    @Override
    public Iterable<String> select(Object object) {
        List<String> tags = new ArrayList<>();
        String type = object.getType();
        // Constant-first equals so a record without a type does not raise a
        // NullPointerException; it gets no tag, like any other unmatched type.
        if ("type1".equals(type)) {
            tags.add("stream1");
        } else if ("type2".equals(type)) {
            tags.add("stream2");
        }
        return tags;
    }
});
// Pick the tagged sub-streams by name.
DataStream<Object> stream1 = splitStream.select("stream1");
DataStream<Object> stream2 = splitStream.select("stream2");
side output分流
// Tags identifying the two side outputs. Each tag is created as an anonymous
// subclass (note the trailing braces) of OutputTag<Object>.
final OutputTag<Object> stream1OutputTag = new OutputTag<Object>("stream1") {};
final OutputTag<Object> stream2OutputTag = new OutputTag<Object>("stream2") {};
// Split the stream: every record is written to one of the two side outputs and is
// also forwarded unchanged on the main output.
SingleOutputStreamOperator<Object> dataStreamSide = dataStream.process(new ProcessFunction<Object, Object>() {
    @Override
    public void processElement(Object object, Context context, Collector<Object> collector) throws Exception {
        // Fill in a placeholder when the record carries no system value.
        if (object.getSystem() == null) {
            object.setSystem("unknow");
        }
        // Route by log type. Constant-first equals means a record without a type
        // falls through to the stream2 side output rather than raising a
        // NullPointerException.
        if ("type1".equals(object.getType())) {
            context.output(stream1OutputTag, object);
        } else {
            context.output(stream2OutputTag, object);
        }
        // Main output still receives every record.
        collector.collect(object);
    }
});
// The split streams. Side outputs must be read from the operator returned by
// process(...) (dataStreamSide), not from the input stream it was applied to.
DataStream<Object> stream1 = dataStreamSide.getSideOutput(stream1OutputTag);
DataStream<Object> stream2 = dataStreamSide.getSideOutput(stream2OutputTag);