原文链接:https://blog.csdn.net/wangpei1949/article/details/99698868
本文总结Flink DataStream 中非常有用的功能,分流和合流。
分流(Split/Side)
分流可以将一个流拆分成多个流。
基于Split...Select...
package com.bigdata.flink;
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.ArrayList;
/**
* Author: Wang Pei
* Summary:
* 分流:基于Split-Select
*/
@Slf4j
public class SplitStreamBySplit {
    public static void main(String[] args) throws Exception {
        /** Execution environment */
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /** Input source: (productID, eventType, userID) tuples */
        DataStreamSource<Tuple3<String, String, String>> source = env.fromElements(
                new Tuple3<>("productID1", "click", "user_1"),
                new Tuple3<>("productID1", "click", "user_2"),
                new Tuple3<>("productID1", "browse", "user_1"),
                new Tuple3<>("productID2", "browse", "user_1"),
                new Tuple3<>("productID2", "click", "user_2"),
                new Tuple3<>("productID2", "click", "user_1")
        );

        /** Step 1: define the tagging logic. split() only labels each record;
         *  it does not fork the stream by itself. */
        SplitStream<Tuple3<String, String, String>> tagged = source.split(new OutputSelector<Tuple3<String, String, String>>() {
            @Override
            public Iterable<String> select(Tuple3<String, String, String> value) {
                ArrayList<String> tags = new ArrayList<>();
                String product = value.f0;
                // Tag the record with its own product id when it is one we care about;
                // anything else gets no tag and is dropped by every select().
                if ("productID1".equals(product) || "productID2".equals(product)) {
                    tags.add(product);
                }
                return tags;
            }
        });

        /** Step 2: materialize the split by selecting one tag. */
        tagged.select("productID1").print();

        env.execute();
    }
}
注意:
Split...Select...中Split只是对流中的数据打上标记,并没有将流真正拆分。可通过Select算子将流真正拆分出来。Split...Select...不能连续分流。即不能Split...Select...Split,但可以如Split...Select...Filter...Split。Split...Select...已经过时,推荐使用更灵活的侧路输出(Side-Output),如下。
基于Side-Output
package com.bigdata.flink;
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
/**
* Author: Wang Pei
* Summary:
* 分流:基于SideOutput(侧路输出)
*/
@Slf4j
public class SplitStreamBySideOutput {
    public static void main(String[] args) throws Exception {
        /** Execution environment */
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /** Input source: (productID, eventType, userID) tuples */
        DataStreamSource<Tuple3<String, String, String>> source = env.fromElements(
                new Tuple3<>("productID1", "click", "user_1"),
                new Tuple3<>("productID1", "click", "user_2"),
                new Tuple3<>("productID1", "browse", "user_1"),
                new Tuple3<>("productID2", "browse", "user_1"),
                new Tuple3<>("productID2", "click", "user_2"),
                new Tuple3<>("productID2", "click", "user_1")
        );

        /** Step 1: the OutputTag that identifies the side output.
         *  The anonymous subclass ({}) preserves the generic type information. */
        final OutputTag<Tuple3<String, String, String>> sideOutputTag =
                new OutputTag<Tuple3<String, String, String>>("side-output-tag"){};

        /** Step 2: route records between the main stream and the side output
         *  inside a ProcessFunction. */
        SingleOutputStreamOperator<Tuple3<String, String, String>> mainStream =
                source.process(new ProcessFunction<Tuple3<String, String, String>, Tuple3<String, String, String>>() {
            @Override
            public void processElement(Tuple3<String, String, String> value, Context ctx,
                                       Collector<Tuple3<String, String, String>> out) throws Exception {
                if (!"productID1".equals(value.f0)) {
                    // Main stream: everything that is not productID1.
                    out.collect(value);
                } else {
                    // Side output: only productID1 records.
                    ctx.output(sideOutputTag, value);
                }
            }
        });

        // The operator's direct output is the main stream.
        mainStream.print();
        // The side stream is fetched from the same operator via its tag.
        mainStream.getSideOutput(sideOutputTag).print();

        env.execute();
    }
}
注意:
Side-Output是从Flink 1.3.0开始提供的功能,支持了更灵活的多路输出。Side-Output可以以侧流的形式,以不同于主流的数据类型,向下游输出指定条件的数据、异常数据、迟到数据等等。Side-Output通过ProcessFunction将数据发送到侧路OutputTag。
合流(Union/Connect)
合流可以将多个流合并成一个流。
基于Union
package com.bigdata.flink;
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* Author: Wang Pei
* Summary:
* 合流:基于Union
*/
@Slf4j
public class UnionStreamByUnion {
    public static void main(String[] args) throws Exception {
        /** Execution environment */
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /** First input source */
        DataStreamSource<Tuple3<String, String, String>> firstSource = env.fromElements(
                new Tuple3<>("productID1", "click", "user_1")
        );

        /** Second input source */
        DataStreamSource<Tuple3<String, String, String>> secondSource = env.fromElements(
                new Tuple3<>("productID3", "click", "user_1"),
                new Tuple3<>("productID3", "click", "user_2")
        );

        /** Third input source */
        DataStreamSource<Tuple3<String, String, String>> thirdSource = env.fromElements(
                new Tuple3<>("productID2", "browse", "user_1"),
                new Tuple3<>("productID2", "click", "user_2"),
                new Tuple3<>("productID2", "click", "user_1")
        );

        /** Merge all three streams (same element type) into one and print it. */
        firstSource.union(secondSource, thirdSource).print();

        env.execute();
    }
}
注意:
Union可以将两个或多个同数据类型的流合并成一个流。
基于Connect
package com.bigdata.flink;
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;
/**
* Author: Wang Pei
* Summary:
* 合流:基于Connect
*/
@Slf4j
public class UnionStreamByConnect {
    public static void main(String[] args) throws Exception {
        /** Execution environment */
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /** First input source: structured tuples */
        DataStreamSource<Tuple3<String, String, String>> tupleSource = env.fromElements(
                new Tuple3<>("productID1", "click", "user_1")
        );

        /** Second input source: colon-delimited raw strings */
        DataStreamSource<String> stringSource = env.fromElements(
                "productID3:click:user_1",
                "productID3:browse:user_2"
        );

        /** Step 1: connect the two streams. Unlike union, connect accepts
         *  streams of different element types. */
        ConnectedStreams<Tuple3<String, String, String>, String> connected = tupleSource.connect(stringSource);

        /** Step 2: process the connected streams with a CoMapFunction,
         *  producing a single (eventType, userID) output type. */
        SingleOutputStreamOperator<Tuple2<String, String>> mapped =
                connected.map(new CoMapFunction<Tuple3<String, String, String>, String, Tuple2<String, String>>() {
            // Handler for elements of the first (tuple) stream.
            @Override
            public Tuple2<String, String> map1(Tuple3<String, String, String> value) throws Exception {
                return new Tuple2<>(value.f1, value.f2);
            }

            // Handler for elements of the second (string) stream:
            // fields[0]=productID, fields[1]=eventType, fields[2]=userID.
            @Override
            public Tuple2<String, String> map2(String value) throws Exception {
                String[] fields = value.split(":");
                return new Tuple2<>(fields[1], fields[2]);
            }
        });

        mapped.print();

        env.execute();
    }
}
注意:
Connect可以用来合并两种不同类型的流。Connect合并后,可用map中的CoMapFunction或flatMap中的CoFlatMapFunction来对合并流中的每个流进行处理。
628

被折叠的评论
为什么被折叠?



