Preface
Operators transform one or more DataStreams into a new DataStream, and an application can chain multiple transformation operators into a complex dataflow topology.
The sections below walk through a selection of operators. The examples are meant to be copied and pasted directly - quick and simple.
Setting up the development environment in IDEA
Remember to install the Lombok plugin.
The pom file is as follows:
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<flink.version>1.9.2</flink.version>
<scala.binary.version>2.11</scala.binary.version>
</properties>
<dependencies>
<!-- Apache Flink dependencies -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>RELEASE</version>
<scope>compile</scope>
</dependency>
</dependencies>
The UserBean class is as follows:
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Created with IntelliJ IDEA
 *
 * @Author: erfeng
 * @Date: 2021/09/02 9:50
 * @Description:
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class UserBean {
private String userID;
private long eventTime;
private String eventType;
private String productID;
private int productPrice;
}
With the preparation above done, let's get to know the operators and how to use them.
Operator introduction and usage
Aggregate
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.ArrayList;
/**
 * Summary:
 * Aggregate: sum(), min(), minBy(), max(), maxBy() perform rolling aggregations and emit the result after every update.
 */
public class DataStreamAggregateOperator {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Input: user actions. A user clicked or browsed some product at some point in time, together with the product price.
ArrayList<UserBean> userBeans = new ArrayList<>();
UserBean userbeanlog1 = new UserBean();
userbeanlog1.setUserID("userID1");
userbeanlog1.setProductID("productID3");
userbeanlog1.setProductPrice(10);
userBeans.add(userbeanlog1);
UserBean userbeanlog2 = new UserBean();
userbeanlog2.setUserID("userID2");
userbeanlog2.setProductPrice(10);
userBeans.add(userbeanlog2);
UserBean userbeanlog3 = new UserBean();
userbeanlog3.setUserID("userID1");
userbeanlog3.setProductID("productID5");
userbeanlog3.setProductPrice(30);
userBeans.add(userbeanlog3);
DataStreamSource<UserBean> source = env.fromCollection(userBeans);
// Transformation: KeyBy repartitions the data.
// UserBean is a POJO type here, so partitioning via keyBy("userID") would also work.
KeyedStream<UserBean, String> keyedStream = source.keyBy(new KeySelector<UserBean, String>() {
@Override
public String getKey(UserBean value) throws Exception {
return value.getUserID();
}
});
// Per key the input is: userID1 -> 10, 30; userID2 -> 10
// Transformation: aggregate and print. Uncomment the variant you want to try;
// at least one must be active, otherwise the job has no operators to execute.
// Rolling sum
keyedStream.sum("productPrice").print();
// Rolling max (only the aggregated field is updated)
// keyedStream.max("productPrice").print();
// Rolling max (emits the whole element that holds the max)
// keyedStream.maxBy("productPrice").print();
// Rolling min (only the aggregated field is updated)
// keyedStream.min("productPrice").print();
// Rolling min (emits the whole element that holds the min)
// keyedStream.minBy("productPrice").print();
env.execute("flink aggregate operator");
}
}
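A note on max() versus maxBy(): max() updates only the aggregated field and keeps the other fields from the first element seen for that key, whereas maxBy() emits the complete element that contains the maximum. With the userID1 input above (productID3 at price 10, then productID5 at price 30), the final results should look roughly like this:
// keyedStream.max("productPrice")   -> UserBean(userID=userID1, ..., productID=productID3, productPrice=30)
// keyedStream.maxBy("productPrice") -> UserBean(userID=userID1, ..., productID=productID5, productPrice=30)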
Union
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * Summary:
 * Union: merges multiple streams into one stream so the merged stream can be processed uniformly,
 * somewhat like funneling the output of two upstream Bolts into the same downstream Bolt in Storm.
 * Note: all streams being merged must have the same type.
 */
public class DataStreamUnionOperator {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env=StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<Tuple1<String>> dataStream1=env.fromElements(
Tuple1.of("flink"),
Tuple1.of("spark"),
Tuple1.of("hadoop")
);
DataStream<Tuple1<String>> dataStream2=env.fromElements(
Tuple1.of("oracle"),
Tuple1.of("mysql"),
Tuple1.of("sqlserver")
);
dataStream1.union(dataStream2).print();
env.execute("flink union operator ");
}
}
// Output to the console
// 3> (flink)
// 4> (spark)
// 4> (mysql)
// 1> (hadoop)
// 1> (sqlserver)
// 3> (oracle)
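Since union() requires identical element types, a stream of a different type has to be mapped into the common type before the union. A minimal sketch (the words stream is hypothetical; dataStream2 is the one from the example above, and MapFunction needs its usual import):
DataStream<String> words = env.fromElements("flink", "spark");
DataStream<Tuple1<String>> wrapped = words.map(new MapFunction<String, Tuple1<String>>() {
    @Override
    public Tuple1<String> map(String value) {
        // Wrap each plain string so the type matches dataStream2
        return Tuple1.of(value);
    }
});
wrapped.union(dataStream2).print();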
Split
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.ArrayList;
import java.util.List;
/**
 * Summary:
 * Split: splits one data stream into multiple streams according to a rule.
 * A stream can only be split once; the resulting streams cannot be split again.
 * (Before Flink 1.9 a second split still produced output but silently took no effect;
 * from Flink 1.9 on, a second split fails with an error that recommends side outputs.)
 */
public class DataStreamSplitOperator {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env=StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<Integer> dataStream=env.fromElements(
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
);
// Define the split logic
SplitStream<Integer> split=dataStream.split(new OutputSelector<Integer>() {
@Override
public Iterable<String> select(Integer integer) {
List<String> output=new ArrayList<>();
if(integer%2==0){
// even numbers
output.add("even");
}else {
// odd numbers
output.add("odd");
}
return output;
}
});
// Select one or more of the split streams
DataStream<Integer> evenStream=split.select("even");
// DataStream<Integer> oddStream=split.select("odd");
// DataStream<Integer> moreStream =split.select("odd","even");
evenStream.print().setParallelism(1);
env.execute("flink split operator");
}
}
// Output
//2
//4
//6
//8
//10
//12
//14
//16
// Note:
// In Split...Select..., split() only tags the elements in the stream; it does not actually split it. The real split happens when select() pulls the tagged streams apart.
// Split...Select... cannot be chained directly: Split...Select...Split fails, but a pattern like Split...Select...Filter...Split works.
Side-Output
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
/**
 * Summary:
 * Side-Output: Split...Select... is deprecated; the more flexible side outputs are recommended instead, as shown below.
 * Side outputs have been available since Flink 1.3.0 and allow flexible multi-way output, including from rich process functions.
 * A side output stream may even have a different type than the main stream, and can carry data matching some condition, corrupt records, late data, and so on.
 */
public class DataStreamSideOperator {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env=StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<UserBean> dataStream=env.fromElements(
new UserBean("userID1", 1293984000, "click", "productID1", 10),
new UserBean("userID1", 1293984000, "browse", "productID1", 10),
new UserBean("userID2", 1292952000, "browse", "productID2", 20),
new UserBean("userID2", 1243983000, "click", "productID2", 15),
new UserBean("userID3", 1293184000, "click", "productID1", 30)
);
// Define the OutputTags
final OutputTag<UserBean> clickTag=new OutputTag<UserBean>("click-Tag", TypeInformation.of(UserBean.class));
final OutputTag<UserBean> browseTag=new OutputTag<UserBean>("browse-Tag",TypeInformation.of(UserBean.class));
// Handle the main stream and the side stream in a ProcessFunction
SingleOutputStreamOperator<UserBean> processedStream=
dataStream.process(new ProcessFunction<UserBean, UserBean>() {
@Override
public void processElement(UserBean userAction, Context context, Collector<UserBean> collector) throws Exception {
// Side stream: only emit the data matching the condition
if(userAction.getEventType().equals("click")){
context.output(clickTag,userAction);
// Main stream
}else {
collector.collect(userAction);
}
}
});
// Get the main stream
processedStream.print("main output B");
// Get the side stream
processedStream.getSideOutput(clickTag).print("side output A");
env.execute("flink Side-Output operator");
}
}
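A note on the tags: instead of passing TypeInformation explicitly, an OutputTag can also be created as an anonymous subclass (note the trailing {}), which lets Flink extract the element type on its own:
// Equivalent tag declaration without an explicit TypeInformation argument
final OutputTag<UserBean> clickTag = new OutputTag<UserBean>("click-Tag") {};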
Reduce
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
/**
 * Summary:
 * Reduce: performs a rolling aggregation based on a ReduceFunction and emits the result of every update downstream.
 */
public class DataStreamReduceOperator {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Input: user actions. A user clicked or browsed some product at some point in time, together with the product price.
DataStreamSource<UserBean> source = env.fromCollection(Arrays.asList(
new UserBean("userID1", 1293984000, "click", "productID1", 10),
new UserBean("userID2", 1293984001, "browse", "productID2", 8),
new UserBean("userID2", 1293984002, "browse", "productID2", 8),
new UserBean("userID2", 1293984003, "browse", "productID2", 8),
new UserBean("userID1", 1293984002, "click", "productID1", 10),
new UserBean("userID1", 1293984003, "click", "productID3", 10),
new UserBean("userID1", 1293984004, "click", "productID1", 10)
));
// Transformation: KeyBy repartitions the data
KeyedStream<UserBean, String> keyedStream = source.keyBy(new KeySelector<UserBean, String>() {
@Override
public String getKey(UserBean value) throws Exception {
return value.getUserID();
}
});
// Transformation: rolling aggregation with Reduce. Here, the total product price per user is accumulated.
SingleOutputStreamOperator<UserBean> result = keyedStream.reduce(new ReduceFunction<UserBean>() {
@Override
public UserBean reduce(UserBean value1, UserBean value2) throws Exception {
int newProductPrice = value1.getProductPrice() + value2.getProductPrice();
return new UserBean(value1.getUserID(), -1, "", "", newProductPrice);
}
});
result.print();
env.execute("flink reduce operator");
}
}
// Output: the result of each rolling aggregation is printed to the console.
//3> UserBean(userID=userID2, eventTime=1293984001, eventType=browse, productID=productID2, productPrice=8)
//3> UserBean(userID=userID2, eventTime=-1, eventType=, productID=, productPrice=16)
//3> UserBean(userID=userID2, eventTime=-1, eventType=, productID=, productPrice=24)
//4> UserBean(userID=userID1, eventTime=1293984000, eventType=click, productID=productID1, productPrice=10)
//4> UserBean(userID=userID1, eventTime=-1, eventType=, productID=, productPrice=20)
//4> UserBean(userID=userID1, eventTime=-1, eventType=, productID=, productPrice=30)
//4> UserBean(userID=userID1, eventTime=-1, eventType=, productID=, productPrice=40)
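The same rolling aggregation can be written more compactly as a lambda; since a ReduceFunction's input and output types are identical, no extra type hint is needed. A minimal sketch:
// Rolling sum of product prices per user, as a lambda
SingleOutputStreamOperator<UserBean> result = keyedStream.reduce(
    (v1, v2) -> new UserBean(v1.getUserID(), -1, "", "", v1.getProductPrice() + v2.getProductPrice()));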
Map
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
/**
 * Summary:
 * Map: a one-to-one transformation.
 */
public class DataStreamMapOperator {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Input: user actions. A user clicked or browsed some product at some point in time, together with the product price.
DataStreamSource<UserBean> source = env.fromCollection(Arrays.asList(
new UserBean("userID1", 1293984000, "click", "productID1", 10),
new UserBean("userID2", 1293984001, "browse", "productID2", 8),
new UserBean("userID1", 1293984002, "click", "productID1", 10)
));
// Transformation: multiply the product price by 8
SingleOutputStreamOperator<UserBean> result = source.map(new MapFunction<UserBean, UserBean>() {
@Override
public UserBean map(UserBean value) throws Exception {
int newPrice = value.getProductPrice() * 8;
return new UserBean(value.getUserID(), value.getEventTime(), value.getEventType(), value.getProductID(), newPrice);
}
});
result.print();
env.execute("flink map operator");
}
}
// Output: printed to the console
// UserBean(userID=userID1, eventTime=1293984002, eventType=click, productID=productID1, productPrice=80)
// UserBean(userID=userID1, eventTime=1293984000, eventType=click, productID=productID1, productPrice=80)
// UserBean(userID=userID2, eventTime=1293984001, eventType=browse, productID=productID2, productPrice=64)
KeyBy
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
/**
 * Summary:
 * KeyBy: repartitions the data by the given key, placing all elements with the same key in the same partition.
 */
public class DataStreamKeyByOperator {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// Input: user actions. A user clicked or browsed some product at some point in time, together with the product price.
DataStreamSource<UserBean> source = env.fromCollection(Arrays.asList(
new UserBean("userID1", 1293984000, "click", "productID1", 10),
new UserBean("userID2", 1293984001, "browse", "productID2", 8),
new UserBean("userID1", 1293984002, "click", "productID1", 10),
new UserBean("userID3", 1293984002, "click", "productID1", 10),
new UserBean("userID1", 1293984002, "click", "productID1", 10),
new UserBean("userID2", 1293984002, "click", "productID1", 10)
));
// Transformation: repartition by the given key (here, the user ID) so that records with the same key end up in the same partition
KeyedStream<UserBean, String> result = source.keyBy(new KeySelector<UserBean, String>() {
@Override
public String getKey(UserBean value) throws Exception {
return value.getUserID();
}
});
result.print().setParallelism(4);
env.execute("flink keyby operator");
}
}
// Output: printed to the console
//3> UserBean(userID=userID1, eventTime=1293984000, eventType=click, productID=productID1, productPrice=10)
//3> UserBean(userID=userID1, eventTime=1293984002, eventType=click, productID=productID1, productPrice=10)
//2> UserBean(userID=userID2, eventTime=1293984001, eventType=browse, productID=productID2, productPrice=8)
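For a single-field key, the KeySelector can also be written as a method reference. A minimal sketch; if Flink's type extraction ever complains about the lambda form, fall back to the anonymous class above:
// Repartition by user ID using a method reference
KeyedStream<UserBean, String> byUser = source.keyBy(UserBean::getUserID);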
Join
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
/**
 * Summary:
 * Join: joins two streams on a given key within a window.
 */
public class DataStreamJoinOperator {
public static void main(String[] args) throws Exception {
//1. Get the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//2. Define the two sources (here, bounded in-memory collections)
DataStream<UserBean> textStream1 = env.fromElements(
new UserBean("userID1", 1293914003, "browse", "productID2", 8),
new UserBean("userID1", 1292984002, "click", "productID1", 10),
new UserBean("userID3", 1293384003, "click", "productID3", 10)
);
DataStream<UserBean> textStream2 = env.fromElements(
new UserBean("userID1", 1293984003, "browse", "productID2", 8),
new UserBean("userID1", 1293924002, "click", "productID1", 10),
new UserBean("userID3", 1293914103, "click", "productID1", 10)
);
// Pass the inputs through identity map() transformations (a hook for pre-processing; see the commented-out lines below)
DataStream<UserBean> mapStream1=textStream1
.map(new MapFunction<UserBean, UserBean>() {
@Override
public UserBean map(UserBean userAction) throws Exception {
//userAction.setProductID("mapStream1");
return userAction;
}
});
DataStream<UserBean> mapStream2=textStream2
.map(new MapFunction<UserBean, UserBean>() {
@Override
public UserBean map(UserBean userAction) throws Exception {
//userAction.setProductID("mapStream2");
return userAction;
}
});
//3. Join the two streams. This is an inner join: only elements that match are kept.
DataStream<String> result = mapStream1.join(mapStream2)
// Join condition: match on the userID field of both streams
.where(t1->t1.getUserID()).equalTo(t2->t2.getUserID())
// A 10-second tumbling window in processing time
.window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
// Emit the joined result
.apply((t1,t2)->t1+"|"+t2)
;
//4. Print sink
result.print();
//5. Execute
env.execute();
}
}
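Two caveats are worth noting here. First, with a bounded fromElements source and a processing-time window, the job may finish before the 10-second window fires, so the join can print nothing; in practice this pattern is used with unbounded sources such as sockets or Kafka. Second, join is strictly inner: keys that appear in only one stream are dropped. If you also need those, coGroup uses the same where/equalTo/window pattern. A minimal sketch (CoGroupFunction and Collector need their usual imports):
DataStream<String> coGrouped = mapStream1.coGroup(mapStream2)
    .where(UserBean::getUserID).equalTo(UserBean::getUserID)
    .window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
    .apply(new CoGroupFunction<UserBean, UserBean, String>() {
        @Override
        public void coGroup(Iterable<UserBean> first, Iterable<UserBean> second, Collector<String> out) {
            // Unlike join, coGroup is also invoked for keys present in only
            // one of the two streams (the other side is then empty)
            out.collect("left=" + first + " | right=" + second);
        }
    });
coGrouped.print();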
Fold
import org.apache.flink.api.common.functions.FoldFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
/**
 * Summary:
 * Fold: rolling-folds the stream onto an initial value with a custom FoldFunction and emits each new value.
 * Note: fold() is deprecated (and removed in newer Flink versions); a stateful alternative is sketched after this example.
 */
public class DataStreamFoldOperator {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Input: user actions. A user clicked or browsed some product at some point in time, together with the product price.
DataStreamSource<UserBean> source = env.fromCollection(Arrays.asList(
new UserBean("userID1", 1293984000, "click", "productID1", 10),
new UserBean("userID2", 1293984001, "browse", "productID2", 8),
new UserBean("userID2", 1293984002, "browse", "productID2", 8),
new UserBean("userID2", 1293984003, "browse", "productID2", 8),
new UserBean("userID1", 1293984002, "click", "productID1", 10),
new UserBean("userID1", 1293984003, "click", "productID3", 10),
new UserBean("userID1", 1293984004, "click", "productID1", 10)
));
// Transformation: KeyBy repartitions the data
KeyedStream<UserBean, String> keyedStream = source.keyBy(new KeySelector<UserBean, String>() {
@Override
public String getKey(UserBean value) throws Exception {
return value.getUserID();
}
});
// Transformation: Fold rolls up from the initial value with the FoldFunction
SingleOutputStreamOperator<String> result = keyedStream.fold("browsed products and prices:", new FoldFunction<UserBean, String>() {
@Override
public String fold(String accumulator, UserBean value) throws Exception {
if(accumulator.startsWith("userID")){
return accumulator + " -> " + value.getProductID()+":"+value.getProductPrice();
}else {
return value.getUserID()+" " +accumulator + " -> " + value.getProductID()+":"+value.getProductPrice();
}
}
});
result.print();
env.execute("flink fold operator");
}
}
// Output: printed to the console
// Every element triggers a computation and an output
// userID1 browsed products and prices: -> productID1:10
// userID1 browsed products and prices: -> productID1:10 -> productID1:10
// userID1 browsed products and prices: -> productID1:10 -> productID1:10 -> productID3:10
// userID1 browsed products and prices: -> productID1:10 -> productID1:10 -> productID3:10 -> productID1:10
// userID2 browsed products and prices: -> productID2:8
// userID2 browsed products and prices: -> productID2:8 -> productID2:8
// userID2 browsed products and prices: -> productID2:8 -> productID2:8 -> productID2:8
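Since fold() is deprecated, the same per-key rolling accumulation can be expressed with a KeyedProcessFunction and keyed state. A minimal sketch that reproduces the string-building logic above (it needs imports for KeyedProcessFunction, ValueState, ValueStateDescriptor, Configuration, and Collector):
SingleOutputStreamOperator<String> folded = keyedStream
    .process(new KeyedProcessFunction<String, UserBean, String>() {
        // Per-key accumulator, managed by Flink's keyed state
        private transient ValueState<String> acc;

        @Override
        public void open(Configuration parameters) {
            acc = getRuntimeContext().getState(new ValueStateDescriptor<>("acc", String.class));
        }

        @Override
        public void processElement(UserBean value, Context ctx, Collector<String> out) throws Exception {
            String current = acc.value();
            if (current == null) {
                // Seed the accumulator with the key, mirroring the fold output format
                current = value.getUserID() + " browsed products and prices:";
            }
            current = current + " -> " + value.getProductID() + ":" + value.getProductPrice();
            acc.update(current);
            out.collect(current);
        }
    });
folded.print();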
FlatMap
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
 * Summary:
 * FlatMap: one input row becomes any number of output rows (zero or more).
 */
public class DataStreamFlatMapOperator {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Input: English movie quotes
DataStreamSource<String> source = env
.fromElements(
"You jump I jump",
"Life was like a box of chocolates"
);
// Transformation: turn the sentences containing "chocolates" into one word per line
SingleOutputStreamOperator<String> result = source.flatMap(new FlatMapFunction<String, String>() {
@Override
public void flatMap(String value, Collector<String> out) throws Exception {
if(value.contains("chocolates")){
String[] words = value.split(" ");
for (String word : words) {
out.collect(word);
}
}
}
});
result.print();
env.execute("flink flatmap operator");
}
}
// Output: printed to the console
// Life
// was
// like
// a
// box
// of
// chocolates
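The same flatMap can be written as a lambda. Because Java erases the generic type of the Collector parameter, a lambda flatMap needs an explicit output type via returns() (Types comes from org.apache.flink.api.common.typeinfo.Types). A minimal sketch:
SingleOutputStreamOperator<String> result = source
    .flatMap((String value, Collector<String> out) -> {
        if (value.contains("chocolates")) {
            for (String word : value.split(" ")) {
                out.collect(word);
            }
        }
    })
    .returns(Types.STRING);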
Filter
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
/**
 * Summary:
 * Filter: keeps only the elements you need.
 */
public class DataStreamFilterOperator {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Input: user actions. A user clicked or browsed some product at some point in time, together with the product price.
DataStreamSource<UserBean> source = env.fromCollection(Arrays.asList(
new UserBean("userID1", 1293984000, "click", "productID1", 10),
new UserBean("userID2", 1293984001, "browse", "productID2", 8),
new UserBean("userID1", 1293984002, "click", "productID1", 10)
));
// Filter: keep only the actions of user userID1
SingleOutputStreamOperator<UserBean> result = source.filter(new FilterFunction<UserBean>() {
@Override
public boolean filter(UserBean value) throws Exception {
return value.getUserID().equals("userID1");
}
});
result.print();
env.execute("flink filter operator");
}
}
// Output: printed to the console
// UserBean(userID=userID1, eventTime=1293984002, eventType=click, productID=productID1, productPrice=10)
// UserBean(userID=userID1, eventTime=1293984000, eventType=click, productID=productID1, productPrice=10)
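Filter translates directly to a lambda as well. A minimal sketch:
// Keep only the actions of userID1
SingleOutputStreamOperator<UserBean> result = source.filter(value -> "userID1".equals(value.getUserID()));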
Takeaways
For more operators, see the official Flink documentation.