GroupBy 算子:基于指定字段分组(类似于 SQL 中的 group by 分组),对数据分组结果进行聚合统计。
示例环境
java.version: 1.8.x
flink.version: 1.11.1
示例数据源 (项目码云下载)
GroupBy.java
package com.flink.examples.functions;
import com.flink.examples.DataSource;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import org.apache.flink.util.Collector;
import java.util.List;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Table API {@code groupBy} example: groups rows by a given field (similar to
 * SQL {@code GROUP BY}) and computes aggregates over each group.
 */
public class GroupBy {

    /**
     * Builds a table from an in-memory collection of (name, sex, age) tuples,
     * filters it, groups it by {@code sex}, and prints the per-group count of
     * {@code name} and sum of {@code age}.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the Flink job fails to execute
     */
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Build the StreamTableEnvironment on top of the streaming environment.
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

        List<Tuple3<String, String, Integer>> tuple3List = DataSource.getTuple3ToList();
        DataStream<Tuple3<String, String, Integer>> dataStream = env.fromCollection(tuple3List);
        // Name the three tuple fields so they can be referenced in expressions.
        Table table = tEnv.fromDataStream(dataStream, $("name"), $("sex"), $("age"));

        Table counts = table
                // Keep only rows with age > 20 (i.e. drop rows with age <= 20).
                .filter($("age").isGreater(20))
                // Drop rows whose name is null.
                .filter($("name").isNotNull())
                // Group by sex.
                .groupBy($("sex"))
                // Aggregate per group: sex, number of names, sum of ages.
                .select($("sex"), $("name").count(), $("age").sum());

        // A grouped aggregation produces updates, so it is converted to a retract
        // stream: each element is a (flag, row) pair.
        DataStream<Tuple2<Boolean, Row>> behaviorStream = tEnv.toRetractStream(counts, Row.class);
        behaviorStream.flatMap(new FlatMapFunction<Tuple2<Boolean, Row>, Object>() {
            @Override
            public void flatMap(Tuple2<Boolean, Row> value, Collector<Object> out) {
                // Forward only the elements whose flag is true; elements with a
                // false flag are skipped.
                if (value.f0) {
                    out.collect(value.f1);
                }
            }
        }).print();

        env.execute("flink groupBy job");
    }
}
打印结果(行首的 "N>" 为并行子任务编号;由于使用回撤流,每到达一条数据都会输出该分组更新后的聚合结果,因此同一分组会出现多行)
4> man,1,29
2> girl,1,24
2> girl,2,56
4> man,2,59