上篇:keyBy的4种用法
当下在Java里,Flink的Tuple最多只支持25个字段(Tuple1~Tuple25),如果字段数量超过这个限制如何解决?
采用自定义javaBean方式实现,代码演示
package cn._51doit.flink.day03;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* key的用法【无界流数据】
* 解决问题:为了解决Tuple只支持25个的数量限制,采用自定义javaBean方式实现
* 实现功能:在本节点执行nc -lk 8888命令后,对输入的数据进行累计,按省份和城市对金额进行汇总合并
*/
public class KeyedDemo06 {
    /**
     * keyBy usage on an unbounded stream, keyed by a custom Java bean (POJO).
     *
     * <p>Flink's Tuple types only go up to 25 fields; using a POJO ({@link DataBean})
     * removes that limit. Reads "province,city,money" lines from a socket
     * (start the source with: nc -lk 8888) and sums the money per (province, city).
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
        // Each input line has the shape: province,city,money
        DataStreamSource<String> lines = env.socketTextStream("Master", 8888);
        SingleOutputStreamOperator<DataBean> provinceCityAndMoney = lines.map(new MapFunction<String, DataBean>() {
            @Override
            public DataBean map(String value) throws Exception {
                String[] fields = value.split(",");
                return new DataBean(fields[0], fields[1], Integer.parseInt(fields[2]));
            }
        });
        // keyBy(String...) field expressions are deprecated in Flink; use a
        // KeySelector lambda instead. The composite key is "province|city";
        // NOTE: this assumes neither field contains the '|' delimiter.
        KeyedStream<DataBean, String> keyed =
                provinceCityAndMoney.keyBy(bean -> bean.province + "|" + bean.city);
        keyed.sum("money").print();
        env.execute();
    }

    /** POJO carrying one record: province, city and an amount of money. */
    public static class DataBean {
        public String province;
        public String city;
        public Integer money;

        /** No-arg constructor required by Flink's POJO type serializer. */
        public DataBean() {}

        public DataBean(String province, String city, Integer money) {
            this.province = province;
            this.city = city;
            this.money = money;
        }

        @Override
        public String toString() {
            return "DataBean{" +
                    "province='" + province + '\'' +
                    ", city='" + city + '\'' +
                    ", money=" + money +
                    '}';
        }
    }
}
控制台打印输出
需求:
对输入的单词做汇总操作,把输入的一整行内容作为整体的key,只统计它出现的次数
第一种方式:重写getKey方法
代码实现
package cn._51doit.flink.day03;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* key的用法
* 重写getKey方法【无界流数据】
* 实现功能:按本节点的nc -lk 8888命令下,对输入的单词按做汇总操作,把输入的条件参数作为整体
* 只会统计出现的次数
*
* 比如输入参数:
* 上海市,徐汇区,500
* 上海市,徐汇区,2500
* 上海市,徐汇区,500
*
* 控制台打印信息:
* 1> (上海市,徐汇区,500,1)
* 4> (上海市,徐汇区,2500,1)
* 1> (上海市,徐汇区,500,2)
* 4> (上海市,徐汇区,2500,2)
* 1> (上海市,徐汇区,500,3)
* 1> (上海市,徐汇区,500,4)
* 4> (上海市,徐汇区,2500,3)
* 1> (上海市,徐汇区,500,5)
*
* 说明:比如,(上海市,徐汇区,500,5)例子中,把“上海市,徐汇区,500”这一整行作为整体,5是它出现的次数
*/
public class KeyedDemo07 {
    /**
     * keyBy via an explicit {@link KeySelector}: the whole input line is the key,
     * so sum(1) counts how many times each distinct line has been seen.
     * Feed data with: nc -lk 8888
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env =
                StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration());
        DataStreamSource<String> source = env.socketTextStream("Master", 8888);
        // Pair every line with an initial count of 1; .returns(...) is needed
        // because the lambda erases the Tuple2 type parameters.
        SingleOutputStreamOperator<Tuple2<String, Integer>> pairs =
                source.map(line -> Tuple2.of(line, 1))
                        .returns(Types.TUPLE(Types.STRING, Types.INT));
        // First approach: implement getKey in an anonymous KeySelector.
        KeyedStream<Tuple2<String, Integer>, String> byLine = pairs.keyBy(
                new KeySelector<Tuple2<String, Integer>, String>() {
                    @Override
                    public String getKey(Tuple2<String, Integer> pair) throws Exception {
                        return pair.f0; // group by the string itself
                    }
                });
        byLine.sum(1).print();
        env.execute();
    }
}
控制台打印输出:
第二种方式:采用Lambda表达式的使用
package cn._51doit.flink.day03;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* key的用法
* Lambda表达式的使用【无限流数据】
* 实现功能:按本节点的nc -lk 8888命令下,对输入的单词按做汇总操作,把输入的条件参数作为整体
* 只会统计出现的次数
*
* 比如输入参数:
* 上海市,徐汇区,500
* 上海市,徐汇区,2500
* 上海市,徐汇区,500
*
* 控制台打印信息:
* 1> (上海市,徐汇区,500,1)
* 4> (上海市,徐汇区,2500,1)
* 1> (上海市,徐汇区,500,2)
* 4> (上海市,徐汇区,2500,2)
* 1> (上海市,徐汇区,500,3)
* 1> (上海市,徐汇区,500,4)
* 4> (上海市,徐汇区,2500,3)
* 1> (上海市,徐汇区,500,5)
*
* 说明:比如,(上海市,徐汇区,500,5)例子中,把“上海市,徐汇区,500”这一整行作为整体,5是它出现的次数
*/
public class KeyedDemo08 {
    /**
     * keyBy via a lambda KeySelector: the whole input line is the key,
     * so sum(1) counts how many times each distinct line has been seen.
     * Feed data with: nc -lk 8888
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        // Build the whole pipeline as one chain; .returns(...) restores the
        // Tuple2 type information the lambda erases.
        env.socketTextStream("Master", 8888)
                .map(line -> Tuple2.of(line, 1))
                .returns(Types.TUPLE(Types.STRING, Types.INT))
                .keyBy(record -> record.f0) // second approach: lambda key selector
                .sum(1)
                .print();
        env.execute();
    }
}
keyBy底层如何实现?
(1)在底层KeyedStream类查看,发现它会new ReduceTransformation,接着还会返回
new SingleOutputStreamOperator,采用聚合算子reduce进行传参,它会通过clean方法做闭包检测,检查传入的函数是否可以被序列化
(2)程序真正工作采用聚合算子reduce,这个算子ReduceFunction接口类里,有个reduce方法经过聚合的结果进行更新,然后再输出
说明:以后查看源代码有两种方式
- 第一种,可以选择这个类,鼠标右键选择Find Usages,根据控制台的提示观察
- 第二种方式,通过debug打断点去观察
(3)程序之所以可以累加,主要是把中间的结果保存起来了
reduce源码分析
(1)ctrl+n搜索“StreamGroupedReduceOperator”,查看,有输入IN参数,输出ReduceFunction参数
(2)在聚合函数reducer会传到父类里,在执行的时候会拿到userFunction,而reducer是自己实现按照聚合逻辑把数据进行运算,运算完毕之后会更新最新状态,最后把数据输出
keyBy底层源码分析
(1)在“keyBy”点击进去查看,发现生成KeyedStream的实例,然后把父类的实例包了一层,并对key进行clean检测闭包检测是否被序列化
(2)在KeyedStream里面有个重载的方法,持有父类的dataStream,会生成一个新的PartitionTransformation,而KeyGroupStreamPartitioner就是按key对数据进行分区的分区器
(3)KeyGroupStreamPartitioner里面有几个重要的方法,比如:selectChannel方法会调用用户自己实现的getKey方法,接着还会调用assignKeyToParallelOperator方法来决定数据分发到哪个并行子任务