在Flink中,Transformation主要负责对数据的转换操作,调用Transformation后会生成一个新的DataStream。
1、KeyBy的源码分析
总结:
保证key相同的数据一定进入到同一个分区内,但是一个分区内可以有多个key的数据;
是对数据进行实时的分区,不是上游发送给下游,而是将数据写入到对应的channel的缓存中,下游到上游实时拉取;
keyBy底层是new KeyedStream,然后将父DataStream包起来,并且传入keyBy的条件(keySelector);
最终会调用KeyGroupStreamPartitioner的selectChannel方法,将keyBy的条件的返回值传入到该方法中;
流程:
1.先计算key的HashCode值(有可能会是负的)
2.将key的HashCode值进行特殊的hash处理,MathUtils.murmurHash(keyHash),一定返回非负数,避免返回的数字为负
3.将返回的特殊hash值对最大并行度(maxParallelism)取模,默认是128,得到keyGroupId
4.keyGroupId * parallelism(程序的并行度) / maxParallelism(默认最大并行度),返回分区编号
注意:1.如果将自定义POJO当成key,必须重写hashCode方法;2.不能将数组当成keyBy的key
/**
 * Partitions this stream by the key that {@code key} extracts from each element.
 *
 * @param key selector that returns the grouping key for an element; must not be null
 * @return a {@link KeyedStream} wrapping this stream together with the cleaned selector
 */
public <K> KeyedStream<T, K> keyBy(KeySelector<T, K> key) {
    // checkNotNull hands back its argument, so validation and cleaning chain in one expression.
    final KeySelector<T, K> cleanedSelector = clean(Preconditions.checkNotNull(key));
    return new KeyedStream<>(this, cleanedSelector);
}
/**
 * Creates a keyed stream on top of {@code dataStream}.
 *
 * <p>The parent stream's transformation is wrapped in a {@code PartitionTransformation}
 * whose partitioner is a {@code KeyGroupStreamPartitioner} built from the same
 * {@code keySelector} and {@code StreamGraphGenerator.DEFAULT_LOWER_BOUND_MAX_PARALLELISM}.
 *
 * @param dataStream  the parent stream to partition
 * @param keySelector extracts the key from each element
 * @param keyType     type information of the extracted key
 */
public KeyedStream(
        DataStream<T> dataStream,
        KeySelector<T, KEY> keySelector,
        TypeInformation<KEY> keyType) {
    this(
            dataStream,
            new PartitionTransformation<>(
                    dataStream.getTransformation(),
                    new KeyGroupStreamPartitioner<>(
                            keySelector,
                            StreamGraphGenerator.DEFAULT_LOWER_BOUND_MAX_PARALLELISM)),
            keySelector,
            keyType);
}

// ---------------------------------------------------------------------------
// The members below belong to KeyGroupStreamPartitioner, the partitioner that
// the constructor above instantiates.
// ---------------------------------------------------------------------------

/**
 * Creates a partitioner that routes records by key group.
 *
 * @param keySelector    extracts the key from each record; must not be null
 * @param maxParallelism number of key groups; must be greater than zero
 */
public KeyGroupStreamPartitioner(KeySelector<T, K> keySelector, int maxParallelism) {
    Preconditions.checkArgument(maxParallelism > 0, "Number of key-groups must be > 0!");
    this.keySelector = Preconditions.checkNotNull(keySelector);
    this.maxParallelism = maxParallelism;
}

/** Returns the maximum parallelism (number of key groups) this partitioner was built with. */
public int getMaxParallelism() {
    return maxParallelism;
}

/**
 * Picks the output channel for {@code record}: extracts the key with the key selector and
 * delegates to {@code KeyGroupRangeAssignment.assignKeyToParallelOperator}.
 *
 * @throws RuntimeException if the key selector throws while extracting the key
 */
@Override
public int selectChannel(SerializationDelegate<StreamRecord<T>> record) {
    K key;
    try {
        key = keySelector.getKey(record.getInstance().getValue());
    } catch (Exception e) {
        // Keep the original cause so the failing selector can be diagnosed.
        throw new RuntimeException(
                "Could not extract key from " + record.getInstance().getValue(), e);
    }
    return KeyGroupRangeAssignment.assignKeyToParallelOperator(
            key, maxParallelism, numberOfChannels);
}
/**
 * Maps a key to the index of the parallel operator instance responsible for it.
 *
 * @param key            the key to assign; must not be null
 * @param maxParallelism the maximum parallelism (number of key groups)
 * @param parallelism    the current parallelism of the operator
 * @return the index of the parallel operator instance for {@code key}
 */
public static int assignKeyToParallelOperator(Object key, int maxParallelism, int parallelism) {
    Preconditions.checkNotNull(key, "Assigned key must not be null!");
    // Two steps: key -> key group, then key group -> operator index.
    final int keyGroupId = assignToKeyGroup(key, maxParallelism);
    return computeOperatorIndexForKeyGroup(maxParallelism, parallelism, keyGroupId);
}
/**
 * Maps a key to its key group, based on the key's {@code hashCode()}.
 *
 * @param key            the key to assign; must not be null
 * @param maxParallelism the maximum parallelism (number of key groups)
 * @return the id of the key group for {@code key}
 */
public static int assignToKeyGroup(Object key, int maxParallelism) {
    Preconditions.checkNotNull(key, "Assigned key must not be null!");
    final int keyHash = key.hashCode();
    return computeKeyGroupForKeyHash(keyHash, maxParallelism);
}
// Re-hashes the key's hashCode with MathUtils.murmurHash(keyHash) and takes it modulo
// maxParallelism to get the key group id. Per the original note, the re-hash is there so the
// result is never negative (NOTE(review): murmurHash is not visible here; confirm its range).
public static int computeKeyGroupForKeyHash(int keyHash, int maxParallelism) {
    final int scrambledHash = MathUtils.murmurHash(keyHash);
    return scrambledHash % maxParallelism;
}
/**
 * Maps a key group to the index of the parallel operator instance (partition number) that
 * owns it: {@code keyGroupId * parallelism / maxParallelism}, using integer division.
 *
 * @param maxParallelism the maximum parallelism (number of key groups)
 * @param parallelism    the current parallelism of the operator
 * @param keyGroupId     the id of the key group
 * @return the index of the parallel operator instance for the key group
 */
public static int computeOperatorIndexForKeyGroup(int maxParallelism, int parallelism, int keyGroupId) {
    return keyGroupId * parallelism / maxParallelism;
}
2、KeyBy的案例
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
 * Demonstrates the different ways of calling {@code keyBy} on a DataStream.
 *
 * <p>Every approach is built side by side from the same socket source, each under its own
 * variable name; only the last one (keyed by a custom class) is printed.
 *
 * <p>Note: the word-count approaches split each line on a space, while the province/city
 * approaches split on a comma, index {@code words[1]} / {@code words[2]} and parse an int,
 * so those expect lines shaped like {@code province,city,persons}.
 */
public class _04_KeyedStream {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env =
                StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration());
        DataStreamSource<String> source = env.socketTextStream("centos01", 8888);

        // Split each line into words and emit (word, 1). The lambda erases the generic tuple
        // type, so it is supplied explicitly through returns(...).
        SingleOutputStreamOperator<Tuple2<String, Integer>> flatMap = source
                .flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
                    for (String word : line.split(" ")) {
                        out.collect(Tuple2.of(word, 1));
                    }
                })
                .returns(TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {}));

        /*
         * Approach 1 (outdated): group by positional index or by field name.
         *   Tuple2<String, Integer> is the type of the elements inside a group;
         *   Tuple is the type of the grouping key.
         * Records with the same key always go to the same partition (the same subtask);
         * one partition may hold zero or more different groups.
         * An index can only be passed when the stream's elements are of a TupleN type.
         */
        // Group by the first column.
        KeyedStream<Tuple2<String, Integer>, Tuple> keyedByIndex = flatMap.keyBy(0);
        // Group by the tuple's field name.
        KeyedStream<Tuple2<String, Integer>, Tuple> keyedByFieldName = flatMap.keyBy("f0");

        // Approach 2: group with an explicit KeySelector.
        KeyedStream<Tuple2<String, Integer>, String> keyedBySelector =
                flatMap.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
                    @Override
                    public String getKey(Tuple2<String, Integer> value) throws Exception {
                        return value.f0;
                    }
                });

        // Approach 3: group with a KeySelector written as a lambda; same key type as above.
        KeyedStream<Tuple2<String, Integer>, String> keyedByLambda = flatMap.keyBy(tp2 -> tp2.f0);

        // Input such as "河南,周口,8000": group by province and city together. The key is a
        // nested generic type, so its type information is passed explicitly.
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> provinceCityTuples = source
                .map(line -> {
                    String[] words = line.split(",");
                    String province = words[0];
                    String city = words[1];
                    int persons = Integer.parseInt(words[2]);
                    return Tuple3.of(province, city, persons);
                })
                .returns(TypeInformation.of(new TypeHint<Tuple3<String, String, Integer>>() {}));
        KeyedStream<Tuple3<String, String, Integer>, Tuple2<String, String>> keyedByTuple =
                provinceCityTuples.keyBy(
                        tp3 -> Tuple2.of(tp3.f0, tp3.f1),
                        TypeInformation.of(new TypeHint<Tuple2<String, String>>() {}));

        // Approach 4: group by a custom class that carries both province and city.
        SingleOutputStreamOperator<_00_ProvinceAndCity> provinceCityPojos = source.map(line -> {
            String[] words = line.split(",");
            String province = words[0];
            String city = words[1];
            int persons = Integer.parseInt(words[2]);
            return _00_ProvinceAndCity.of(province, city, persons);
        });
        KeyedStream<_00_ProvinceAndCity, _00_ProvinceAndCity> keyedByPojo =
                provinceCityPojos.keyBy(p -> p);

        keyedByPojo.print();
        env.execute("_04_KeyedStream");
    }
}
将自定义POJO当成key,必须重写hashCode和equals方法
public class _00_ProvinceAndCity {
public String province;
public String city;
public int persons;
public _00_ProvinceAndCity(String province, String city, int persons) {
this.province = province;
this.city = city;
this.persons = persons;
}
//模仿Tuple
public static _00_ProvinceAndCity of (String province, String city, int persons){
return new _00_ProvinceAndCity(province,city,persons);
}
//因为keyby是按照key的hashcode进行分组,因此需要重写hashcode和equals方法
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
_00_ProvinceAndCity that = (_00_ProvinceAndCity) o;
return province.equals(that.province) &&
city.equals(that.city);
}
@Override
public int hashCode() {
return Objects.hash(province, city);
}
@Override
public String toString() {
return "_00_ProvinceAndCity{" +
"province='" + province + '\'' +
", city='" + city + '\'' +
", persons=" + persons +
'}';
}
}