Flink算子（KeyBy的源码分析及案例）

最新推荐文章于 2024-06-12 09:35:10 发布

undo_try

最新推荐文章于 2024-06-12 09:35:10 发布

阅读量3.9k

点赞数 5

分类专栏： # flink

本文链接：https://blog.csdn.net/qq_44665283/article/details/118400669

版权

flink 专栏收录该内容

26 篇文章 16 订阅

订阅专栏

在Flink中，Transformation主要负责对数据的转换操作，调用Transformation后会生成一个新的DataStream。

1、KeyBy的源码分析

总结：
保证key相同的数据一定进入到同一个分区内，但是一个分区内可以有多个key的数据；
是对数据进行实时的分区，不是上游发送给下游，而是将数据写入到对应的channel的缓存中，下游到上游实时拉取；
keyBy底层是new KeyedStream，然后将父DataStream包起来，并且传入keyBy的条件（keySelector）；
最终会调用KeyGroupStreamPartitioner的selectChannel方法，将keyBy的条件的返回值传入到该方法中；
流程：
1.先计算key的HashCode值（有可能会是负的）
2.将key的HashCode值进行特殊的hash处理，MathUtils.murmurHash(keyHash)，一定返回非负数，避免返回的数字为负
3.将返回的特殊hash值对最大并行度（maxParallelism，默认是128）取模，得到keyGroupId
4.keyGroupId * parallelism（程序的并行度） / maxParallelism（默认最大并行），返回分区编号
注意：1.如果将自定义POJO当成key，必须重写hashCode方法；2.不能将数组当成keyBy的key

/**
 * Partitions this stream by the key that the given selector extracts from each element.
 *
 * <p>The selector is rejected if it is null, passed through {@code clean(...)} (defined
 * elsewhere in the enclosing class) and handed, together with this stream, to a new
 * {@link KeyedStream}.
 *
 * @param key selector that extracts the key from an element; must not be null
 * @param <K> type of the extracted key
 * @return a new KeyedStream wrapping this stream
 */
public <K> KeyedStream<T, K> keyBy(KeySelector<T, K> key) {
        // checkNotNull hands back its argument, so the validated selector can be named directly.
        final KeySelector<T, K> selector = Preconditions.checkNotNull(key);
        return new KeyedStream<>(this, clean(selector));
    }


/**
 * Creates a KeyedStream on top of {@code dataStream}.
 *
 * <p>The parent stream's transformation is wrapped in a {@code PartitionTransformation} whose
 * partitioner is a {@code KeyGroupStreamPartitioner} built from the key selector and
 * {@code StreamGraphGenerator.DEFAULT_LOWER_BOUND_MAX_PARALLELISM}; everything is then
 * forwarded to another constructor of this class.
 *
 * @param dataStream parent stream whose elements are partitioned
 * @param keySelector function that extracts the key from an element
 * @param keyType type information of the extracted key
 */
public KeyedStream(
            DataStream<T> dataStream,
            KeySelector<T, KEY> keySelector,
            TypeInformation<KEY> keyType) {
        this(
                dataStream,
                new PartitionTransformation<>(
                        dataStream.getTransformation(),
                        new KeyGroupStreamPartitioner<>(
                                keySelector,
                                StreamGraphGenerator.DEFAULT_LOWER_BOUND_MAX_PARALLELISM)),
                keySelector,
                keyType);
    }

// ---------------------------------------------------------------------------------------
// The members below are an excerpt of KeyGroupStreamPartitioner, the partitioner that the
// constructor above instantiates.
// ---------------------------------------------------------------------------------------

    /**
     * Creates a partitioner that routes records by key group.
     *
     * @param keySelector function that extracts the key from a record; must not be null
     * @param maxParallelism number of key groups; must be greater than 0
     */
    public KeyGroupStreamPartitioner(KeySelector<T, K> keySelector, int maxParallelism) {
        Preconditions.checkArgument(maxParallelism > 0, "Number of key-groups must be > 0!");
        this.keySelector = Preconditions.checkNotNull(keySelector);
        this.maxParallelism = maxParallelism;
    }

    /** Returns the maximum parallelism (number of key groups) this partitioner was built with. */
    public int getMaxParallelism() {
        return maxParallelism;
    }

    /**
     * Chooses the output channel for a record.
     *
     * <p>The key is extracted from the record's value with the key selector; any exception
     * thrown by the selector is rethrown as a RuntimeException that keeps the original cause.
     * The channel index is then computed by
     * {@code KeyGroupRangeAssignment.assignKeyToParallelOperator} from the key, the maximum
     * parallelism and {@code numberOfChannels} (a field declared outside this excerpt).
     *
     * @param record serialization delegate holding the stream record to route
     * @return index of the selected output channel
     */
    @Override
    public int selectChannel(SerializationDelegate<StreamRecord<T>> record) {
        K key;
        try {
            key = keySelector.getKey(record.getInstance().getValue());
        } catch (Exception e) {
            throw new RuntimeException(
                    "Could not extract key from " + record.getInstance().getValue(), e);
        }
        return KeyGroupRangeAssignment.assignKeyToParallelOperator(
                key, maxParallelism, numberOfChannels);
    }

/**
 * Maps a key to the index of the parallel operator instance that is responsible for it.
 *
 * <p>The key is first assigned to a key group, and the key group is then mapped to an
 * operator index for the given parallelism.
 *
 * @param key the key to assign; must not be null
 * @param maxParallelism the maximum parallelism (number of key groups)
 * @param parallelism the current parallelism of the operator
 * @return index of the parallel operator instance the key belongs to
 */
public static int assignKeyToParallelOperator(Object key, int maxParallelism, int parallelism) {
        Preconditions.checkNotNull(key, "Assigned key must not be null!");
        final int keyGroupId = assignToKeyGroup(key, maxParallelism);
        return computeOperatorIndexForKeyGroup(maxParallelism, parallelism, keyGroupId);
    }

    /**
     * Assigns a key to a key group based on the key's {@code hashCode()}.
     *
     * @param key the key to assign; must not be null
     * @param maxParallelism the maximum parallelism (number of key groups)
     * @return id of the key group the key belongs to
     */
    public static int assignToKeyGroup(Object key, int maxParallelism) {
        Preconditions.checkNotNull(key, "Assigned key must not be null!");
        final int keyHash = key.hashCode();
        return computeKeyGroupForKeyHash(keyHash, maxParallelism);
    }
    
// Re-hashes the key's hashCode with MathUtils.murmurHash and takes it modulo maxParallelism
// to obtain the key group id.
// NOTE(review): the article states that murmurHash never returns a negative number, which is
// what keeps the resulting key group id non-negative - confirm against MathUtils.
public static int computeKeyGroupForKeyHash(int keyHash, int maxParallelism) {
        final int scrambledHash = MathUtils.murmurHash(keyHash);
        return scrambledHash % maxParallelism;
    }
// Maps a key group id to the index of the parallel operator instance (partition number):
// keyGroupId * parallelism / maxParallelism, using integer division.
// This is the method assignKeyToParallelOperator calls; the article had pasted a second copy
// of computeKeyGroupForKeyHash here instead.
//
// maxParallelism - the maximum parallelism (number of key groups)
// parallelism    - the current parallelism of the operator
// keyGroupId     - id of the key group, expected in [0, maxParallelism)
public static int computeOperatorIndexForKeyGroup(int maxParallelism, int parallelism, int keyGroupId) {
        return keyGroupId * parallelism / maxParallelism;
    }

2、KeyBy的案例

import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Demonstrates the different ways of turning a DataStream into a KeyedStream with keyBy.
 *
 * <p>Every variant is kept in its own local variable so that the class compiles; only the
 * last variant (keying by a user-defined class) is printed.
 *
 * <p>Note: all branches read the same socket source. The comma-based branches index
 * {@code words[0..2]} and call {@code Integer.parseInt}, so they expect input lines of the
 * form {@code province,city,persons}.
 */
public class _04_KeyedStream {
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration());
        DataStreamSource<String> source = env.socketTextStream("centos01", 8888);

        // Split every line on spaces and emit one (word, 1) tuple per word.
        // The explicit returns(...) supplies the tuple's generic type information for the lambda.
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = source.flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
            for (String word : line.split(" ")) {
                out.collect(Tuple2.of(word, 1));
            }
        }).returns(TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {}));

        /*
         Variant 1 (deprecated style): key by tuple position or by field name.
         In KeyedStream<Tuple2<String, Integer>, Tuple> the first type argument is the element
         type and the second one (Tuple) is the type of the grouping key.
         Elements with the same key always go to the same partition (the same subtask); one
         partition may hold zero or more different keys.
         A position can only be passed when the stream's elements are of a TupleN type.
        */
        // Key by the first tuple position.
        KeyedStream<Tuple2<String, Integer>, Tuple> keyedByIndex = wordAndOne.keyBy(0);

        // Key by the tuple's field name.
        KeyedStream<Tuple2<String, Integer>, Tuple> keyedByFieldName = wordAndOne.keyBy("f0");

        // Variant 2: key with an explicit KeySelector implementation.
        KeyedStream<Tuple2<String, Integer>, String> keyedBySelector =
                wordAndOne.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
                    @Override
                    public String getKey(Tuple2<String, Integer> value) throws Exception {
                        return value.f0;
                    }
                });

        // Variant 3: the same KeySelector written as a lambda; the key type is again String.
        KeyedStream<Tuple2<String, Integer>, String> keyedByLambda = wordAndOne.keyBy(tp2 -> tp2.f0);

        // Input such as "河南,周口,8000": key by province AND city together.
        // The composite key is a nested generic type, so its type information is passed explicitly.
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> provinceCityTuples = source.map(line -> {
            String[] words = line.split(",");
            String province = words[0];
            String city = words[1];
            int persons = Integer.parseInt(words[2]);
            return Tuple3.of(province, city, persons);
        }).returns(TypeInformation.of(new TypeHint<Tuple3<String, String, Integer>>() {}));

        KeyedStream<Tuple3<String, String, Integer>, Tuple2<String, String>> keyedByProvinceAndCity =
                provinceCityTuples.keyBy(
                        tp3 -> Tuple2.of(tp3.f0, tp3.f1),
                        TypeInformation.of(new TypeHint<Tuple2<String, String>>() {}));

        // Variant 4: key by a user-defined class that carries the composite key.
        SingleOutputStreamOperator<_00_ProvinceAndCity> provinceCityPojos = source.map(line -> {
            String[] words = line.split(",");
            String province = words[0];
            String city = words[1];
            int persons = Integer.parseInt(words[2]);
            return _00_ProvinceAndCity.of(province, city, persons);
        });

        // The element itself is the key; grouping relies on _00_ProvinceAndCity's hashCode/equals.
        KeyedStream<_00_ProvinceAndCity, _00_ProvinceAndCity> keyedByPojo = provinceCityPojos.keyBy(p -> p);

        keyedByPojo.print();
        env.execute("_04_KeyedStream");
    }
}

将自定义POJO当成key，必须重写hashcode方法

public class _00_ProvinceAndCity {
    public String province;
    public String city;
    public int persons;

    public _00_ProvinceAndCity(String province, String city, int persons) {
        this.province = province;
        this.city = city;
        this.persons = persons;
    }
    //模仿Tuple
    public static  _00_ProvinceAndCity of (String province, String city, int persons){
        return new _00_ProvinceAndCity(province,city,persons);
    }

    //因为keyby是按照key的hashcode进行分组，因此需要重写hashcode和equals方法
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        _00_ProvinceAndCity that = (_00_ProvinceAndCity) o;
        return province.equals(that.province) &&
                city.equals(that.city);
    }
    @Override
    public int hashCode() {
        return Objects.hash(province, city);
    }
    @Override
    public String toString() {
        return "_00_ProvinceAndCity{" +
                "province='" + province + '\'' +
                ", city='" + city + '\'' +
                ", persons=" + persons +
                '}';
    }
}

undo_try

关注

5
点赞
踩
17

收藏

觉得还不错? 一键收藏
2
评论
Flink算子（KeyBy的源码分析及案例）

在Flink中，Transformation主要负责对数据的转换操作，调用Transformation后会生成一个新的DataStream。1、KeyBy的源码分析总结：保证key相同的一定进入到一个分区内，但是一个分区内可以有多key的数据；是对数据进行实时的分区，不是上游发送给下游，而是将数据写入到对应的channel的缓存中，下游到上游实时拉取；keyBy底层是new KeyedStream，然后将父DataStream包起来，并且传入keyBy的条件（keySelector）；最终会调用
复制链接

扫一扫