11_Flink Streaming partition

最新推荐文章于 2024-08-03 17:58:56 发布

codemosi

最新推荐文章于 2024-08-03 17:58:56 发布

阅读量2.5k

点赞数

分类专栏： flink streaming

本文链接：https://blog.csdn.net/codemosi/article/details/51620622

版权

flink的处理逻辑包括

1：transform算子类型（我们指定的map，filter转换算子）

2：operator（我们实现的Function具体业务）

3：partition（数据的分区分组划分，上一个算子处理后的数据，如何传输给下一个算子。）

StreamPartitioner是partition的接口。需要实现这个接口，才能实现数据传输的逻辑。

1：copy;

2：int[] selectChannels(T record, int numChannels);

channel是数据往下游传输的通道，带有下标index。可以通过selectChannels来设置不同的传输方式。

除了keyBy这种按key的值做hash取模的分组方式（通过重写DataStream实现）之外，其他的分组方式都包装在DataStream对象里。也可以通过partitionCustom实现自己的分组方式，但受到CustomPartitionerWrapper实现的限制，只能指定一个channel通道。

1：shuffle，随机的发给下一个算子

2：broadcast，往下游的所有算子全部发送一次。

3：以及各种不常用的分组方式。

发生数据重分区就会涉及到序列化，和网络传输。

flink的序列化使用TypeSerializer来序列化常见类型，涉及sun的Unsafe类和flink自己的内存管理实现。TypeSerializer序列化不了的类型，就用Kryo序列化。

flink的网络使用akka，涉及到网络的模块，很多都用scala写。

每个记录通过SerializationDelegate包装起来。

每一个逻辑分区有一个ResultPartition对象。RecordWriter持有一个ResultPartitionWriter可以选择往哪个写数据。每个任务持有一个RecordWriterOutput，每个RecordWriterOutput持有一个RecordWriter。

我们写的业务逻辑Function持有一个Collector，可以调用collect，通过RecordWriterOutput间接地调用那一套数据传输逻辑。

数据先write到ResultPartition的ResultSubpartition对象里。该对象有两个实现类。一个使用iomanager模块的RequestQueue（LinkedBlockingQueue阻塞队列）。一个使用java.util.ArrayDeque双向队列。

通过ResultPartitionConsumableNotifier.notifyPartitionConsumable通知这个ResultPartition对应的下游来消费数据。网络传输走akka，通过ActorGateway.ask进行网络通信。

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.io.network.api.writer;

import org.apache.flink.core.io.IOReadableWritable;
import org.apache.flink.runtime.accumulators.AccumulatorRegistry;
import org.apache.flink.runtime.event.AbstractEvent;
import org.apache.flink.runtime.io.network.api.serialization.RecordSerializer;
import org.apache.flink.runtime.io.network.api.serialization.SpanningRecordSerializer;
import org.apache.flink.runtime.io.network.buffer.Buffer;

import java.io.IOException;

import static org.apache.flink.runtime.io.network.api.serialization.RecordSerializer.SerializationResult;

/**
 * A record-oriented runtime result writer.
 * <p>
 * The RecordWriter wraps the runtime's {@link ResultPartitionWriter} and takes care of
 * serializing records into buffers.
 * <p>
 * <strong>Important</strong>: it is necessary to call {@link #flush()} after
 * all records have been written with {@link #emit(IOReadableWritable)}. This
 * ensures that all produced records are written to the output stream (incl.
 * partially filled ones).
 *
 * @param <T> the type of the record that can be emitted with this record writer
 */
public class RecordWriter<T extends IOReadableWritable> {

	/** The wrapped partition writer; also the source of the buffer provider used when emitting. */
	protected final ResultPartitionWriter writer;

	/** Chooses the target channel(s) for each record passed to {@link #emit(IOReadableWritable)}. */
	private final ChannelSelector<T> channelSelector;

	/** Number of output channels, read once from the writer at construction time. */
	private final int numChannels;

	/** One {@link RecordSerializer} per outgoing channel, indexed by channel number. */
	private final RecordSerializer<T>[] serializers;

	/**
	 * Creates a record writer on top of the given partition writer, using a newly
	 * created {@link RoundRobinChannelSelector} as the channel selector.
	 *
	 * @param writer the partition writer that records are written to
	 */
	public RecordWriter(ResultPartitionWriter writer) {
		// Delegate to the two-argument constructor with the default selector.
		this(writer, new RoundRobinChannelSelector<T>());
	}

	@SuppressWarnings("unchecked")
	public RecordWriter(ResultPartitionWriter writer, ChannelSelector<T> channelSelector) {
		this.writer = writer;
		this.channelSelector = channelSelector;

		this.numChannels = writer.getNumberOfOutputChannels();

		/**
		 * The runtime exposes a channel abstraction for the produced results
		 * (see {@link ChannelSelector}). Every channel has an independent
		 * serializer.
		 */
		this.serializers = new SpanningRecordSerializer[numChannels];
		for (int i = 0; i < numChannels; i++) {
			serializers[i] = new SpanningRecordSerializer<T>();
		}
	}

	/**
	 * Emits the record to every channel chosen by the {@link ChannelSelector}.
	 * <p>
	 * For each selected channel the record is added to that channel's serializer.
	 * As long as the serializer reports a full buffer, the serializer's current
	 * buffer (if there is one) is written out for that channel, and a new buffer
	 * obtained from the writer's buffer provider is handed to the serializer.
	 *
	 * @param record the record to emit
	 * @throws IOException declared by this method; propagated from the underlying calls
	 * @throws InterruptedException declared by this method; propagated from the underlying calls
	 */
	public void emit(T record) throws IOException, InterruptedException {
		for (int channel : channelSelector.selectChannels(record, numChannels)) {
			final RecordSerializer<T> channelSerializer = serializers[channel];

			// All work on one channel's serializer happens under that serializer's monitor.
			synchronized (channelSerializer) {
				SerializationResult status = channelSerializer.addRecord(record);

				while (status.isFullBuffer()) {
					// Ship the buffer the serializer currently holds, if it holds one.
					final Buffer filled = channelSerializer.getCurrentBuffer();
					if (filled != null) {
						writeBuffer(filled, channel, channelSerializer);
					}

					// Give the serializer a new buffer and let it continue with the record.
					final Buffer fresh = writer.getBufferProvider().requestBufferBlocking();
					status = channelSerializer.setNextBuffer(fresh);
				}
			}
		}
	}

	/**
	 * This is used to broadcast Streaming Watermarks in-band with records. This ignores
	 * the {@link ChannelSelector}.
	 */
	public void broadcastEmit(T record) throws IOException, InterruptedException {
		for (int targetChannel = 0; targetChannel < numChannels; targetChannel++) {
			// serialize with corresponding serializer and send full buffer
			RecordSerializer<T> serializer = serializers[targetChannel];

			synchronized (serializer) {
				SerializationResult result = serializer.addRecord(record);
				while (result.isFullBuffer()) {
					Buffer buffer = serializer.getCurrentBuffer();

					if (buffer != null) {
						writeBuffer(buffer, targetChannel, serializer);
					}

					buffer = writer.getBufferProvide