HDFS-RAID was released as an add-on module for Hadoop-0.22.0 and is only compatible with that release; oddly, the Hadoop team did not continue maintaining HDFS-RAID afterwards. To use the HDFS-RAID functionality on the latest 2.2.0 release, quite a few code changes are required, which break down into the following three points:
First: rewrite the RaidBlockSender class.
RaidBlockSender is HDFS-RAID's internal block sender: after a corrupt block has been recovered via erasure coding (EC), the recovered block must be sent to the target DataNode. The gap between 0.22.0 and 2.2.0 is large, and Hadoop gained many new features in between, most notably HA and Federation, so a lot of the low-level code changed. The two changes that matter here are that the old Block has been extended into ExtendedBlock, and that the old Writable serialization mechanism has given way to Google's Protobuf. The former is essentially just additional fields; the latter means the BlockSender implementation is quite different. Since HDFS-RAID's original RaidBlockSender was written by imitating Hadoop's own BlockSender, porting to 2.2.0 means rewriting RaidBlockSender after the 2.2.0 BlockSender. A short sketch of the Block-to-ExtendedBlock change is given below, followed by the rewritten class.
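First, a minimal sketch of the Block-to-ExtendedBlock change, assuming a recovered block and a block-pool ID obtained elsewhere. The helper method and the literal values here are made up for illustration; only the Block/ExtendedBlock constructors and getters are the actual Hadoop 2.2.0 API:

import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;

public class ExtendedBlockSketch {
    // Hypothetical helper: 0.22.0-era code passed a plain Block around,
    // while the 2.2.0 data-transfer and RPC calls expect an ExtendedBlock,
    // i.e. the same (blockId, numBytes, generationStamp) triple plus the
    // block-pool ID that Federation introduced.
    static ExtendedBlock toExtendedBlock(String blockPoolId, Block recovered) {
        return new ExtendedBlock(blockPoolId, recovered);
    }

    public static void main(String[] args) {
        // Values chosen only for the demo.
        Block recovered = new Block(123456789L, 64 * 1024 * 1024L, 1001L);
        ExtendedBlock eb = toExtendedBlock("BP-1-127.0.0.1-1400000000000", recovered);
        System.out.println(eb.getBlockPoolId() + " / " + eb.getLocalBlock());
    }
}

The rewritten class, RaidBlockSender2, modeled on the 2.2.0 BlockSender, is shown below: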
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ReadaheadPool.ReadaheadRequest;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.net.SocketOutputStream;
import org.apache.hadoop.util.DataChecksum;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
/**
* In order to port HDFS-RAID from Hadoop-0.22.0 to Hadoop-2.2.0,
* we need to rewrite the class {@link RaidBlockSender}, changing
* the way a block is transferred to another DataNode.
* As a result this class differs substantially from the original
* RaidBlockSender; after analyzing how {@link BlockSender} works
* in 2.2.0, we adapted its code to our requirement.
*
* @author Deng Changchun
* @version 2
* @since 2014-05-24
*/
public class RaidBlockSender2 implements java.io.Closeable {
static final Log LOG = DataNode.LOG;
static final Log ClientTraceLog = DataNode.ClientTraceLog;
/**
* Minimum buffer used while sending data to clients. Used only if
* transferTo() is enabled. 64KB is not that large. It could be larger, but
* not sure if there will be much more improvement.
*/
private static final int MIN_BUFFER_WITH_TRANSFERTO = 64 * 1024;
private static final int TRANSFERTO_BUFFER_SIZE = Math.max(
HdfsConstants.IO_FILE_BUFFER_SIZE, MIN_BUFFER_WITH_TRANSFERTO);
/** the block to read from */
private final ExtendedBlock block;
/** Stream to read block data from */
private InputStream blockIn;
/** updated while using transferTo() */
private long blockInPosition = -1;
/** Stream to read checksum */
private DataInputStream checksumIn;
/** Checksum utility */
private final DataChecksum checksum;
/** Initial position to read */
private long initialOffset;
/** Current position of read */
private long offset;
/** Position of last byte to read from block file */
private final long endOffset;
/** Number of bytes in chunk used for computing checksum */
private final int chunkSize;
/** Number bytes of checksum computed for a chunk */
private final int checksumSize;
/** If true, failure to read checksum is ignored */
private final boolean corruptChecksumOk;
/** Sequence number of packet being sent */
private long seqno;
/** Set to true if transferTo is allowed for sending data to the client */
private final boolean transferToAllowed;
/** Set to true once entire requested byte range has been sent to the client */
private boolean sentEntireByteRange;
/** When true, verify checksum while reading from checksum file */
private final boolean verifyChecksum;
/** Format used to print client trace log messages */
private final String clientTraceFmt;
private volatile ChunkChecksum lastChunkChecksum = null;
/** The file descriptor of the block being sent */
private FileDescriptor blockInFd;
// Cache-management related fields
private final long readaheadLength;
private ReadaheadRequest curReadahead;
private final boolean alwaysReadahead;
private final boolean dropCacheBehindLargeReads;
private final boolean dropCacheBehindAllReads;
private long lastCacheDropOffset;
@VisibleForTesting
static long CACHE_DROP_INTERVAL_BYTES = 1024 * 1024; // 1MB
/**
* See {@link BlockSender#isLongRead()}
*/
private static final long LONG_READ_THRESHOLD_BYTES = 256 * 1024;
/**
* Constructor
*
* @param block block that is being read
* @param blockLength length of the block
* @param startOffset starting offset to read from
* @param length length of data to read
* @param corruptChecksumOk if true, a failure to read the checksum is ignored
* @param verifyChecksum verify checksum while reading the data
* @param sendChecksum send checksum to client
* @param transferToAllowed whether transferTo() may be used to send the data
* @param metadataIn stream to read the block metadata (checksums) from
* @param streamFactory factory used to open an input stream on the block data
* @param clientTraceFmt format string used to print client trace logs
* @param cachingStrategy readahead / drop-behind caching strategy
* @throws IOException
*/
public RaidBlockSender2(ExtendedBlock block, long blockLength,
long startOffset, long length,
boolean corruptChecksumOk, boolean verifyChecksum,
boolean sendChecksum, boolean transferToAllowed,
DataInputStream metadataIn, InputStreamFactory streamFactory,
String clientTraceFmt, CachingStrategy cachingStrategy)
throws IOException {
try {
this.block = block;
this.corruptChecksumOk = corruptChecksumOk;
this.verifyChecksum = verifyChecksum;
this.clientTraceFmt = clientTraceFmt;
/*
* If the client asked for the cache to be dropped behind all reads,
* we honor that. Otherwise, we use the DataNode defaults. When
* using DataNode defaults, we use a heuristic where we only drop
* the cache for large reads.
*/
if (cachingStrategy.getDropBehind() == null) {
this.dropCacheBehindAllReads = false;
this.dropCacheBehindLargeReads = DFSConfigKeys.DFS_DATANODE_DROP_CACHE_BEHIND_READS_DEFAULT;
} else {
this.dropCacheBehindAllReads = this.dropCacheBehindLargeReads = cachingStrategy
.getDropBehind().booleanValue();
}
/*
* Similarly, if readahead was explicitly requested, we always do
* it. Otherwise, we read ahead based on the DataNode settings, and
* only when the reads are large.
*/
if (cachingStrategy.getReadahead() == null) {
this.alwaysReadahead = false;
this.readaheadLength = DFSConfigKeys.DFS_DATANODE_READAHEAD_BYTES_DEFAULT;
} else {
this.alwaysReadahead = true;
this.readaheadLength = cachingStrategy.getReadahead()
.longValue();
}
if (verifyChecksum) {
// To simplify implementation, callers may not specify
// verification without sending.
Preconditions.checkArgument(sendChecksum,
"If verifying checksum, currently must also send it.");
}
final Replica replica;
final long replicaVisibleLength = blockLength;
// transferToFully() fails on 32 bit platforms for block sizes >=
// 2GB,
// use normal transfe