HDFS-RAID was released as an add-on module for Hadoop-0.22.0 and is only compatible with that release; oddly, the Hadoop team did not continue maintaining HDFS-RAID afterwards. To use the HDFS-RAID functionality on the latest 2.2.0 release, quite a few code changes are required, which break down into the following three points:
First: rewrite the RaidBlockSender class.
RaidBlockSender is HDFS-RAID's internal block sender: after a corrupt block has been recovered via erasure coding (EC), the recovered block must be sent to the target DataNode. The gap between 0.22.0 and 2.2.0 is large, and Hadoop gained many new features in between, most notably HA and Federation, so a lot of the low-level code changed. The two changes that matter here are that the old Block has been extended into ExtendedBlock, and that the old Writable serialization mechanism has given way to Google's Protobuf. The former is essentially just additional fields; the latter means the BlockSender implementation is quite different. Since HDFS-RAID's original RaidBlockSender was written by imitating Hadoop's own BlockSender, porting to 2.2.0 means rewriting RaidBlockSender after the 2.2.0 BlockSender. A short sketch of the Block-to-ExtendedBlock change is given below, followed by the rewritten class.
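First, a minimal sketch of the Block-to-ExtendedBlock change, assuming a recovered block and a block-pool ID obtained elsewhere. The helper method and the literal values here are made up for illustration; only the Block/ExtendedBlock constructors and getters are the actual Hadoop 2.2.0 API:

import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;

public class ExtendedBlockSketch {
    // Hypothetical helper: 0.22.0-era code passed a plain Block around,
    // while the 2.2.0 data-transfer and RPC calls expect an ExtendedBlock,
    // i.e. the same (blockId, numBytes, generationStamp) triple plus the
    // block-pool ID that Federation introduced.
    static ExtendedBlock toExtendedBlock(String blockPoolId, Block recovered) {
        return new ExtendedBlock(blockPoolId, recovered);
    }

    public static void main(String[] args) {
        // Values chosen only for the demo.
        Block recovered = new Block(123456789L, 64 * 1024 * 1024L, 1001L);
        ExtendedBlock eb = toExtendedBlock("BP-1-127.0.0.1-1400000000000", recovered);
        System.out.println(eb.getBlockPoolId() + " / " + eb.getLocalBlock());
    }
}

The rewritten class, RaidBlockSender2, modeled on the 2.2.0 BlockSender, is shown below: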
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ReadaheadPool.ReadaheadRequest;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.net.SocketOutputStream;
import org.apache.hadoop.util.DataChecksum;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
/**
* In order to port HDFS-RAID from Hadoop-0.22.0 to Hadoop-2.2.0,
* we need to rewrite the class {@link RaidBlockSender}, changing
* the way a block is transferred to another DataNode.
* As a result this class differs substantially from the original
* RaidBlockSender; after analyzing how {@link BlockSender} works
* in 2.2.0, we adapted its code to our requirement.
*
* @author Deng Changchun
* @version 2
* @since 2014-05-24
*/
public class RaidBlockSender2 implements java.io.Closeable {
static final Log LOG = DataNode.LOG;
static final Log ClientTraceLog = DataNode.ClientTraceLog;
/**
* Minimum buffer used while sending data to clients. Used only if
* transferTo() is enabled. 64KB is not that large. It could be larger, but
* not sure if there will be much more improvement.
*/
private static final int MIN_BUFFER_WITH_TRANSFERTO = 64 * 1024;
private static final int TRANSFERTO_BUFFER_SIZE = Math.max(
HdfsConstants.IO_FILE_BUFFER_SIZE, MIN_BUFFER_WITH_TRANSFERTO);
/** the block to read from */
private final ExtendedBlock block;
/** Stream to read block data from */
private InputStream blockIn;
/** updated while using transferTo() */
private long blockInPosition = -1;
/** Stream to read checksum */
private DataInputStream checksumIn;
/** Checksum utility */
private final DataChecksum checksum;
/** Initial position to read */
private long initialOffset;
/** Current position of read */
private long offset;
/** Position of last byte to read from block file */
private final long endOffset;
/** Number of bytes in chunk used for computing checksum */
private final int chunkSize;
/** Number bytes of checksum computed for a chunk */
private final int checksumSize;
/** If true, failure to read checksum is ignored */
private final boolean corruptChecksumOk;
/** Sequence number of packet being sent */
private long seqno;
/** Set to true if transferTo is allowed for sending data to the client */
private final boolean transferToAllowed;
/** Set to true once entire requested byte range has been sent to the client */
private boolean sentEntireByteRange;
/** When true, verify checksum while reading from checksum file */
private final boolean verifyChecksum;
/** Format used to print client trace log messages */
private final String clientTraceFmt;
private volatile ChunkChecksum lastChunkChecksum = null;
/** The file descriptor of the block being sent */
private FileDescriptor blockInFd;
// Cache-management related fields
private final long readaheadLength;
private ReadaheadRequest curReadahead;
private final boolean alwaysReadahead;
private final boolean dropCacheBehindLargeReads;
private final boolean dropCacheBehindAllReads;
private long lastCacheDropOffset;
@VisibleForTesting
static long CACHE_DROP_INTERVAL_BYTES = 1024 * 1024; // 1MB
/**
* See {@link BlockSender#isLongRead()}
*/
private static final long LONG_READ_THRESHOLD_BYTES = 256 * 1024;
/**
* Constructor
*
* @param block block that is being read
* @param blockLength length of the block
* @param startOffset starting offset to read from
* @param length length of data to read
* @param corruptChecksumOk if true, a failure to read the checksum is ignored
* @param verifyChecksum verify checksum while reading the data
* @param sendChecksum send checksum to client
* @param transferToAllowed whether transferTo() may be used to send the data
* @param metadataIn stream to read the block metadata (checksums) from
* @param streamFactory factory used to open an input stream on the block data
* @param clientTraceFmt format string used to print client trace logs
* @param cachingStrategy readahead / drop-behind caching strategy
* @throws IOException
*/
public RaidBlockSender2(ExtendedBlock block, long blockLength,
long startOffset, long length,
boolean corruptChecksumOk, boolean verifyChecksum,
boolean sendChecksum, boolean transferToAllowed,
DataInputStream metadataIn, InputStreamFactory streamFactory,
String clientTraceFmt, CachingStrategy cachingStrategy)
throws IOException {
try {
this.block = block;
this.corruptChecksumOk = corruptChecksumOk;
this.verifyChecksum = verifyChecksum;
this.clientTraceFmt = clientTraceFmt;
/*
* If the client asked for the cache to be dropped behind all reads,
* we honor that. Otherwise, we use the DataNode defaults. When
* using DataNode defaults, we use a heuristic where we only drop
* the cache for large reads.
*/
if (cachingStrategy.getDropBehind() == null) {
this.dropCacheBehindAllReads = false;
this.dropCacheBehindLargeReads = DFSConfigKeys.DFS_DATANODE_DROP_CACHE_BEHIND_READS_DEFAULT;
} else {
this.dropCacheBehindAllReads = this.dropCacheBehindLargeReads = cachingStrategy
.getDropBehind().booleanValue();
}
/*
* Similarly, if readahead was explicitly requested, we always do
* it. Otherwise, we read ahead based on the DataNode settings, and
* only when the reads are large.
*/
if (cachingStrategy.getReadahead() == null) {
this.alwaysReadahead = false;
this.readaheadLength = DFSConfigKeys.DFS_DATANODE_READAHEAD_BYTES_DEFAULT;
} else {
this.alwaysReadahead = true;
this.readaheadLength = cachingStrategy.getReadahead()
.longValue();
}
if (verifyChecksum) {
// To simplify implementation, callers may not specify
// verification without sending.
Preconditions.checkArgument(sendChecksum,
"If verifying checksum, currently must also send it.");
}
final Replica replica;
final long replicaVisibleLength = blockLength;
// transferToFully() fails on 32 bit platforms for block sizes >=
// 2GB,
// use normal transfe