Entry Point
- Hadoop branch: Hadoop-2.6.0
- Usage example:
hadoop dfs -checksum /tmp/README.txt
- Result:
/tmp/README.txt
MD5-of-0MD5-of-512CRC32C
00000200000000000000000017970719be16d1071635fa381b95f957
Algorithm name:
"MD5-of-" + crcPerBlock + "MD5-of-" + bytesPerCRC + getCrcType().name()
crcPerBlock: the number of CRC chunks per block. It is only filled in when the file is split into more than one block (see the locatedblocks.size() > 1 check in getFileChecksum() below), which is why the single-block example above shows 0. With the default 128 MB block size it would be 134217728 / 512 = 262144.
bytesPerCRC: a CRC is computed over every 512 bytes of data.
CRC32C: the Castagnoli variant of the CRC32 algorithm.
Result breakdown:
00000200 00000000 00000000 17970719 be16d107 1635fa38 1b95f957
The result is 28 bytes:
bytesPerCRC is an int, occupying the first 4 bytes; its value is 512 (0x00000200);
crcPerBlock is a long, occupying the next 8 bytes; its value is 0;
the MD5 digest occupies the final 16 bytes.
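The layout is easy to verify by decoding those 28 bytes by hand. A minimal, self-contained sketch (plain JDK, not Hadoop code; the hex string is the output above):

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;

public class ChecksumDecode {
  public static void main(String[] args) throws Exception {
    String hex = "00000200000000000000000017970719be16d1071635fa381b95f957";
    byte[] bytes = new byte[hex.length() / 2];
    for (int i = 0; i < bytes.length; i++) {
      bytes[i] = (byte) Integer.parseInt(hex.substring(2 * i, 2 * i + 2), 16);
    }
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes));
    System.out.println("bytesPerCRC = " + in.readInt());  // first 4 bytes  -> 512
    System.out.println("crcPerBlock = " + in.readLong()); // next 8 bytes   -> 0
    byte[] md5 = new byte[16];                            // final 16 bytes -> file MD5
    in.readFully(md5);
  }
}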
// TODO: the CRC32 checksum algorithm itself
HDFS On-Disk Storage Format
-rw-r--r-- 1 wankun wheel 1366 8 3 11:52 blk_1073742101
-rw-r--r-- 1 wankun wheel 19 8 3 11:52 blk_1073742101_1277.meta
The meta file's header occupies 7 bytes; the remaining 12 bytes are 3 chunk checksums of 4 bytes each, one checksum for every 512 bytes of the block file (1366 bytes -> 3 chunks).
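The sizes line up; a quick sketch of the arithmetic (512 bytes per checksum and the 7-byte header are the defaults described above):

public class MetaSize {
  public static void main(String[] args) {
    long blockLen = 1366;        // size of blk_1073742101
    int bytesPerChecksum = 512;  // default dfs.bytes-per-checksum
    int checksumSize = 4;        // CRC32/CRC32C checksums are 4 bytes each
    int headerSize = 2 + 1 + 4;  // version(short) + type(byte) + bytesPerChecksum(int)
    long chunks = (blockLen + bytesPerChecksum - 1) / bytesPerChecksum; // ceil -> 3
    System.out.println(headerSize + chunks * checksumSize); // 19, the meta file size
  }
}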
- Corresponding entry point:
org.apache.hadoop.fs.FsShell
FsShell.main() -> run(); cmd is the command to execute. The command instance matching cmd is looked up in commandFactory and its instance.run() method is invoked. commandFactory is populated in FsShell.init(), which registers the commands of FsCommand via reflection.
FsCommand
public static void registerCommands(CommandFactory factory) {
factory.registerCommands(AclCommands.class);
factory.registerCommands(CopyCommands.class);
factory.registerCommands(Count.class);
factory.registerCommands(Delete.class);
factory.registerCommands(Display.class);
factory.registerCommands(Find.class);
factory.registerCommands(FsShellPermissions.class);
factory.registerCommands(FsUsage.class);
factory.registerCommands(Ls.class);
factory.registerCommands(Mkdir.class);
factory.registerCommands(MoveCommands.class);
factory.registerCommands(SetReplication.class);
factory.registerCommands(Stat.class);
factory.registerCommands(Tail.class);
factory.registerCommands(Test.class);
factory.registerCommands(Touch.class);
factory.registerCommands(SnapshotCommands.class);
factory.registerCommands(XAttrCommands.class);
}
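CommandFactory.registerCommands(Class) looks up, via reflection, a static registerCommands(CommandFactory) method on each class passed in and invokes it. A minimal sketch of that dispatch (my reading of the mechanism, not the actual CommandFactory source):

import java.lang.reflect.Method;

class CommandFactorySketch {
  // what factory.registerCommands(Display.class) boils down to
  void registerCommands(Class<?> commandGroup) throws Exception {
    Method m = commandGroup.getMethod("registerCommands", CommandFactorySketch.class);
    m.invoke(null, this); // the group then calls addClass(...) for each command name
  }
}

Display, for example, is the group that registers the checksum command: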
class Display extends FsCommand {
public static void registerCommands(CommandFactory factory) {
factory.addClass(Cat.class, "-cat");
factory.addClass(Text.class, "-text");
factory.addClass(Checksum.class, "-checksum");
}
}
Checksum -> DistributedFileSystem.getFileChecksum() -> DFSClient.getFileChecksum()
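The same checksum is available programmatically through the public FileSystem API, which is what the shell command ultimately calls (the path is the example file from above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.StringUtils;

public class GetChecksum {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // on HDFS this resolves to DistributedFileSystem.getFileChecksum()
    FileChecksum cs = fs.getFileChecksum(new Path("/tmp/README.txt"));
    System.out.println(cs.getAlgorithmName());                      // MD5-of-0MD5-of-512CRC32C
    System.out.println(StringUtils.byteToHexString(cs.getBytes())); // the 28-byte value above
  }
}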
Main checksum logic
Fetch all of the file's block locations, connect to a DataNode for each block to obtain that block's MD5 (computed on the DataNode over the block's CRC data), append the per-block MD5 bytes to a buffer, and finally run a second MD5 over that buffer to produce the file checksum.
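Conceptually this is a two-level MD5. A minimal standalone sketch of the aggregation using only the JDK (not the DFSClient code itself):

import java.io.ByteArrayOutputStream;
import java.security.MessageDigest;

public class TwoLevelMd5 {
  // fileMD5 = MD5( MD5(block1 CRCs) || MD5(block2 CRCs) || ... )
  static byte[] fileChecksum(byte[][] perBlockCrcBytes) throws Exception {
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    ByteArrayOutputStream md5out = new ByteArrayOutputStream();
    for (byte[] crcs : perBlockCrcBytes) {
      md5out.write(md5.digest(crcs)); // first level: MD5 of one block's CRC data
    }
    return md5.digest(md5out.toByteArray()); // second level: MD5 over concatenated MD5s
  }
}

The real implementation, with token refresh and retry handling, is DFSClient.getFileChecksum():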
public MD5MD5CRC32FileChecksum getFileChecksum(String src, long length)
throws IOException {
checkOpen();
Preconditions.checkArgument(length >= 0);
//get block locations for the file range
LocatedBlocks blockLocations = callGetBlockLocations(namenode, src, 0,
length);
if (null == blockLocations) {
throw new FileNotFoundException("File does not exist: " + src);
}
List<LocatedBlock> locatedblocks = blockLocations.getLocatedBlocks();
final DataOutputBuffer md5out = new DataOutputBuffer();
int bytesPerCRC = -1;
DataChecksum.Type crcType = DataChecksum.Type.DEFAULT;
long crcPerBlock = 0;
boolean refetchBlocks = false;
int lastRetriedIndex = -1;
// get block checksum for each block
long remaining = length;
if (src.contains(HdfsConstants.SEPARATOR_DOT_SNAPSHOT_DIR_SEPARATOR)) {
remaining = Math.min(length, blockLocations.getFileLength());
}
// walk the file's locatedblocks list, obtaining each block's MD5 in turn
for(int i = 0; i < locatedblocks.size() && remaining > 0; i++) {
if (refetchBlocks) { // refetch to get fresh tokens
blockLocations = callGetBlockLocations(namenode, src, 0, length);
if (null == blockLocations) {
throw new FileNotFoundException("File does not exist: " + src);
}
locatedblocks = blockLocations.getLocatedBlocks();
refetchBlocks = false;
}
LocatedBlock lb = locatedblocks.get(i);
final ExtendedBlock block = lb.getBlock();
if (remaining < block.getNumBytes()) {
block.setNumBytes(remaining);
}
remaining -= block.getNumBytes();
final DatanodeInfo[] datanodes = lb.getLocations();
//try each datanode location of the block
final int timeout = 3000 * datanodes.length + dfsClientConf.socketTimeout;
boolean done = false;
// connectToDN and fetch the MD5 the DataNode has computed for this block
for(int j = 0; !done && j < datanodes.length; j++) {
DataOutputStream out = null;
DataInputStream in = null;
try {
//connect to a datanode
IOStreamPair pair = connectToDN(datanodes[j], timeout, lb);
out = new DataOutputStream(new BufferedOutputStream(pair.out,
HdfsConstants.SMALL_BUFFER_SIZE));
in = new DataInputStream(pair.in);
if (LOG.isDebugEnabled()) {
LOG.debug("write to " + datanodes[j] + ": "
+ Op.BLOCK_CHECKSUM + ", block=" + block);
}
// get block MD5
// Sender writes the op type and its proto parameters (Op.BLOCK_CHECKSUM, proto) to the out stream
// Receiver on the DataNode side reads the op and dispatches it to DataXceiver for processing
new Sender(out).blockChecksum(block, lb.getBlockToken());
final BlockOpResponseProto reply =
BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in));
if (reply.getStatus() != Status.SUCCESS) {
if (reply.getStatus() == Status.ERROR_ACCESS_TOKEN) {
throw new InvalidBlockTokenException();
} else {
throw new IOException("Bad response " + reply + " for block "
+ block + " from datanode " + datanodes[j]);
}
}
OpBlockChecksumResponseProto checksumData =
reply.getChecksumResponse();
//read byte-per-checksum
final int bpc = checksumData.getBytesPerCrc();
if (i == 0) { //first block
bytesPerCRC = bpc;
}
else if (bpc != bytesPerCRC) {
throw new IOException("Byte-per-checksum not matched: bpc=" + bpc
+ " but bytesPerCRC=" + bytesPerCRC);
}
//read crc-per-block
final long cpb = checksumData.getCrcPerBlock();
if (locatedblocks.size() > 1 && i == 0) {
crcPerBlock = cpb;
}
//read md5
final MD5Hash md5 = new MD5Hash(
checksumData.getMd5().toByteArray());
md5.write(md5out);
// read crc-type
final DataChecksum.Type ct;
if (checksumData.hasCrcType()) {
ct = PBHelper.convert(checksumData
.getCrcType());
} else {
LOG.debug("Retrieving checksum from an earlier-version DataNode: " +
"inferring checksum by reading first byte");
ct = inferChecksumTypeByReading(lb, datanodes[j]);
}
if (i == 0) { // first block
crcType = ct;
} else if (crcType != DataChecksum.Type.MIXED
&& crcType != ct) {
// if crc types are mixed in a file
crcType = DataChecksum.Type.MIXED;
}
done = true;
if (LOG.isDebugEnabled()) {
if (i == 0) {
LOG.debug("set bytesPerCRC=" + bytesPerCRC
+ ", crcPerBlock=" + crcPerBlock);
}
LOG.debug("got reply from " + datanodes[j] + ": md5=" + md5);
}
} catch (InvalidBlockTokenException ibte) {
if (i > lastRetriedIndex) {
if (LOG.isDebugEnabled()) {
LOG.debug("Got access token error in response to OP_BLOCK_CHECKSUM "
+ "for file " + src + " for block " + block
+ " from datanode " + datanodes[j]
+ ". Will retry the block once.");
}
lastRetriedIndex = i;
done = true; // actually it's not done; but we'll retry
i--; // repeat at i-th block
refetchBlocks = true;
break;
}
} catch (IOException ie) {
LOG.warn("src=" + src + ", datanodes["+j+"]=" + datanodes[j], ie);
} finally {
IOUtils.closeStream(in);
IOUtils.closeStream(out);
}
}
if (!done) {
throw new IOException("Fail to get block MD5 for " + block);
}
}
//compute file MD5
// second-level MD5: digest the concatenated per-block MD5 bytes accumulated in md5out
final MD5Hash fileMD5 = MD5Hash.digest(md5out.getData());
switch (crcType) {
case CRC32:
return new MD5MD5CRC32GzipFileChecksum(bytesPerCRC,
crcPerBlock, fileMD5);
case CRC32C:
// CRC32C: a CRC32 variant whose computation can be hardware-accelerated by Intel's SSE4.2 CRC32 instruction
// https://issues.apache.org/jira/browse/HADOOP-7443
return new MD5MD5CRC32CastagnoliFileChecksum(bytesPerCRC,
crcPerBlock, fileMD5);
default:
// If there is no block allocated for the file,
// return one with the magic entry that matches what previous
// hdfs versions return.
if (locatedblocks.size() == 0) {
return new MD5MD5CRC32GzipFileChecksum(0, 0, fileMD5);
}
// we should never get here since the validity was checked
// when getCrcType() was called above.
return null;
}
}
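For reference, Sender.blockChecksum() and the BlockOpResponseProto.parseFrom() call above amount to a single DataTransferProtocol round trip. A summary of the framing as I read the 2.6.0 sources (not code):

client -> DN: short DATA_TRANSFER_VERSION, byte op code (Op.BLOCK_CHECKSUM = 0x55), then a varint-delimited OpBlockChecksumProto (block + token)
DN -> client: a varint-delimited BlockOpResponseProto carrying the status plus an OpBlockChecksumResponseProto (bytesPerCrc, crcPerBlock, md5, crcType)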
DataXceiver: Computing the Block Checksum
Read the contents of the block's meta file and compute an MD5 over them:
- For a whole-block checksum, the MD5 is computed directly over the entire remaining meta file content (everything after the header).
- For a partial-block checksum, calcPartialBlockChecksum() digests only the CRCs covering the requested range, recomputing the CRC of the trailing partial chunk from the block data itself (a sketch of the whole-block case follows this list).
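A standalone sketch of the whole-block case, assuming the meta file from the listing earlier (plain JDK, not the DataNode code):

import java.io.DataInputStream;
import java.io.FileInputStream;
import java.security.MessageDigest;

public class MetaMd5 {
  public static void main(String[] args) throws Exception {
    DataInputStream in = new DataInputStream(
        new FileInputStream("blk_1073742101_1277.meta"));
    in.skipBytes(7); // header: version(2) + type(1) + bytesPerChecksum(4)
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    byte[] buf = new byte[4096];
    for (int n; (n = > 0; ) {
      md5.update(buf, 0, n); // digest the chunk CRCs only
    }
    byte[] digest = md5.digest(); // the block MD5 returned to the client
    StringBuilder sb = new StringBuilder();
    for (byte b : digest) sb.append(String.format("%02x", b));
    System.out.println(sb);
    in.close();
  }
}

The DataXceiver implementation: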
@Override
public void blockChecksum(final ExtendedBlock block,
final Token<BlockTokenIdentifier> blockToken) throws IOException {
final DataOutputStream out = new DataOutputStream(
getOutputStream());
checkAccess(out, true, block, blockToken,
Op.BLOCK_CHECKSUM, BlockTokenSecretManager.AccessMode.READ);
// client side now can specify a range of the block for checksum
long requestLength = block.getNumBytes();
Preconditions.checkArgument(requestLength >= 0);
long visibleLength = datanode.data.getReplicaVisibleLength(block);
boolean partialBlk = requestLength < visibleLength;
updateCurrentThreadName("Reading metadata for block " + block);
// every block has a companion meta file; metadataIn streams that file's contents
// e.g. blk_1073742101 and blk_1073742101_1277.meta
final LengthInputStream metadataIn = datanode.data
.getMetaDataInputStream(block);
final DataInputStream checksumIn = new DataInputStream(
new BufferedInputStream(metadataIn, HdfsConstants.IO_FILE_BUFFER_SIZE));
updateCurrentThreadName("Getting checksum for block " + block);
try {
//read metadata file
// header content:
// short version;
// byte type; // CRC32 or CRC32C, both with a 4-byte checksum size
// int bpc; // bytesPerChecksum, 512 by default: each 512 bytes of data gets a 4-byte checksum
final BlockMetadataHeader header = BlockMetadataHeader
.readHeader(checksumIn);
final DataChecksum checksum = header.getChecksum();
final int csize = checksum.getChecksumSize();
final int bytesPerCRC = checksum.getBytesPerChecksum();
// (meta file length minus the 2+1+4 = 7 byte header) / checksum size (4) = number of chunk checksums
final long crcPerBlock = csize <= 0 ? 0 :
(metadataIn.getLength() - BlockMetadataHeader.getHeaderSize()) / csize;
// partial-block request: compute the MD5 over only the CRCs covering the requested range
// whole-block request: MD5 the remaining meta file content directly
final MD5Hash md5 = partialBlk && crcPerBlock > 0 ?
calcPartialBlockChecksum(block, requestLength, checksum, checksumIn)
: MD5Hash.digest(checksumIn);
if (LOG.isDebugEnabled()) {
LOG.debug("block=" + block + ", bytesPerCRC=" + bytesPerCRC
+ ", crcPerBlock=" + crcPerBlock + ", md5=" + md5);
}
//write reply
BlockOpResponseProto.newBuilder()
.setStatus(SUCCESS)
.setChecksumResponse(OpBlockChecksumResponseProto.newBuilder()
.setBytesPerCrc(bytesPerCRC)
.setCrcPerBlock(crcPerBlock)
.setMd5(ByteString.copyFrom(md5.getDigest()))
.setCrcType(PBHelper.convert(checksum.getChecksumType())))
.build()
.writeDelimitedTo(out);
out.flush();
} catch (IOException ioe) {
LOG.info("blockChecksum " + block + " received exception " + ioe);
incrDatanodeNetworkErrors();
throw ioe;
} finally {
IOUtils.closeStream(out);
IOUtils.closeStream(checksumIn);
IOUtils.closeStream(metadataIn);
}
//update metrics
datanode.metrics.addBlockChecksumOp(elapsed());
}