Block、Replica、BlocksMap
Block类
Block类用来唯一标识Namenode中的数据块,是HDFS数据块最基本的抽象接口。
Block类实现了Writable接口,可以进行序列化。Block类还实现了Comparable接口,按照blockId大小排序。
Block类定义了三个字段。
- blockId(唯一标识了这个block对象)
- numBytes(数据块大小,单位是字节)
- generationStamp(数据块的时间戳)
// Unique identifier of this block object.
private volatile long blockId;
// Size of the block, in bytes.
private volatile long numBytes;
// Generation stamp of the block.
private volatile long generationStamp;
BlockInfo类
BlockInfo类扩展自Block类。是Block类的补充说明。BlockInfo类定义了bc字段保存该数据块归属于哪一个HDFS文件,bc字段是BlockCollection类型的,记录了该HDFS文件的INode对象的引用(INode是BlockCollection的子类)。BlockInfo类还定义了triplets字段保存这个Block的副本存储在哪些数据节点上,triplets是一个Object类型的数组,这个字段非常重要。
BlockManager
BlockManager在activate()时会启动多个后台线程(见下面的代码),其中pendingReplications和replicationThread这两个线程共同完成了BlockManager的核心逻辑。
/**
 * Brings the block manager's background services online.
 *
 * <p>In order: starts {@code pendingReplications}, activates the datanode
 * manager, starts the replication thread, names and starts the storage-info
 * defragmenter thread, creates and starts the block-report processing thread
 * from two configuration values, activates the replica cache manager, and
 * registers this object as the "BlockStats" MBean of "NameNode".
 *
 * @param conf configuration passed to the datanode manager and used to read
 *             the block-report thread's lock-hold limit and queue size
 */
public void activate(Configuration conf) {
  pendingReplications.start();
  datanodeManager.activate(conf);
  replicationThread.start();

  storageInfoDefragmenterThread.setName("StorageInfoMonitor");
  storageInfoDefragmenterThread.start();

  // Limits for the block-report processing thread, read from configuration.
  final long maxLockHoldMs = conf.getLong(
      DFSConfigKeys.DFS_NAMENODE_MAX_LOCK_HOLD_IN_BLOCK_REPORT_THREAD_MS_KEY,
      DFSConfigKeys.DFS_NAMENODE_MAX_LOCK_HOLD_IN_BLOCK_REPORT_THREAD_MS_DEFAULT);
  final int maxQueueSize = conf.getInt(
      DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKING_QUEUE_SIZE_IN_BLOCK_REPORT_THREAD_KEY,
      DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKING_QUEUE_SIZE_IN_BLOCK_REPORT_THREAD_DEFAULT);
  blockReportThread = new BlockReportProcessingThread(maxLockHoldMs, maxQueueSize);
  blockReportThread.start();

  replicaCacheManager.activate();
  mxBeanName = MBeans.register("NameNode", "BlockStats", this);
}
pendingReplications
这个线程会遍历pendingReplications这个集合中保存的所有数据块复制任务,将超过指定时间(5分钟)没有确认的复制请求加入超时队列PendingReplicationBlocks.timedOutItems中。
// Runnable whose loop runs the periodic pending-replication checks.
// NOTE(review): the closing brace of this class is not included in this excerpt.
class PendingReplicationMonitor implements Runnable {
@Override
public void run() {
// Keep checking for as long as the fsRunning flag stays true.
while (fsRunning) {
long period = DEFAULT_RECHECK_INTERVAL;
try {
// Run both pending-replication checks, then pause for one recheck interval.
pendingReplicationCheck();
zoneServicePendingReplicationCheck();
Thread.sleep(period);
} catch (InterruptedException ie) {
// An interrupt is only logged (at debug level); whether the loop continues
// is decided solely by the fsRunning check above.
if(LOG.isDebugEnabled()) {
LOG.debug("PendingReplicationMonitor thread is interrupted.", ie);
}
}
}
}
// Holds the bookkeeping for replication requests that are still pending.
// NOTE(review): only the field declarations are shown; the rest of the class
// is elided by the "..." line below.
public class PendingReplicationBlocks {
// Shares the logger of BlockManager.
private static final Logger LOG = BlockManager.LOG;
// Pending replication entry for each block.
private final Map<BlockInfo, PendingBlockInfo> pendingReplications;
// NOTE(review): presumably the pending cross-IDC replication entry per block — confirm.
private final Map<BlockInfo, PendingCrossIDCBlockInfo> pendingCrossIDCReplications;
// NOTE(review): presumably the pending zone-service replication entries — confirm.
private final List<PendingZoneServiceBlockInfo> pendingZoneServiceBlockInfoList;
// Blocks whose replication requests have timed out.
private final ArrayList<BlockInfo> timedOutItems;
...
}
replicationThread
这个线程的实现类是BlockManager中的内部类ReplicationMonitor,ReplicationMonitor是一个线程类。ReplicationMonitor线程会周期性地调用computeDatanodeWork()方法触发数据块的复制和删除任务。然后调用processPendingReplications()方法将PendingReplicationBlocks.timedOutItems超时队列中保存的超时任务重新加回neededReplications队列中。
/**
 * Background task that, while the namesystem is running, periodically
 * computes datanode work, re-queues timed-out pending replications and
 * rescans postponed mis-replicated blocks.
 */
private class ReplicationMonitor implements Runnable {
  @Override
  public void run() {
    while (namesystem.isRunning()) {
      try {
        // Replication work is processed only while the namesystem reports
        // that the replication queues are being populated.
        if (namesystem.isPopulatingReplQueues()) {
          // Schedule block replication and deletion work.
          computeDatanodeWork();
          // Move timed-out replication tasks back into neededReplications.
          processPendingReplications();
          rescanPostponedMisreplicatedBlocks();
        }
        Thread.sleep(replicationRecheckInterval);
      } catch (Throwable failure) {
        final boolean interrupted = failure instanceof InterruptedException;

        // Namesystem is shutting down: leave the loop quietly, logging the
        // cause only when it is something other than an interrupt.
        if (!namesystem.isRunning()) {
          LOG.info("Stopping ReplicationMonitor.");
          if (!interrupted) {
            LOG.info("ReplicationMonitor received an exception"
                + " while shutting down.", failure);
          }
          break;
        }

        // Interrupted while the running-check is disabled: also just stop.
        if (interrupted && !checkNSRunning) {
          LOG.info("Stopping ReplicationMonitor for testing.");
          break;
        }

        // Anything else is treated as fatal.
        LOG.error("ReplicationMonitor thread received Runtime exception. ",
            failure);
        terminate(1, failure);
      }
    }
  }
}
computeDatanodeWork()
computeDatanodeWork() 方法执行了两项操作。
- 复制操作:从等待复制的数据块中选出若干个数据块进行复制操作,然后为这些数据块的复制操作选出source源节点和target目标节点,最后构造复制指令并在下次心跳的时候将复制指令带回给源节点以执行复制操作。
- 删除操作:从等待删除的数据块副本中选出若干个副本,然后构造删除指令并在下次心跳的时候将删除指令带回给目标节点以执行副本的删除操作。
/**
 * Computes one round of replication work followed by invalidation work.
 *
 * @return the value returned by {@code computeReplicationWork} plus the
 *         value returned by {@code computeInvalidateWork}; 0 when the
 *         namesystem is in safe mode
 */
int computeDatanodeWork() {
  // Blocks must not be replicated or removed while in safe mode. Checking
  // without holding the lock is acceptable: in the worst case some extra
  // replications get scheduled and are fixed up later.
  if (namesystem.isInSafeMode()) {
    return 0;
  }

  // Both budgets scale with the number of live datanodes.
  final int liveNodes = heartbeatManager.getLiveDatanodeCount();
  final int replicationBudget = liveNodes * this.blocksReplWorkMultiplier;
  final int invalidationBudget =
      (int) Math.ceil(liveNodes * this.blocksInvalidateWorkPct);

  // computeReplicationWork() determines the replicas that need to be copied.
  int totalWork = this.computeReplicationWork(replicationBudget);

  // Refresh the counters under the write lock.
  namesystem.getBMLock().writeLock(OpName.COMPUTE_DATANODE_WORK);
  try {
    this.updateState();
    this.scheduledReplicationBlocksCount = totalWork;
  } finally {
    namesystem.getBMLock().writeUnlock(OpName.COMPUTE_DATANODE_WORK);
  }

  // computeInvalidateWork() determines the replicas that need to be deleted.
  totalWork += this.computeInvalidateWork(invalidationBudget);
  return totalWork;
}
computeReplicationWork()
选择需要冗余复制的数据块 -> 为数据块选择源节点 -> 为数据块选择目标节点 -> 生成复制指令
/**
 * Chooses under-replicated blocks and hands them on for replication scheduling.
 *
 * @param blocksToProcess limit forwarded to
 *                        {@code neededReplications.chooseUnderReplicatedBlocks}
 * @return the result of {@code computeReplicationWorkForBlocks} for the
 *         chosen blocks
 */
int computeReplicationWork(int blocksToProcess) {
  final List<List<BlockInfo>> candidates;

  namesystem.getBMLock().writeLock(OpName.COMPUTE_REPLICATION_WORK);
  try {
    // Only the selection of blocks happens under the write lock.
    candidates = neededReplications.chooseUnderReplicatedBlocks(blocksToProcess);
  } finally {
    namesystem.getBMLock().writeUnlock(OpName.COMPUTE_REPLICATION_WORK);
  }

  // The lock has been released before the chosen blocks are processed.
  return computeReplicationWorkForBlocks(candidates);
}
选择源节点
// Source-node selection: for every candidate block, validate it, pick a source
// datanode and record a ReplicationWork item.
// NOTE(review): bc, requiredReplication, containingNodes, srcNode,
// numEffectiveReplicas, additionalReplRequired and work are declared outside this excerpt.
// The index into blocksToReplicate is used as the block's priority.
for (int priority = 0; priority < blocksToReplicate.size(); priority++) {
for (BlockInfo block : blocksToReplicate.get(priority)) {
// block should belong to a file
bc = blocksMap.getBlockCollection(block);
// abandoned block or block reopened for append
if(bc == null) {
neededReplications.remove(block, priority);
// remove from neededReplications
continue;
}
// The remaining checks and the work creation run while synchronized on bc.
synchronized (bc) {
// Re-check: drop the block if it no longer has a block collection, or if it is
// the last block of a collection that is under construction.
if (blocksMap.getBlockCollection(block) == null || (bc.isUnderConstruction() && block.equals(bc.getLastBlock()))) {
neededReplications.remove(block, priority);
// remove from neededReplications
continue;
}
// Replication factor of the block
requiredReplication = bc.getBlockReplication();
// get a source data-node
containingNodes = new ArrayList<DatanodeDescriptor>();
List<DatanodeStorageInfo> liveReplicaNodes = new ArrayList<DatanodeStorageInfo>();
Map<String, DatanodeDescriptor> idcSourceDatanodeMap = new HashMap<>();
NumberReplicas numReplicas = new NumberReplicas();
srcNode = chooseSourceDatanode(
block, containingNodes, liveReplicaNodes, idcSourceDatanodeMap, numReplicas,
priority);
if(srcNode == null) {
// block can not be replicated from any node
LOG.debug("Block " + block + " cannot be repl from any node");
continue;
}
// liveReplicaNodes can include READ_ONLY_SHARED replicas which are
// not included in the numReplicas.liveReplicas() count
assert liveReplicaNodes.size() >= numReplicas.liveReplicas();
// do not schedule more if enough replicas is already pending
// Effective replicas = live replicas + replicas already pending for this block.
numEffectiveReplicas = numReplicas.liveReplicas() +
pendingReplications.getNumReplicas(block);
if (numEffectiveReplicas >= requiredReplication) {
// Enough effective replicas: the block is dropped from the queue when some
// replicas are still pending or the placement policy is already satisfied.
if ( (pendingReplications.getNumReplicas(block) > 0) ||
(isPlacementPolicySatisfied(block)) ) {
neededReplications.remove(block, priority);
// remove from neededReplications
blockLog.info("BLOCK* Removing {} from neededReplications as" +
" it has enough replicas", block);
continue;
}
}
// If there are not enough replicas, compute how many more need to be added
if (numReplicas.liveReplicas() < requiredReplication) {
additionalReplRequired = requiredReplication
- numEffectiveReplicas;
} else {
additionalReplRequired = 1;
// Needed on a new rack
}
// Decide whether this work item is a cold-storage one: either the file is in
// cold storage or the chosen source node is a cold-storage datanode.
// NOTE(review): bc was already checked non-null above, so the bc != null test
// below looks redundant; the boxed Boolean could also be a primitive — confirm.
Boolean isWithinCold = false;
if (fsNamesystem != null && bc != null && (bc instanceof INodeFile)) {
isWithinCold = fsNamesystem.getFSDirectory().isInColdStorage((INodeFile) bc);
}
if (!isWithinCold && datanodeManager.isColdStorageDataNode(srcNode)) {
isWithinCold = true;
}
// Add a replication task to the work list; the task is carried to the Datanode on the next heartbeat.
work.add(new ReplicationWork(block, bc, srcNode,
containingNodes, liveReplicaNodes, idcSourceDatanodeMap,
additionalReplRequired, priority, isWithinCold));
}
}
}
选择目标节点
// Target-node selection: choose replication targets for every work item.
// NOTE(review): work, blockplacement and storagePolicySuite are declared outside this excerpt.
// A single set is reused for all work items and cleared at the start of each iteration.
final Set<Node> excludedNodes = new HashSet<Node>();
for (ReplicationWork rw : work){
// Exclude all of the containing nodes from being targets.
// This list includes decommissioning or corrupt nodes.
excludedNodes.clear();
for (DatanodeDescriptor dn : rw.containingNodes) {
excludedNodes.add(dn);
}
// Cold-storage work items get the cold-storage topology installed on the
// placement policy; for all others any such topology is removed first.
if (rw.isWithinCold) {
blockplacement.setColdStorageNetworkTopology(datanodeManager.getColdStorageNetworkTopology());
} else {
blockplacement.removeColdStorageNetworkTopology();
}
// choose replication targets: NOT HOLDING THE GLOBAL LOCK
// It is costly to extract the filename for which chooseTargets is called,
// so for now we pass in the block collection itself.
rw.chooseTargets(blockplacement, storagePolicySuite, excludedNodes);
// The cold-storage topology is always removed again after the targets are chosen.
blockplacement.removeColdStorageNetworkTopology();
}
进行复制操作
// NOTE(review): rw, block, targets, priority, numEffectiveReplicas,
// requiredReplication and scheduledWork are declared outside this excerpt.
// Add the block to the source Datanode's replication queue; the replication command is issued on the next heartbeat
rw.srcNode.addBlockToBeReplicated(block, targets);
// remove from neededReplications
// Only removed when the effective replicas plus the newly chosen targets reach the required replication.
if(numEffectiveReplicas + targets.length >= requiredReplication) {
neededReplications.remove(block, priority);
// remove from neededReplications
}
// Move the block-replication into a "pending" state.
// The reason we use 'pending' is so we can retry
// replications that fail after an appropriate amount of time.
pendingReplications.increment(block, targets);
scheduledWork++;