NameNode启动时会启动一个心跳检测线程HeartbeatMonitor,该线程会周期性地检测数据节点的状态;如果数据节点在指定时间内未发送心跳信号,则认为该节点已死,NameNode会进行块复制以维护文件的副本数。
/**
 * Runnable that, while {@code fsRunning} is true, periodically triggers
 * {@code heartbeatCheck()} and (when access tokens are enabled)
 * {@code updateAccessKey()}, pausing 5 seconds between passes.
 */
class HeartbeatMonitor implements Runnable {
  /** Timestamp (as returned by {@code now()}) of the last heartbeat check. */
  private long lastHeartbeatCheck;
  /** Timestamp (as returned by {@code now()}) of the last access-key update. */
  private long lastAccessKeyUpdate;

  /**
   * Main loop: run whatever periodic work is due, log any exception it
   * raises, then sleep before the next pass.
   */
  public void run() {
    while (fsRunning) {
      try {
        runDueTasks(now());
      } catch (Exception e) {
        // A failure in one pass is logged and the loop keeps going.
        FSNamesystem.LOG.error(StringUtils.stringifyException(e));
      }
      pause();
    }
  }

  /**
   * Runs each periodic task whose interval has elapsed and records the
   * time it ran.
   */
  private void runDueTasks(long currentTime) {
    // The check is due once the current time is strictly past
    // (last check + recheck interval). Per the original note the default
    // interval is 300000 ms (5 minutes); the value is configured elsewhere.
    boolean heartbeatCheckDue =
        currentTime > lastHeartbeatCheck + heartbeatRecheckInterval;
    if (heartbeatCheckDue) {
      heartbeatCheck();
      lastHeartbeatCheck = currentTime;
    }

    if (isAccessTokenEnabled
        && (currentTime > lastAccessKeyUpdate + accessKeyUpdateInterval)) {
      updateAccessKey();
      lastAccessKeyUpdate = currentTime;
    }
  }

  /** Sleeps 5 seconds between passes. */
  private void pause() {
    try {
      Thread.sleep(5000);
    } catch (InterruptedException ie) {
      // The interrupt is ignored here; run() then re-evaluates fsRunning.
    }
  }
}
下面看下heartbeatCheck的检测流程:
void heartbeatCheck() {
//如果是安全模式,直接返回
if (isInSafeMode()) {
// not to check dead nodes if in safemode
return;
}
boolean allAlive = false;
while (!allAlive) {
boolean foundDead = false;
DatanodeID nodeID = null;
// locate the first dead node.
synchronized(heartbeats) {
for (Iterator<DatanodeDescriptor> it = heartbeats.iterator();
it.hasNext();) {
DatanodeDescriptor nodeInfo =it.next();
//检测规则为当前时间减去检测间隔,如果大于上次检测时间,则宣布死亡,一次只删除一个
//死亡节点
if (isDatanodeDead(nodeInfo)) {
foundDead = true;
nodeID = nodeInfo;
break;
}
}
}
// 在移除死亡节点时会锁住整个fsnamesystem
if (foundDead) {
synchronized (this) {
synchronized(heartbeats) {
synchronized (datanodeMap) {
DatanodeDescriptor nodeInfo = null;
try {
nodeInfo = getDatanode(nodeID);
} catch (IOException e) {
nodeInfo = null;
}
if (nodeInfo != null && isDatanodeDead(nodeInfo)) {
NameNode.stateChangeLog.info("BLOCK* NameSystem.heartbeatCheck:
"+ "lost heartbeat from " + nodeInfo.getName());
//将该节点在heartbeats中删除,heartbeats为ArrayList
removeDatanode(nodeInfo);
}
}
}
}
}
allAlive = !foundDead;
}
}