本文将结合hadoop2.7.0版本的源码与UML图对DataNode的初始化流程进行深入剖析,旨在更深入地理解DataNode初始化的整体逻辑
第一步:查看DataNode的入口方法main()
// Process entry point for the DataNode: print usage and exit if help was
// requested, otherwise delegate all real startup work to secureMain().
public static void main(String args[]) {
if (DFSUtil.parseHelpArgument(args, DataNode.USAGE, System.out, true)) {
System.exit(0);
}
//TODO: hand off to secureMain() to create and run the DataNode
secureMain(args, null);
}
第二、三步:调用DataNode的createDataNode()方法
// Creates the DataNode via createDataNode() and then blocks the main thread
// on join() until the DataNode shuts down. (Excerpt — tail elided.)
public static void secureMain(String args[], SecureResources resources) {
int errorCode = 0;
try {
StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
//TODO: initialize the DataNode
DataNode datanode = createDataNode(args, null, resources);
if (datanode != null) {
//TODO: block here until the DataNode terminates
datanode.join();
} else {
errorCode = 1;
}
...
}
第四步:调用instantiateDataNode()方法
// Instantiates a DataNode and, if construction succeeded, starts its
// background daemon threads before returning it to the caller.
public static DataNode createDataNode(String args[], Configuration conf,
SecureResources resources) throws IOException {
//TODO: instantiate the DataNode
DataNode dn = instantiateDataNode(args, conf, resources);
if (dn != null) {
//TODO: start the DataNode's background daemon threads
dn.runDatanodeDaemon();
}
return dn;
}
第五步:调用makeInstance()方法
// Resolves configuration/storage locations (elided) and forwards to
// makeInstance(), which performs the actual construction.
public static DataNode instantiateDataNode(String args [], Configuration conf,
SecureResources resources) throws IOException {
...
//TODO: key call — delegates construction to makeInstance()
return makeInstance(dataLocations, conf, resources);
}
第六步:通过new实例化DataNode
// Final factory step: constructs the DataNode object with the validated
// storage locations. (Excerpt — location checking elided.)
static DataNode makeInstance(Collection<StorageLocation> dataDirs,
Configuration conf, SecureResources resources) throws IOException {
...
//TODO: instantiate the DataNode via its constructor
return new DataNode(conf, locations, resources);
}
第七步:在DataNode构造方法内调用startDataNode()启动DataNode
// DataNode constructor (excerpt): resolves the hostname, then calls
// startDataNode() to bring up all services; on IOException it shuts the
// partially-started node down and rethrows.
DataNode(final Configuration conf,
final List<StorageLocation> dataDirs,
final SecureResources resources) throws IOException {
...
try {
hostName = getHostName(conf);
LOG.info("Configured hostname is " + hostName);
//TODO: start all DataNode services
startDataNode(conf, dataDirs, resources);
} catch (IOException ie) {
shutdown();
throw ie;
}
...
}
第八步:在DataNode的startDataNode()方法内主要对5个重要组件进行初始化,分别为DataXceiver、HttpServer、RpcServer、BlockPoolManager、心跳机制
// Core of DataNode startup (excerpt): wires up the key components —
// DataXceiverServer, HTTP info server, RPC server, BlockPoolManager,
// and (via refreshNamenodes) registration plus the heartbeat machinery.
void startDataNode(Configuration conf,
List<StorageLocation> dataDirs,
SecureResources resources
) throws IOException {
...
storage = new DataStorage();
// global DN settings
registerMXBean();
//TODO: initialize the DataXceiver (streaming data-transfer service)
initDataXceiver(conf);
//TODO: start the HTTP info server
startInfoServer(conf);
pauseMonitor = new JvmPauseMonitor(conf);
pauseMonitor.start();
// BlockPoolTokenSecretManager is required to create ipc server.
this.blockPoolTokenSecretManager = new BlockPoolTokenSecretManager();
// Login is done by now. Set the DN user name.
dnUserName = UserGroupInformation.getCurrentUser().getShortUserName();
LOG.info("dnUserName = " + dnUserName);
LOG.info("supergroup = " + supergroup);
//TODO: start the RPC server
initIpcServer(conf);
metrics = DataNodeMetrics.create(conf, getDisplayName());
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
//TODO: create the BlockPoolManager
// BlockPool: a non-federated cluster has exactly one BlockPool.
// Under federation there are multiple NameNodes and therefore multiple
// BlockPools — each federated namespace corresponds to one BlockPool.
// Namespace 1: hadoop1(Active) hadoop2(StandBy) (share one BlockPool)
// Namespace 2: hadoop3(Active) hadoop4(StandBy) (share one BlockPool)
blockPoolManager = new BlockPoolManager(this);
//TODO: triggers NameNode registration and the heartbeat mechanism
blockPoolManager.refreshNamenodes(conf);
...
}
第九步:先查看initDataXceiver()如何初始化DataXceiver,方法中主要是实例化DataXceiverServer,该类实现了Runnable接口
// Builds the DataXceiverServer — the service that receives block data from
// clients and from other DataNodes — and wraps it in a daemon thread.
private void initDataXceiver(Configuration conf) throws IOException {
///
//TODO: instantiate a DataXceiverServer
//TODO: this server receives data streamed from clients and other DataNodes
xserver = new DataXceiverServer(tcpPeerServer, conf, this);
// run the xceiver server as a daemon thread
this.dataXceiverServer = new Daemon(threadGroup, xserver);
this.threadGroup.setDaemon(true); // auto destroy when empty
...
}
第十、十一、十二步:调用DataNode的startInfoServer()方法启动HttpServer2服务,方法内实例化了HttpServer2并启动,对外提供访问该DataNode的http服务
// Builds and starts the embedded HttpServer2 that serves this DataNode's
// web endpoints, and registers the servlets it exposes.
private void startInfoServer(Configuration conf)
throws IOException {
...
//TODO: instantiate HttpServer2 to accept HTTP requests
HttpServer2.Builder builder = new HttpServer2.Builder()
.setName("datanode")
.setConf(conf).setACL(new AccessControlList(conf.get(DFS_ADMIN, " ")))
.addEndpoint(URI.create("http://localhost:0"))
.setFindPort(true);
this.infoServer = builder.build();
//TODO: bind several servlets onto this HTTP server
this.infoServer.addInternalServlet(null, "/streamFile/*", StreamFile.class);
this.infoServer.addInternalServlet(null, "/getFileChecksum/*",
FileChecksumServlets.GetServlet.class);
this.infoServer.setAttribute("datanode", this);
this.infoServer.setAttribute(JspHelper.CURRENT_CONF, conf);
this.infoServer.addServlet(null, "/blockScannerReport",
BlockScanner.Servlet.class);
//TODO: start the HTTP service
this.infoServer.start();
...
}
第十三、十四步:调用DataNode的initIpcServer()方法启动RPC服务,方法主要是实例化ipcServer
// Creates the DataNode's RPC server (ipcServer) bound to the configured
// IPC address, serving ClientDatanodeProtocol. (Excerpt — start elided.)
private void initIpcServer(Configuration conf) throws IOException {
InetSocketAddress ipcAddr = NetUtils.createSocketAddr(
conf.getTrimmed(DFS_DATANODE_IPC_ADDRESS_KEY));
...
//TODO: create an RPC server
ipcServer = new RPC.Builder(conf)
.setProtocol(ClientDatanodeProtocolPB.class)
.setInstance(service)
.setBindAddress(ipcAddr.getHostName())
.setPort(ipcAddr.getPort())
.setNumHandlers(
conf.getInt(DFS_DATANODE_HANDLER_COUNT_KEY,
DFS_DATANODE_HANDLER_COUNT_DEFAULT)).setVerbose(false)
.setSecretManager(blockPoolTokenSecretManager).build();
...
}
第十五步:通过new实例化BlockPoolManager。
//TODO: create the BlockPoolManager
// BlockPool: a non-federated cluster has exactly one BlockPool.
// Under federation there are multiple NameNodes and therefore multiple
// BlockPools — each federated namespace corresponds to one BlockPool.
// Namespace 1: hadoop1(Active) hadoop2(StandBy) (share one BlockPool)
// Namespace 2: hadoop3(Active) hadoop4(StandBy) (share one BlockPool)
blockPoolManager = new BlockPoolManager(this);
//TODO: triggers NameNode registration and the heartbeat mechanism
blockPoolManager.refreshNamenodes(conf);
第十六步:调用BlockPoolManager的refreshNamenodes()方法,该方法主要是往NameNode注册NameNode以及保持心跳机制
// Recomputes the NameNode address map from configuration (elided) and,
// under the refresh lock, applies it via doRefreshNamenodes().
void refreshNamenodes(Configuration conf)
throws IOException {
...
synchronized (refreshNamenodesLock) {
//TODO: apply the new nameservice->NN address map under the lock
doRefreshNamenodes(newAddressMap);
}
}
第十七、十八步:遍历所有的nameservices,调用startAll()方法启动服务
// Diffs the new nameservice map against the current one, classifying each
// nameservice as to-refresh, to-add, or to-remove; creates a BPOfferService
// for every new nameservice and finally starts them all. (Excerpt — the
// refresh/remove handling after startAll() is elided.)
private void doRefreshNamenodes(
Map<String, Map<String, InetSocketAddress>> addrMap) throws IOException {
assert Thread.holdsLock(refreshNamenodesLock);
Set<String> toRefresh = Sets.newLinkedHashSet();
Set<String> toAdd = Sets.newLinkedHashSet();
Set<String> toRemove;
synchronized (this) {
// Step 1. For each of the new nameservices, figure out whether
// it's an update of the set of NNs for an existing NS,
// or an entirely new nameservice.
/**
 * Under federation there can be multiple nameservices.
 */
for (String nameserviceId : addrMap.keySet()) {
if (bpByNameserviceId.containsKey(nameserviceId)) {
toRefresh.add(nameserviceId);
} else {
toAdd.add(nameserviceId);
}
}
// Step 2. Any nameservices we currently have but are no longer present
// need to be removed.
toRemove = Sets.newHashSet(Sets.difference(
bpByNameserviceId.keySet(), addrMap.keySet()));
assert toRefresh.size() + toAdd.size() ==
addrMap.size() :
"toAdd: " + Joiner.on(",").useForNull("<default>").join(toAdd) +
" toRemove: " + Joiner.on(",").useForNull("<default>").join(toRemove) +
" toRefresh: " + Joiner.on(",").useForNull("<default>").join(toRefresh);
// Step 3. Start new nameservices
if (!toAdd.isEmpty()) {
LOG.info("Starting BPOfferServices for nameservices: " +
Joiner.on(",").useForNull("<default>").join(toAdd));
//TODO: iterate over all new nameservices; an HA nameservice has two NameNodes
for (String nsToAdd : toAdd) {
ArrayList<InetSocketAddress> addrs =
Lists.newArrayList(addrMap.get(nsToAdd).values());
//TODO: one nameservice maps to one BPOfferService;
// each NameNode in a nameservice is one BPServiceActor,
// so under HA one BPOfferService owns two BPServiceActors
BPOfferService bpos = createBPOS(addrs);
bpByNameserviceId.put(nsToAdd, bpos);
offerServices.add(bpos);
}
}
//TODO: start all BPOfferServices
startAll();
}
第十九、二十步:在BlockPoolManager中遍历所有的BPOfferService,并分别调用其start()方法
// Starts every registered BPOfferService, running as the login user so the
// actor threads inherit the correct credentials. (Excerpt — tail elided.)
synchronized void startAll() throws IOException {
try {
UserGroupInformation.getLoginUser().doAs(
new PrivilegedExceptionAction<Object>() {
@Override
public Object run() throws Exception {
//TODO: iterate over all BPOfferServices (one per nameservice)
for (BPOfferService bpos : offerServices) {
//TODO: start this BPOfferService's actors
bpos.start();
}
return null;
}
});
...
}
第二十一步:遍历bpOfferService中的所有BPServiceActor,并调用其start()方法
// BPOfferService.start(): starts every BPServiceActor owned by this
// block-pool service (one actor per NameNode of the nameservice).
void start() {
//TODO: a BPOfferService owns multiple actors (one per NameNode)
for (BPServiceActor actor : bpServices) {
//TODO: each actor handles DataNode registration and heartbeats
actor.start();
}
}
// BPServiceActor.start(): idempotently spawns the actor's worker thread,
// whose run() performs NameNode registration and the heartbeat loop.
void start() {
if ((bpThread != null) && (bpThread.isAlive())) {
//Thread is started already
return;
}
bpThread = new Thread(this, formatThreadName());
//TODO: mark as a daemon thread
bpThread.setDaemon(true); // needed for JUnit testing
//TODO: start the thread, which invokes this actor's run() method
bpThread.start();
}
第二十二步:由于BPServiceActor实现了Runnable接口,调用其start()方法后最终会执行其run()方法。方法内主要涉及两个重要的方法调用,分别是connectToNNAndHandshake()向NameNode注册、offerService()向NameNode发送心跳
// BPServiceActor main loop (excerpt): first retries the NameNode handshake/
// registration until it succeeds (sleeping 5s between attempts), then loops
// in offerService() sending heartbeats for as long as the actor should run.
public void run() {
LOG.info(this + " starting to offer service");
try {
while (true) {
// init stuff
try {
// setup storage
//TODO: core registration call — handshake + register with the NN
connectToNNAndHandshake();
break;
} catch (IOException ioe) {
// Initial handshake, storage recovery or registration failed
runningState = RunningState.INIT_FAILED;
if (shouldRetryInit()) {
// Retry until all namenode's of BPOS failed initialization
LOG.error("Initialization failed for " + this + " "
+ ioe.getLocalizedMessage());
//TODO: sleep 5 seconds before retrying
sleepAndLogInterrupts(5000, "initializing");
} else {
runningState = RunningState.FAILED;
LOG.fatal("Initialization failed for " + this + ". Exiting. ", ioe);
return;
}
}
}
runningState = RunningState.RUNNING;
while (shouldRun()) {
try {
//TODO: heartbeat loop
offerService();
} catch (Exception ex) {
LOG.error("Exception in BPOfferService for " + this, ex);
sleepAndLogInterrupts(5000, "offering service");
}
}
...
}
第二十三步:在connectToNNAndHandshake()方法内主要做了两件事,一是获取到NameNode的代理对象,二是向NameNode注册
// Two-phase handshake with the NameNode: (1) obtain an NN proxy and fetch
// the namespace info, verifying it against the HA peer and initializing the
// local block pool if needed; (2) register this DataNode with the NN.
private void connectToNNAndHandshake() throws IOException {
// get NN proxy
//TODO: obtain the NameNode RPC proxy
bpNamenode = dn.connectToNN(nnAddr);
// First phase of the handshake with NN - get the namespace
// info.
NamespaceInfo nsInfo = retrieveNamespaceInfo();
// Verify that this matches the other NN in this HA pair.
// This also initializes our block pool in the DN if we are
// the first NN connection for this BP.
bpos.verifyAndSetNamespaceInfo(nsInfo);
// Second phase of the handshake with the NN.
//TODO: register with the NameNode
register(nsInfo);
}
第二十四步:在register()中首先创建注册信息bpRegistration,然后通过RPC调用服务端NameNodeRpcServer的registerDatanode()方法
// Builds the registration payload and retries the RPC registration call
// until it succeeds. (Excerpt — the catch/retry tail is elided.)
void register(NamespaceInfo nsInfo) throws IOException {
// The handshake() phase loaded the block pool storage
// off disk - so update the bpRegistration object from that info
//TODO: create the registration info
bpRegistration = bpos.createRegistration();
LOG.info(this + " beginning handshake with NN");
while (shouldRun()) {
try {
// Use returned registration from namenode with updated fields
//TODO: call the server-side registerDatanode method to register;
// by RPC dispatch this invokes NameNodeRpcServer.registerDatanode()
bpRegistration = bpNamenode.registerDatanode(bpRegistration);
// reaching this line means registration completed successfully
bpRegistration.setNamespaceInfo(nsInfo);
break;
...
}
第二十五步:查看NameNodeRpcServer的registerDatanode()方法,其内部调用了FSNamesystem.registerDatanode()
// NameNode-side RPC handler: validates NN startup state and the DataNode's
// software version, then delegates registration to FSNamesystem.
public DatanodeRegistration registerDatanode(DatanodeRegistration nodeReg)
throws IOException {
checkNNStartup();
verifySoftwareVersion(nodeReg);
//TODO: register the DataNode with the namesystem
namesystem.registerDatanode(nodeReg);
return nodeReg;
}
第二十六步:调用了DatanodeManager.registerDatanode()方法
// FSNamesystem: takes the namesystem write lock and forwards registration
// to the DatanodeManager, then re-evaluates safe mode.
void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
writeLock();
try {
//TODO: DatanodeManager owns all DataNode-related bookkeeping
getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
checkSafeMode();
} finally {
writeUnlock();
}
}
第二十七步:registerDatanode()主要调用了addDatanode()注册DataNode,并调用heartbeatManager.addDatanode()将注册完成的DataNode加入HeartbeatManger里面进行心跳管理
// DatanodeManager (excerpt): records the new DataNode via addDatanode() and
// enrolls it with the HeartbeatManager so liveness is tracked from now on.
public void registerDatanode(DatanodeRegistration nodeReg)
throws DisallowedDatanodeException, UnresolvedTopologyException {
InetAddress dnAddress = Server.getRemoteIp();
....
// register new datanode
//TODO: register the DataNode in the manager's data structures
addDatanode(nodeDescr);
// also treat the registration message as a heartbeat
// no need to update its timestamp
// because its is done when the descriptor is created
//TODO: add the newly registered DataNode to the HeartbeatManager
// so its heartbeats are managed from here on
heartbeatManager.addDatanode(nodeDescr);
incrementVersionCount(nodeReg.getSoftwareVersion());
startDecommissioningIfExcluded(nodeDescr);
success = true;
....
}
第二十八步:在addDatanode()中分别往各数据结构中添加datanode相关的信息
// Inserts the DataNode descriptor into the manager's in-memory structures:
// datanodeMap (keyed by UUID), the network topology, and host2DatanodeMap.
void addDatanode(final DatanodeDescriptor node) {
// To keep host2DatanodeMap consistent with datanodeMap,
// remove from host2DatanodeMap the datanodeDescriptor removed
// from datanodeMap before adding node to host2DatanodeMap.
synchronized(datanodeMap) {
//TODO: add the node to datanodeMap (evicting any stale entry)
host2DatanodeMap.remove(datanodeMap.put(node.getDatanodeUuid(), node));
}
//TODO: add an entry to the network topology structure
networktopology.add(node); // may throw InvalidTopologyException
//TODO: add an entry to the in-memory host-to-datanode map
host2DatanodeMap.add(node);
checkIfClusterIsNowMultiRack(node);
if (LOG.isDebugEnabled()) {
LOG.debug(getClass().getSimpleName() + ".addDatanode: "
+ "node " + node + " is added to datanodeMap.");
}
}
第二十九步:重新回到BPServiceActor.run()方法里面,查看offerService()方法,每3秒通过sendHeartBeat()发送一次心跳
// Heartbeat loop (excerpt): while the actor should run, sends a heartbeat
// via sendHeartBeat() whenever dnConf.heartBeatInterval has elapsed (3s by
// default) and acts on the commands the NameNode returns.
private void offerService() throws Exception {
....
//TODO: periodic loop
while (shouldRun()) {
try {
final long startTime = monotonicNow();
//TODO: a heartbeat is sent once per heartBeatInterval (3s default)
if (startTime - lastHeartbeat >= dnConf.heartBeatInterval) {
lastHeartbeat = startTime;
if (!dn.areHeartbeatsDisabledForTests()) {
// The NameNode never connects to DataNodes directly;
// the DataNode sends heartbeats to the NameNode, and the
// NameNode's heartbeat response carries commands that the
// DataNode then executes.
//TODO: send the heartbeat; the return value is the NN's response commands
HeartbeatResponse resp = sendHeartBeat();
....
} // offerService
第三十步:在sendHeartBeat()方法内通过RPC调用NameNodeRpcServer的sendHeartbeat()方法向NameNode发送心跳
// Sends one heartbeat to the NameNode through the NN proxy, reporting
// storage, cache, and transfer statistics. (Excerpt — report setup elided.)
HeartbeatResponse sendHeartBeat() throws IOException {
...
//TODO: send the heartbeat through the NameNode proxy;
// by RPC dispatch this invokes NameNodeRpcServer.sendHeartbeat()
return bpNamenode.sendHeartbeat(bpRegistration,
reports,
dn.getFSDataset().getCacheCapacity(),
dn.getFSDataset().getCacheUsed(),
dn.getXmitsInProgress(),
dn.getXceiverCount(),
numFailedVolumes,
volumeFailureSummary);
}
第三十一步:在NameNodeRpcServer的sendHeartbeat()方法内调用FSNamesystem.handleHeartbeat()处理DataNode发送过来的心跳
// NameNode-side RPC handler for heartbeats: validates the request, then
// delegates processing to FSNamesystem.handleHeartbeat().
public HeartbeatResponse sendHeartbeat(DatanodeRegistration nodeReg,
StorageReport[] report, long dnCacheCapacity, long dnCacheUsed,
int xmitsInProgress, int xceiverCount,
int failedVolumes, VolumeFailureSummary volumeFailureSummary)
throws IOException {
checkNNStartup();
verifyRequest(nodeReg);
//TODO: process the heartbeat sent by the DataNode
return namesystem.handleHeartbeat(nodeReg, report,
dnCacheCapacity, dnCacheUsed, xceiverCount, xmitsInProgress,
failedVolumes, volumeFailureSummary);
}
第三十二步:通过调用DatanodeManager.handleHeartbeat()方法将DataNode发送过来的心跳转化为DatanodeCommand
// FSNamesystem (excerpt): lets the DatanodeManager process the heartbeat
// into a set of DatanodeCommands, attaches the current HA state, and wraps
// everything in the HeartbeatResponse returned to the DataNode.
HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
StorageReport[] reports, long cacheCapacity, long cacheUsed,
int xceiverCount, int xmitsInProgress, int failedVolumes,
VolumeFailureSummary volumeFailureSummary) throws IOException {
...
//TODO: the NameNode processes the DataNode's heartbeat here
DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
xceiverCount, maxTransfer, failedVolumes, volumeFailureSummary);
//create ha status
final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
haContext.getState().getServiceState(),
getFSImage().getLastAppliedOrWrittenTxId());
//TODO: return the response to the DataNode
return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo);
...
}
第三十三步:DatanodeManager.handleHeartbeat()中首先获取已有的DataNode信息,然后调用HeartbeatManager.updateHeartbeat()方法更新心跳信息
// DatanodeManager (excerpt): looks the sender up in datanodeMap; an unknown
// or dead node is told to re-register (RegisterCommand.REGISTER), a
// disallowed node is marked dead and rejected, and a known live node has
// its heartbeat state updated via the HeartbeatManager.
public DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg,
StorageReport[] reports, final String blockPoolId,
long cacheCapacity, long cacheUsed, int xceiverCount,
int maxTransfers, int failedVolumes,
VolumeFailureSummary volumeFailureSummary) throws IOException {
synchronized (heartbeatManager) {
synchronized (datanodeMap) {
DatanodeDescriptor nodeinfo = null;
try {
//TODO: look the sender up in the existing datanodeMap;
// a hit means this DataNode registered previously, while a
// first-time sender has no entry in datanodeMap yet
nodeinfo = getDatanode(nodeReg);
} catch(UnregisteredNodeException e) {
return new DatanodeCommand[]{RegisterCommand.REGISTER};
}
// Check if this datanode should actually be shutdown instead.
if (nodeinfo != null && nodeinfo.isDisallowed()) {
setDatanodeDead(nodeinfo);
throw new DisallowedDatanodeException(nodeinfo);
}
if (nodeinfo == null || !nodeinfo.isAlive) {
return new DatanodeCommand[]{RegisterCommand.REGISTER};
}
//TODO: update the node's heartbeat bookkeeping
heartbeatManager.updateHeartbeat(nodeinfo, reports,
cacheCapacity, cacheUsed,
xceiverCount, failedVolumes,
volumeFailureSummary);
...
}
第三十四步:在HeartbeatManager.updateHeartbeat()调用DatanodeDescriptor.updateHeartbeat()方法更新心跳信息
// HeartbeatManager: refreshes the node's descriptor state, removing its old
// contribution from the aggregate stats and re-adding the updated one.
synchronized void updateHeartbeat(final DatanodeDescriptor node,
StorageReport[] reports, long cacheCapacity, long cacheUsed,
int xceiverCount, int failedVolumes,
VolumeFailureSummary volumeFailureSummary) {
stats.subtract(node);
//TODO: update the descriptor's state from the heartbeat
node.updateHeartbeat(reports, cacheCapacity, cacheUsed,
xceiverCount, failedVolumes, volumeFailureSummary);
stats.add(node);
}
第三十五步:在DatanodeDescriptor.updateHeartbeat()方法内再调用updateHeartbeatState()进行心跳更新
// DatanodeDescriptor: delegates the actual state refresh to
// updateHeartbeatState() and records that a heartbeat has been received
// since registration.
public void updateHeartbeat(StorageReport[] reports, long cacheCapacity,
long cacheUsed, int xceiverCount, int volFailures,
VolumeFailureSummary volumeFailureSummary) {
//TODO: update the heartbeat state
updateHeartbeatState(reports, cacheCapacity, cacheUsed, xceiverCount,
volFailures, volumeFailureSummary);
heartbeatedSinceRegistration = true;
}
第三十六步:最终在updateHeartbeatState()方法实现所有的心跳信息更新
// Final sink of the heartbeat (excerpt): stores the reported cache/xceiver
// figures and stamps the last-update times, which the NameNode later uses
// to decide whether this DataNode is still alive.
public void updateHeartbeatState(StorageReport[] reports, long cacheCapacity,
long cacheUsed, int xceiverCount, int volFailures,
VolumeFailureSummary volumeFailureSummary) {
...
//TODO: update the reported storage/cache figures
setCacheCapacity(cacheCapacity);
setCacheUsed(cacheUsed);
setXceiverCount(xceiverCount);
//TODO: record the wall-clock time of this heartbeat
setLastUpdate(Time.now());
//TODO: monotonic timestamp used to judge whether the DataNode is alive
setLastUpdateMonotonic(Time.monotonicNow());
...
}