Node 代表网络拓扑中的一个叶子结点(数据节点)或者一个中间结点(数据中心或机架)。
/**
 * Describes a single node of a network topology tree.
 * <p>
 * A node is either a leaf, which represents a data node, or an inner node,
 * which represents a datacenter or a rack. Every node has a name, and its
 * position in the network is described by a string whose syntax resembles a
 * file path. For example, a data node is named {@code hostname:port#}; if it
 * sits in rack "orange" of datacenter "dog", the string form of its network
 * location is {@code /dog/orange}.
 */
@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
@InterfaceStability.Unstable
public interface Node {

  /**
   * Reads the network location.
   *
   * @return the string representation of this node's network location
   */
  String getNetworkLocation();

  /**
   * Assigns the network location.
   *
   * @param location the location
   */
  void setNetworkLocation(String location);

  /**
   * Reads the node name.
   *
   * @return this node's name
   */
  String getName();

  /**
   * Reads the parent node.
   *
   * @return this node's parent
   */
  Node getParent();

  /**
   * Assigns the parent node.
   *
   * @param parent the parent
   */
  void setParent(Node parent);

  /**
   * Reads the depth of this node in the tree. For example, the root of a
   * tree returns 0 and its children return 1.
   *
   * @return this node's level in the tree
   */
  int getLevel();

  /**
   * Assigns the depth of this node in the tree.
   *
   * @param i the level
   */
  void setLevel(int i);
}
DatanodeID 代表一个 Datanode 结点的基本标识。Datanode 由它的联系方式(hostname 和各个端口)以及它的 storageID 共同标识。DatanodeID 里有 hostName 字段和 peerHostName 字段:hostName 是 Datanode 自己上报的内容;peerHostName 是 Namenode 根据 ipAddr 和 /etc/hosts 的配置反解析过来的。
/**
 * This class represents the primary identifier for a Datanode.
 * Datanodes are identified by how they can be contacted (hostname
 * and ports) and their storage ID, a unique number that associates
 * the Datanode's blocks with a particular Datanode.
 *
 * {@link DatanodeInfo#getName()} should be used to get the network
 * location (for topology) of a datanode, instead of using
 * {@link DatanodeID#getXferAddr()} here. Helpers are defined below
 * for each context in which a DatanodeID is used.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class DatanodeID implements Comparable<DatanodeID> {
/** Shared zero-length array of DatanodeID. */
public static final DatanodeID[] EMPTY_ARRAY = {};
private String ipAddr; // IP address
private String hostName; // hostname claimed by datanode
private String peerHostName; // hostname from the actual connection
private int xferPort; // data streaming port
private int infoPort; // info server port
// NOTE(review): presumably the secure (HTTPS) counterpart of infoPort;
// the original comment duplicated the infoPort one -- confirm.
private int infoSecurePort; // secure info server port
private int ipcPort; // IPC server port
/**
 * UUID identifying a given datanode. For upgraded Datanodes this is the
 * same as the StorageID that was previously used by this Datanode.
 * For newly formatted Datanodes it is a UUID.
 * Starts out as {@code null}.
 */
private String datanodeUuid = null;
DatanodeInfo 扩展 DatanodeID,实现了 Node 接口,增加了容量、使用量、管理状态以及提供给客户端的网络位置等临时状态信息。
/**
 * This class extends the primary identifier of a Datanode with ephemeral
 * state, e.g. usage information, current administrative state, and the
 * network location that is communicated to clients.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class DatanodeInfo extends DatanodeID implements Node {
// Usage figures for this datanode.
// NOTE(review): units are not visible here -- presumably bytes; confirm.
private long capacity;
private long dfsUsed;
private long remaining;
private long blockPoolUsed;
private long cacheCapacity;
private long cacheUsed;
// NOTE(review): presumably the time of the most recent update; confirm
// the unit against the setter's callers.
private long lastUpdate;
private int xceiverCount;
// Network location; starts out as the default rack.
private String location = NetworkTopology.DEFAULT_RACK;
private String softwareVersion;
// Starts out as an empty list.
private List<String> dependentHostNames = new LinkedList<String>();
protected AdminStates adminState;
// The two topology fields below are declared transient.
private transient int level; //which level of the tree the node resides
private transient Node parent; //its parent
DatanodeDescriptor 继承自 DatanodeInfo,增加了 health、capacity、与该 Datanode 关联的数据块等临时信息;这些信息仅供 Namenode 内部使用,不会暴露给客户端。
/**
* This class extends the DatanodeInfo class with ephemeral information (eg
* health, capacity, what blocks are associated with the Datanode) that is
* private to the Namenode, ie this class is not exposed to clients.
*/
有以下内部类,BlockTargetPair存放Block和DatanodeStorageInfo的对照关系,通俗的讲,即数据块在哪个磁盘上存储。
/**
 * Pairs a block with the storages it is targeted at.
 * <p>
 * Both fields are final and are assigned once in the constructor. The
 * targets array reference is stored exactly as passed in; no copy is made.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public static class BlockTargetPair {
  /** The block of this pair. */
  public final Block block;

  /** The target storages associated with {@link #block}. */
  public final DatanodeStorageInfo[] targets;

  /**
   * Creates a pair from the given block and target storages.
   *
   * @param theBlock the block
   * @param theTargets the target storages for the block
   */
  BlockTargetPair(Block theBlock, DatanodeStorageInfo[] theTargets) {
    block = theBlock;
    targets = theTargets;
  }
}
以下的内部类是用LinkedList实现的一个BlockQueue
/**
 * A BlockTargetPair queue.
 * <p>
 * A FIFO queue backed by a {@link LinkedList}. Every method synchronizes on
 * the queue instance, so each individual call sees a consistent view of the
 * underlying list.
 */
private static class BlockQueue<E> {
  private final Queue<E> blockq = new LinkedList<E>();

  /** @return the number of elements currently in the queue */
  synchronized int size() {
    return blockq.size();
  }

  /**
   * Appends an element to the tail of the queue.
   *
   * @param e the element to enqueue
   * @return the result of the underlying {@link Queue#offer(Object)}
   */
  synchronized boolean offer(E e) {
    return blockq.offer(e);
  }

  /**
   * Removes up to {@code numBlocks} elements from the head of the queue.
   *
   * @param numBlocks the maximum number of elements to remove
   * @return the removed elements in queue order, or {@code null} when
   *         {@code numBlocks} is not positive or the queue is empty
   */
  synchronized List<E> poll(int numBlocks) {
    if (numBlocks <= 0 || blockq.isEmpty()) {
      // Callers rely on null (not an empty list) to mean "nothing to do".
      return null;
    }
    List<E> results =
        new ArrayList<E>(Math.min(numBlocks, blockq.size()));
    for (; !blockq.isEmpty() && numBlocks > 0; numBlocks--) {
      results.add(blockq.poll());
    }
    return results;
  }

  /**
   * Returns {@code true} if the queue contains the specified element.
   * <p>
   * Synchronized like every other accessor: the backing LinkedList is not
   * thread-safe, so scanning it while another thread offers or polls could
   * observe a partially updated list.
   *
   * @param e the element to look for
   * @return whether the element is currently queued
   */
  synchronized boolean contains(E e) {
    return blockq.contains(e);
  }

  /** Removes every element from the queue. */
  synchronized void clear() {
    blockq.clear();
  }
}
CachedBlocksList
/**
 * A list of CachedBlock objects on this datanode.
 * <p>
 * Each list remembers the datanode it belongs to and which {@link Type}
 * of list it is; both are fixed at construction time.
 */
public static class CachedBlocksList extends IntrusiveCollection<CachedBlock> {

  /** The kinds of cached-block list a datanode keeps. */
  public enum Type {
    PENDING_CACHED,
    CACHED,
    PENDING_UNCACHED
  }

  private final DatanodeDescriptor datanode;
  private final Type type;

  /**
   * Creates a list bound to the given datanode and list type.
   *
   * @param owner the datanode this list belongs to
   * @param listType the kind of list
   */
  CachedBlocksList(DatanodeDescriptor owner, Type listType) {
    datanode = owner;
    type = listType;
  }

  /** @return the datanode this list was created for */
  public DatanodeDescriptor getDatanode() {
    return this.datanode;
  }

  /** @return the kind of list this is */
  public Type getType() {
    return this.type;
  }
}
处理心跳是一个重要的方法。
/**
 * Process a datanode heartbeat or stats initialization.
 * <p>
 * Records the cache and xceiver figures, refreshes the last-update time,
 * folds every storage report into its DatanodeStorageInfo, and then sets
 * the node-wide totals (capacity, remaining, block pool used, DFS used)
 * to the sums over all reports. When a failed-storage check is needed,
 * every known storage that is absent from {@code reports} is handed to
 * {@code updateFailedStorage}.
 *
 * @param reports one report per storage that the datanode sent
 * @param cacheCapacity the cache capacity reported by the datanode
 * @param cacheUsed the cache usage reported by the datanode
 * @param xceiverCount the xceiver count reported by the datanode
 * @param volFailures the number of failed volumes reported by the datanode
 */
public void updateHeartbeatState(StorageReport[] reports, long cacheCapacity,
    long cacheUsed, int xceiverCount, int volFailures) {
  long totalCapacity = 0;
  long totalRemaining = 0;
  long totalBlockPoolUsed = 0;
  long totalDfsUsed = 0;
  Set<DatanodeStorageInfo> failedStorageInfos = null;

  // Decide if we should check for any missing StorageReport and mark it as
  // failed. There are different scenarios.
  // 1. When DN is running, a storage failed. Given the current DN
  //    implementation doesn't add recovered storage back to its storage list
  //    until DN restart, we can assume volFailures won't decrease
  //    during the current DN registration session.
  //    When volFailures == this.volumeFailures, it implies there is no
  //    state change. No need to check for failed storage. This is an
  //    optimization.
  // 2. After DN restarts, volFailures might not increase and it is possible
  //    we still have new failed storage. For example, admins reduce
  //    available storages in configuration. Another corner case
  //    is the failed volumes might change after restart; a) there
  //    is one good storage A, one restored good storage B, so there is
  //    one element in storageReports and that is A. b) A failed. c) Before
  //    DN sends HB to NN to indicate A has failed, DN restarts. d) After DN
  //    restarts, storageReports has one element which is B.
  boolean checkFailedStorages = (volFailures > this.volumeFailures) ||
      !heartbeatedSinceRegistration;

  if (checkFailedStorages) {
    // The check can also be triggered by the first heartbeat after
    // registration with an unchanged count; only announce a change when
    // the number really differs.
    if (volFailures != this.volumeFailures) {
      LOG.info("Number of failed storage changes from "
          + this.volumeFailures + " to " + volFailures);
    }
    // Start by assuming every known storage has failed; each storage that
    // appears in the reports below is taken out of the set again.
    failedStorageInfos = new HashSet<DatanodeStorageInfo>(
        storageMap.values());
  }

  setCacheCapacity(cacheCapacity);
  setCacheUsed(cacheUsed);
  setXceiverCount(xceiverCount);
  setLastUpdate(Time.now());
  // Must be assigned only after checkFailedStorages was computed above.
  this.volumeFailures = volFailures;
  for (StorageReport report : reports) {
    DatanodeStorageInfo storage = updateStorage(report.getStorage());
    if (checkFailedStorages) {
      // A storage that is reported is considered to be working.
      failedStorageInfos.remove(storage);
    }

    storage.receivedHeartbeat(report);
    totalCapacity += report.getCapacity();
    totalRemaining += report.getRemaining();
    totalBlockPoolUsed += report.getBlockPoolUsed();
    totalDfsUsed += report.getDfsUsed();
  }
  rollBlocksScheduled(getLastUpdate());

  // Update total metrics for the node.
  setCapacity(totalCapacity);
  setRemaining(totalRemaining);
  setBlockPoolUsed(totalBlockPoolUsed);
  setDfsUsed(totalDfsUsed);
  if (checkFailedStorages) {
    // Whatever is still in the set was known before but not reported now.
    updateFailedStorage(failedStorageInfos);
  }
}
上面有一个循环,其中调用了 DatanodeStorageInfo storage = updateStorage(report.getStorage());
updateStorage方法的代码如下:
/**
 * Returns the DatanodeStorageInfo registered under the storage ID of
 * {@code s}, creating and registering a new one when none exists yet.
 * An existing entry whose state or storage type differs from {@code s} is
 * refreshed from {@code s} first. The whole operation holds the
 * {@code storageMap} monitor.
 *
 * @param s the storage as described by the datanode
 * @return the (possibly new or refreshed) storage info for {@code s}
 */
DatanodeStorageInfo updateStorage(DatanodeStorage s) {
  synchronized (storageMap) {
    DatanodeStorageInfo known = storageMap.get(s.getStorageID());

    if (known != null) {
      final boolean outOfDate = known.getState() != s.getState()
          || known.getStorageType() != s.getStorageType();
      if (outOfDate) {
        // For backwards compatibility, make sure that the type and
        // state are updated. Some reports from older datanodes do
        // not include these fields so we may have assumed defaults.
        // This check can be removed in the next major release after
        // 2.4.
        known.updateFromStorage(s);
        storageMap.put(known.getStorageID(), known);
      }
      return known;
    }

    // First time this storage ID is seen for the datanode.
    LOG.info("Adding new storage ID " + s.getStorageID() +
        " for DN " + getXferAddr());
    final DatanodeStorageInfo created = new DatanodeStorageInfo(this, s);
    storageMap.put(s.getStorageID(), created);
    return created;
  }
}
然后调用rollBlocksScheduled,默认计算10分钟内该datanode每种存储类型被安排写数据块的次数,代码如下:
/**
 * Rolls the scheduled-block counters once the roll interval has elapsed:
 * the current counters become the previous ones, the current counters are
 * reset, and the roll time is moved to {@code now}. Does nothing while the
 * time since the last roll is not greater than
 * {@code BLOCKS_SCHEDULED_ROLL_INTERVAL}.
 *
 * @param now the time to compare against the last roll time
 */
private void rollBlocksScheduled(long now) {
  final long sinceLastRoll = now - lastBlocksScheduledRollTime;
  if (sinceLastRoll <= BLOCKS_SCHEDULED_ROLL_INTERVAL) {
    return;
  }
  prevApproxBlocksScheduled.set(currApproxBlocksScheduled);
  currApproxBlocksScheduled.reset();
  lastBlocksScheduledRollTime = now;
}
相关变量定义如下:
/* Variables for maintaining number of blocks scheduled to be written to
 * this storage. This count is approximate and might be slightly bigger
 * in case of errors (e.g. datanode does not report if an error occurs
 * while writing the block).
 *
 * Both counters are keyed by StorageType: one holds the current counts,
 * the other the previous ones.
 */
private EnumCounters<StorageType> currApproxBlocksScheduled
= new EnumCounters<StorageType>(StorageType.class);
private EnumCounters<StorageType> prevApproxBlocksScheduled
= new EnumCounters<StorageType>(StorageType.class);
得到一个Datanode的BlockIterator的方法是
/**
 * Builds an iterator over the blocks of this datanode's storages, as
 * returned by {@code getStorageInfos()}.
 *
 * @return a new BlockIterator over those storages
 */
Iterator<BlockInfo> getBlockIterator() {
  final Iterator<BlockInfo> blocks = new BlockIterator(getStorageInfos());
  return blocks;
}
BlockIterator类是一个静态内部类,比较好玩,因为它的每个元素同时又是一个Iterator对象。
/**
 * Iterates over the blocks of several storages as one sequence, by
 * chaining the per-storage block iterators in the order the storages were
 * given. Removal is not supported.
 */
private static class BlockIterator implements Iterator<BlockInfo> {
  /** Position in {@link #iterators} of the iterator currently drained. */
  private int index = 0;
  /** One block iterator per storage; unmodifiable. */
  private final List<Iterator<BlockInfo>> iterators;

  /**
   * Collects the block iterator of every given storage.
   *
   * @param storages the storages whose blocks are to be iterated
   */
  private BlockIterator(final DatanodeStorageInfo... storages) {
    List<Iterator<BlockInfo>> perStorage =
        new ArrayList<Iterator<BlockInfo>>(storages.length);
    for (DatanodeStorageInfo e : storages) {
      perStorage.add(e.getBlockIterator());
    }
    this.iterators = Collections.unmodifiableList(perStorage);
  }

  @Override
  public boolean hasNext() {
    update();
    return !iterators.isEmpty() && iterators.get(index).hasNext();
  }

  /**
   * {@inheritDoc}
   *
   * @throws java.util.NoSuchElementException if there are no storages at
   *         all; previously this case surfaced as an
   *         IndexOutOfBoundsException from the empty list
   */
  @Override
  public BlockInfo next() {
    update();
    if (iterators.isEmpty()) {
      // Honour the Iterator contract instead of leaking the list's
      // IndexOutOfBoundsException.
      throw new java.util.NoSuchElementException(
          "No storages to iterate over.");
    }
    return iterators.get(index).next();
  }

  @Override
  public void remove() {
    throw new UnsupportedOperationException("Remove unsupported.");
  }

  /**
   * Skips past exhausted iterators, but never beyond the last one, so that
   * {@link #index} always stays a valid position for a non-empty list.
   */
  private void update() {
    while (index < iterators.size() - 1 && !iterators.get(index).hasNext()) {
      index++;
    }
  }
}
有DecommissioningStatus内部类
/**
 * Decommissioning status.
 * <p>
 * Holds the counters and the start time of a decommission. All accessors
 * synchronize on this status object. Apart from {@link #setStartTime},
 * every method first checks {@code isDecommissionInProgress()}: when it is
 * false, the setter leaves the counters untouched and the getters report
 * zero.
 */
public class DecommissioningStatus {
  private int underReplicatedBlocks;
  private int decommissionOnlyReplicas;
  private int underReplicatedInOpenFiles;
  private long startTime;

  /**
   * Stores the three counters, but only while decommissioning is in
   * progress.
   *
   * @param underRep the number of under-replicated blocks
   * @param onlyRep the number of decommission-only replicas
   * @param underConstruction the number of under-replicated blocks in
   *        open files
   */
  synchronized void set(int underRep,
      int onlyRep, int underConstruction) {
    if (!isDecommissionInProgress()) {
      return;
    }
    this.underReplicatedBlocks = underRep;
    this.decommissionOnlyReplicas = onlyRep;
    this.underReplicatedInOpenFiles = underConstruction;
  }

  /** @return the number of under-replicated blocks */
  public synchronized int getUnderReplicatedBlocks() {
    return isDecommissionInProgress() ? underReplicatedBlocks : 0;
  }

  /** @return the number of decommission-only replicas */
  public synchronized int getDecommissionOnlyReplicas() {
    return isDecommissionInProgress() ? decommissionOnlyReplicas : 0;
  }

  /** @return the number of under-replicated blocks in open files */
  public synchronized int getUnderReplicatedInOpenFiles() {
    return isDecommissionInProgress() ? underReplicatedInOpenFiles : 0;
  }

  /**
   * Set start time. Unlike {@link #set}, this is stored unconditionally.
   *
   * @param time the start time
   */
  public synchronized void setStartTime(long time) {
    this.startTime = time;
  }

  /** @return start time, or 0 when no decommission is in progress */
  public synchronized long getStartTime() {
    return isDecommissionInProgress() ? startTime : 0;
  }
}