1 Overview
The previous article looked at how DataNode storage is managed at the storage-directory level; this article analyzes how the data blocks themselves are managed.
From that article we know that DataStorage manages the data directories configured for a DataNode, with each data directory corresponding to one data volume on the machine.
FsDatasetImpl holds an FsVolumeList object through which it manages and operates on the blocks under all of the DataNode's storage directories, and the FsVolumeList in turn manages the DataNode's FsVolumeImpl objects. FsDatasetImpl also holds a DataStorage object, giving it the ability to operate on the DataNode's storage space, as well as a ReplicaMap object that maintains the information and state of every replica on the DataNode.
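Before the class diagram, the relationships described above can be summarized as a simplified composition outline (drawn from the fields discussed later in this article, not from the actual diagram):

FsDatasetImpl
  |- FsVolumeList volumes            // one FsVolumeImpl per data directory
  |    `- FsVolumeImpl
  |         `- Map<bpid, BlockPoolSlice> bpSlices   // one slice per block pool on this volume
  |- ReplicaMap volumeMap            // block pool id -> replicas stored on this DataNode
  |- DataStorage dataStorage         // manages the storage directories themselves
  `- Map<storageUuid, DatanodeStorage> storageMap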
Class hierarchy:
2 FsDatasetImpl
2.1 The FsDatasetSpi Interface
As the class diagram above shows, FsDatasetImpl implements the FsDatasetSpi interface, so let us first group the methods of that interface by what they operate on:
- Volume operations (a usage sketch follows this list)
/**
* Returns a list of FsVolumes that hold reference counts.
*
* The caller must release the reference of each volume by calling
* {@link FsVolumeReferences#close()}.
*/
FsVolumeReferences getFsVolumeReferences();
/**
* Add a new volume to the FsDataset.
*
* If the FSDataset supports block scanning, this function registers
* the new volume with the block scanner.
*
* @param location The storage location for the new volume.
* @param nsInfos Namespace information for the new volume.
*/
void addVolume(
final StorageLocation location,
final List<NamespaceInfo> nsInfos) throws IOException;
/**
* Removes a collection of volumes from FsDataset.
*
* If the FSDataset supports block scanning, this function removes
* the volumes from the block scanner.
*
* @param volumes The paths of the volumes to be removed.
* @param clearFailure set true to clear the failure information about the
* volumes.
*/
void removeVolumes(Collection<StorageLocation> volumes, boolean clearFailure);
/** @return the volume that contains a replica of the block. */
V getVolume(ExtendedBlock b);
/** @return a volume information map (name {@literal =>} info). */
Map<String, Object> getVolumeInfoMap();
/**
* Returns info about volume failures.
*
* @return info about volume failures, possibly null
*/
VolumeFailureSummary getVolumeFailureSummary();
- Block operations
/**
* Gets a list of references to the finalized blocks for the given block pool.
* <p>
* Callers of this function should call
* {@link FsDatasetSpi#acquireDatasetLock} to avoid blocks' status being
* changed during list iteration.
* </p>
* @return a list of references to the finalized blocks for the given block
* pool.
*/
List<ReplicaInfo> getFinalizedBlocks(String bpid);
/**
* @return the generation stamp stored with the block.
*/
Block getStoredBlock(String bpid, long blkid) throws IOException;
/**
* Returns an input stream at specified offset of the specified block.
* @param b block
* @param seekOffset offset with in the block to seek to
* @return an input stream to read the contents of the specified block,
* starting at the offset
* @throws IOException
*/
InputStream getBlockInputStream(ExtendedBlock b, long seekOffset)
throws IOException;
/**
* Finalizes the block previously opened for writing using writeToBlock.
* The block size is what is in the parameter b and it must match the amount
* of data written
* @param b Block to be finalized
* @param fsyncDir whether to sync the directory changes to durable device.
* @throws IOException
* @throws ReplicaNotFoundException if the replica can not be found when the
* block is being finalized. For instance, the block resides on an HDFS volume
* that has been removed.
*/
void finalizeBlock(ExtendedBlock b, boolean fsyncDir) throws IOException;
/**
* Unfinalizes the block previously opened for writing using writeToBlock.
* The temporary file associated with this block is deleted.
* @throws IOException
*/
void unfinalizeBlock(ExtendedBlock b) throws IOException;
/**
* Check if a block is valid.
*
* @param b The block to check.
* @param minLength The minimum length that the block must have. May be 0.
* @param state If this is null, it is ignored. If it is non-null, we
* will check that the replica has this state.
*
* @throws ReplicaNotFoundException If the replica is not found
*
* @throws UnexpectedReplicaStateException If the replica is not in the
* expected state.
* @throws FileNotFoundException If the block file is not found or there
* was an error locating it.
* @throws EOFException If the replica length is too short.
*
* @throws IOException May be thrown from the methods called.
*/
void checkBlock(ExtendedBlock b, long minLength, ReplicaState state)
throws ReplicaNotFoundException, UnexpectedReplicaStateException,
FileNotFoundException, EOFException, IOException;
/**
* Is the block valid?
* @return - true if the specified block is valid
*/
boolean isValidBlock(ExtendedBlock b);
- Cache operations
/**
* Caches the specified blocks
* @param bpid Block pool id
* @param blockIds - block ids to cache
*/
void cache(String bpid, long[] blockIds);
/**
* Uncaches the specified blocks
* @param bpid Block pool id
* @param blockIds - blocks ids to uncache
*/
void uncache(String bpid, long[] blockIds);
/**
* Determine if the specified block is cached.
* @param bpid Block pool id
* @param blockId - block id
* @return true if the block is cached
*/
boolean isCached(String bpid, long blockId);
- Block pool operations
/**
* add new block pool ID
* @param bpid Block pool Id
* @param conf Configuration
*/
void addBlockPool(String bpid, Configuration conf) throws IOException;
/**
* Shutdown and remove the block pool from underlying storage.
* @param bpid Block pool Id to be removed
*/
void shutdownBlockPool(String bpid) ;
/**
* Deletes the block pool directories. If force is false, directories are
* deleted only if no block files exist for the block pool. If force
* is true entire directory for the blockpool is deleted along with its
* contents.
* @param bpid BlockPool Id to be deleted.
* @param force If force is false, directories are deleted only if no
* block files exist for the block pool, otherwise entire
* directory for the blockpool is deleted along with its contents.
* @throws IOException
*/
void deleteBlockPool(String bpid, boolean force) throws IOException;
- Trash operations
/**
* Enable 'trash' for the given dataset. When trash is enabled, files are
* moved to a separate trash directory instead of being deleted immediately.
* This can be useful for example during rolling upgrades.
*/
void enableTrash(String bpid);
/**
* Clear trash
*/
void clearTrash(String bpid);
/**
* @return true when trash is enabled
*/
boolean trashEnabled(String bpid);
The interface declares other methods as well; they are not listed here.
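As an example of the contract in the volume-operation group above, getFsVolumeReferences() hands out reference-counted volumes that the caller must release via FsVolumeReferences#close(). Below is a minimal usage sketch (not taken from the HDFS source); it assumes FsVolumeReferences is Iterable<FsVolumeSpi> and Closeable, which is what the quoted javadoc contract implies:

import java.io.IOException;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;

class VolumeListingSketch {
  // List the storage IDs of all volumes, releasing the per-volume references
  // automatically when the try-with-resources block exits.
  static void printVolumes(FsDatasetSpi<?> dataset) throws IOException {
    try (FsDatasetSpi.FsVolumeReferences volumes = dataset.getFsVolumeReferences()) {
      for (FsVolumeSpi volume : volumes) {
        System.out.println(volume.getStorageID());
      }
    }
  }
}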
2.2 Constructor
The FsDatasetImpl instance is created in initStorage, as the following snippet shows:
synchronized(this) {
if (data == null) {
data = factory.newInstance(this, storage, getConf());
}
}
With that in mind, let us look at the FsDatasetImpl constructor:
/**
* An FSDataset has a directory where it loads its data files.
*/
FsDatasetImpl(DataNode datanode, DataStorage storage, Configuration conf
) throws IOException {
this.fsRunning = true;
this.datanode = datanode;
this.dataStorage = storage;
this.conf = conf;
// Buffer size for reading/writing small serialized files: io.file.buffer.size
this.smallBufferSize = DFSUtilClient.getSmallBufferSize(conf);
// Create an instrumented reentrant read-write lock
this.datasetRWLock = new InstrumentedReadWriteLock(
conf.getBoolean(DFSConfigKeys.DFS_DATANODE_LOCK_FAIR_KEY,
DFSConfigKeys.DFS_DATANODE_LOCK_FAIR_DEFAULT),
"FsDatasetRWLock", LOG, conf.getTimeDuration(
DFSConfigKeys.DFS_LOCK_SUPPRESS_WARNING_INTERVAL_KEY,
DFSConfigKeys.DFS_LOCK_SUPPRESS_WARNING_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS),
conf.getTimeDuration(
DFSConfigKeys.DFS_DATANODE_LOCK_REPORTING_THRESHOLD_MS_KEY,
DFSConfigKeys.DFS_DATANODE_LOCK_REPORTING_THRESHOLD_MS_DEFAULT,
TimeUnit.MILLISECONDS));
this.datasetWriteLock = new AutoCloseableLock(datasetRWLock.writeLock());
this.datasetReadLock = new AutoCloseableLock(datasetRWLock.readLock());
this.datasetWriteLockCondition = datasetWriteLock.newCondition();
// The number of volumes required for operation is the total number
// of volumes minus the number of failed volumes we can tolerate.
// Number of volume failures tolerated before the DataNode stops serving
volFailuresTolerated = datanode.getDnConf().getVolFailuresTolerated();
Collection<StorageLocation> dataLocations = DataNode.getStorageLocations(conf);
// Collect information about volumes that have already failed
List<VolumeFailureInfo> volumeFailureInfos = getInitialVolumeFailureInfos(
dataLocations, storage);
volsConfigured = datanode.getDnConf().getVolsConfigured();
int volsFailed = volumeFailureInfos.size();
if (volFailuresTolerated < DataNode.MAX_VOLUME_FAILURE_TOLERATED_LIMIT
|| volFailuresTolerated >= volsConfigured) {
throw new HadoopIllegalArgumentException("Invalid value configured for "
+ "dfs.datanode.failed.volumes.tolerated - " + volFailuresTolerated
+ ". Value configured is either less than maxVolumeFailureLimit or greater than "
+ "to the number of configured volumes (" + volsConfigured + ").");
}
if (volFailuresTolerated == DataNode.MAX_VOLUME_FAILURE_TOLERATED_LIMIT) {
if (volsConfigured == volsFailed) {
throw new DiskErrorException(
"Too many failed volumes - " + "current valid volumes: "
+ storage.getNumStorageDirs() + ", volumes configured: "
+ volsConfigured + ", volumes failed: " + volsFailed
+ ", volume failures tolerated: " + volFailuresTolerated);
}
} else {
if (volsFailed > volFailuresTolerated) {
throw new DiskErrorException(
"Too many failed volumes - " + "current valid volumes: "
+ storage.getNumStorageDirs() + ", volumes configured: "
+ volsConfigured + ", volumes failed: " + volsFailed
+ ", volume failures tolerated: " + volFailuresTolerated);
}
}
// Create storageMap: key is the StorageDirectory's storageUuid, value is the corresponding DatanodeStorage
storageMap = new ConcurrentHashMap<String, DatanodeStorage>();
// Create the ReplicaMap, which tracks the state of all block replicas on the DataNode.
// It maps each block pool on the DataNode to the replica information stored in that pool;
// an individual replica is described by the ReplicaInfo class.
volumeMap = new ReplicaMap(datasetRWLock);
ramDiskReplicaTracker = RamDiskReplicaTracker.getInstance(conf, this);
@SuppressWarnings("unchecked")
// Create the volume choosing policy (round-robin by default)
final VolumeChoosingPolicy<FsVolumeImpl> blockChooserImpl =
ReflectionUtils.newInstance(conf.getClass(
DFSConfigKeys.DFS_DATANODE_FSDATASET_VOLUME_CHOOSING_POLICY_KEY,
RoundRobinVolumeChoosingPolicy.class,
VolumeChoosingPolicy.class), conf);
// Create the volume list, a collection of FsVolumeImpl objects
volumes = new FsVolumeList(volumeFailureInfos, datanode.getBlockScanner(),
blockChooserImpl);
// Create the FsDatasetAsyncDiskService, essentially a per-volume thread pool for asynchronous disk operations
asyncDiskService = new FsDatasetAsyncDiskService(datanode, this);
// RamDiskAsyncLazyPersistService plays a similar role to FsDatasetAsyncDiskService, but for the LAZY_PERSIST storage policy
asyncLazyPersistService = new RamDiskAsyncLazyPersistService(datanode, conf);
deletingBlock = new HashMap<String, Set<Long>>();
// Add and activate the storage volumes
for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
addVolume(storage.getStorageDir(idx));
}
setupAsyncLazyPersistThreads();
cacheManager = new FsDatasetCache(this);
// Start the lazy writer once we have built the replica maps.
// We need to start the lazy writer even if MaxLockedMemory is set to
// zero because we may have un-persisted replicas in memory from before
// the process restart. To minimize the chances of data loss we'll
// ensure they get written to disk now.
if (ramDiskReplicaTracker.numReplicasNotPersisted() > 0 ||
datanode.getDnConf().getMaxLockedMemory() > 0) {
lazyWriter = new Daemon(new LazyWriter(conf));
lazyWriter.start();
} else {
lazyWriter = null;
}
registerMBean(datanode.getDatanodeUuid());
// Add a Metrics2 Source Interface. This is same
// data as MXBean. We can remove the registerMbean call
// in a release where we can break backward compatibility
MetricsSystem ms = DefaultMetricsSystem.instance();
ms.register("FSDatasetState", "FSDatasetState", this);
localFS = FileSystem.getLocal(conf);
blockPinningEnabled = conf.getBoolean(
DFSConfigKeys.DFS_DATANODE_BLOCK_PINNING_ENABLED,
DFSConfigKeys.DFS_DATANODE_BLOCK_PINNING_ENABLED_DEFAULT);
maxDataLength = conf.getInt(
CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH,
CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
}
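The volume-failure check in the middle of the constructor is easier to read in isolation. The following is a hypothetical, standalone restatement of that logic, not HDFS code; it assumes the sentinel DataNode.MAX_VOLUME_FAILURE_TOLERATED_LIMIT is -1, which the two comparisons above suggest (a value of -1 means "keep running as long as at least one volume is healthy"):

class VolumeFailureCheckSketch {
  // Sentinel assumed to be -1; see the lead-in above.
  static final int MAX_VOLUME_FAILURE_TOLERATED_LIMIT = -1;

  static void check(int tolerated, int volsConfigured, int volsFailed) {
    // Valid configured range is [-1, volsConfigured - 1].
    if (tolerated < MAX_VOLUME_FAILURE_TOLERATED_LIMIT || tolerated >= volsConfigured) {
      throw new IllegalArgumentException(
          "Invalid dfs.datanode.failed.volumes.tolerated: " + tolerated);
    }
    boolean tooManyFailures = (tolerated == MAX_VOLUME_FAILURE_TOLERATED_LIMIT)
        ? (volsFailed == volsConfigured)   // -1: fatal only when every volume has failed
        : (volsFailed > tolerated);        // otherwise: fatal once failures exceed the limit
    if (tooManyFailures) {
      throw new IllegalStateException("Too many failed volumes: " + volsFailed);
    }
  }
}

For example, with four configured volumes and a tolerance of 1, one failed volume is acceptable while two are fatal; with a tolerance of -1, the DataNode keeps running until the last volume fails.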
The individual methods are not covered in more detail here; they mainly implement the methods declared by the FsDatasetSpi interface.
2.3 Key Fields
// Reference to the owning DataNode
final DataNode datanode;
// Reference to the DataStorage object, which provides the ability to manage the storage space
final DataStorage dataStorage;
// Manages all of the FsVolumeImpl objects defined on the DataNode
private final FsVolumeList volumes;
// For each storage directory, FsDatasetImpl constructs a DatanodeStorage object;
// storageMap stores them as a storageUuid -> DatanodeStorage mapping,
// so FsDatasetImpl can look up the DatanodeStorage for a given storageUuid.
final Map<String, DatanodeStorage> storageMap;
final FsDatasetAsyncDiskService asyncDiskService;
final Daemon lazyWriter;
// FsDatasetCache, the helper class used to cache blocks in memory
final FsDatasetCache cacheManager;
private final Configuration conf;
private final int volFailuresTolerated;
private final int volsConfigured;
private volatile boolean fsRunning;
// ReplicaMap recording information about every block replica on the DataNode
final ReplicaMap volumeMap;
final Map<String, Set<Long>> deletingBlock;
final RamDiskReplicaTracker ramDiskReplicaTracker;
// Single-threaded executor per volume for asynchronous RAM-disk (lazy persist) operations
final RamDiskAsyncLazyPersistService asyncLazyPersistService;
private static final int MAX_BLOCK_EVICTIONS_PER_ITERATION = 3;
private final int smallBufferSize;
final LocalFileSystem localFS;
private boolean blockPinningEnabled;
private final int maxDataLength;
@VisibleForTesting
final AutoCloseableLock datasetWriteLock;
@VisibleForTesting
final AutoCloseableLock datasetReadLock;
@VisibleForTesting
final InstrumentedReadWriteLock datasetRWLock;
private final Condition datasetWriteLockCondition;
private static String blockPoolId = "";
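datasetRWLock, datasetReadLock and datasetWriteLock implement the locking discipline used throughout FsDatasetImpl: AutoCloseableLock wraps the read or write half of the instrumented lock so that it can be taken with try-with-resources, exactly as the activateVolume() method shown later does. A minimal sketch of the pattern:

// Readers take the read lock, writers take the write lock; the lock is released
// automatically when the block exits, even on exceptions.
try (AutoCloseableLock lock = datasetReadLock.acquire()) {
  // look up volumeMap / storageMap safely here
}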
3 FsDatasetImpl Fields in Detail
This section takes a closer look at a few of the important fields.
3.1 FsVolumeList
FsVolumeList is essentially a collection of FsVolumeImpl objects:
class FsVolumeList {
// Thread-safe variant of ArrayList; the elements are FsVolumeImpl
private final CopyOnWriteArrayList<FsVolumeImpl> volumes =
new CopyOnWriteArrayList<>();
// Tracks volume failures, sorted by volume path.
// map from volume storageID to the volume failure info
// Records why each data volume failed
private final Map<StorageLocation, VolumeFailureInfo> volumeFailureInfos =
Collections.synchronizedMap(
new TreeMap<StorageLocation, VolumeFailureInfo>());
// FsVolumeImpl instances that are in the process of being removed
private final ConcurrentLinkedQueue<FsVolumeImpl> volumesBeingRemoved =
new ConcurrentLinkedQueue<>();
private final AutoCloseableLock checkDirsLock;
private final Condition checkDirsLockCondition;
private final VolumeChoosingPolicy<FsVolumeImpl> blockChooser;
private final BlockScanner blockScanner;
FsVolumeList(List<VolumeFailureInfo> initialVolumeFailureInfos,
BlockScanner blockScanner,
VolumeChoosingPolicy<FsVolumeImpl> blockChooser) {
this.blockChooser = blockChooser;
this.blockScanner = blockScanner;
this.checkDirsLock = new AutoCloseableLock();
this.checkDirsLockCondition = checkDirsLock.newCondition();
for (VolumeFailureInfo volumeFailureInfo: initialVolumeFailureInfos) {
volumeFailureInfos.put(volumeFailureInfo.getFailedStorageLocation(),
volumeFailureInfo);
}
}
...
}
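blockChooser is the VolumeChoosingPolicy passed in from the FsDatasetImpl constructor (RoundRobinVolumeChoosingPolicy by default). The sketch below is a simplified, hypothetical version of what a round-robin policy does, not the HDFS implementation: starting from where the previous choice stopped, pick the first volume with enough free space for the block being written.

import java.io.IOException;
import java.util.List;

class RoundRobinChooserSketch {
  interface Volume {
    long getAvailable() throws IOException;   // free space on this volume
  }

  private int curVolume = 0;                  // where the previous choice stopped

  synchronized Volume chooseVolume(List<? extends Volume> volumes, long blockSize)
      throws IOException {
    if (volumes.isEmpty()) {
      throw new IOException("No volumes available");
    }
    int start = curVolume >= volumes.size() ? 0 : curVolume;
    int i = start;
    do {
      Volume volume = volumes.get(i);
      i = (i + 1) % volumes.size();
      if (volume.getAvailable() >= blockSize) {  // enough room for the new replica
        curVolume = i;                           // next call continues after this volume
        return volume;
      }
    } while (i != start);
    throw new IOException("Out of space: no volume can hold " + blockSize + " bytes");
  }
}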
3.2 ReplicaMap
ReplicaMap stores the information of every block replica on the DataNode. It maintains a map from each block pool on the DataNode to the replica information stored in that pool; an individual replica is described by the ReplicaInfo class.
The main field of ReplicaMap is:
// Map of block pool Id to a set of ReplicaInfo.
// key is the block pool id, value is the set of ReplicaInfo for that pool
private final Map<String, FoldedTreeSet<ReplicaInfo>> map = new HashMap<>();
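A hypothetical, simplified sketch of the semantics this field provides is shown below. It is not the real ReplicaMap, which keeps ReplicaInfo objects in a FoldedTreeSet and is guarded by the dataset lock; it only illustrates the two-level lookup, block pool id first, then block id.

import java.util.HashMap;
import java.util.Map;

class ReplicaMapSketch {
  // Placeholder for ReplicaInfo: just enough to key the inner map.
  static class Replica {
    final long blockId;
    Replica(long blockId) { this.blockId = blockId; }
  }

  // block pool id -> (block id -> replica)
  private final Map<String, Map<Long, Replica>> map = new HashMap<>();

  synchronized Replica add(String bpid, Replica replica) {
    return map.computeIfAbsent(bpid, k -> new HashMap<>()).put(replica.blockId, replica);
  }

  synchronized Replica get(String bpid, long blockId) {
    Map<Long, Replica> pool = map.get(bpid);
    return pool == null ? null : pool.get(blockId);
  }
}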
3.2.1 ReplicaState
Since we are dealing with replicas, it is worth first looking at ReplicaState, the enum describing a replica's state:
enum ReplicaState {
/** Replica is finalized. The state when replica is not modified. */
// The DataNode uses the FinalizedReplica class to describe replicas in the FINALIZED state
FINALIZED(0),
/** Replica is being written to. */
// A replica just created by an HDFS client, or one being appended to;
// its data is being written and part of it is already visible to clients.
// The DataNode uses the ReplicaBeingWritten class to describe replicas in the RBW state
RBW(1),
/** Replica is waiting to be recovered. */
// If the DataNode crashes or restarts, every RBW replica is loaded as RWR after the restart;
// RWR replicas wait for block recovery. The DataNode uses the ReplicaWaitingToBeRecovered class to describe them
RWR(2),
/** Replica is under recovery. */
// A replica undergoing block recovery. The DataNode uses the ReplicaUnderRecovery class to describe replicas in the RUR state
RUR(3),
/** Temporary replica: created for replication and relocation only. */
// When blocks are copied between DataNodes, or during cluster balancing, the replica being
// written is in the TEMPORARY state. Unlike RBW, a TEMPORARY replica is not visible to
// clients, and the DataNode simply deletes TEMPORARY replicas when it restarts.
// The DataNode uses the ReplicaInPipeline class to describe replicas in the TEMPORARY state.
TEMPORARY(4);
...
}
3.2.2 ReplicaInfo
ReplicaInfo is an abstract class describing a block replica stored on a DataNode. It exposes methods such as getBlockFile(), getMetaFile(), getVolume() and getStorageUuid(), which return the replica's block file, its checksum (meta) file, the FsVolumeImpl of the storage directory holding the replica, and that directory's storageUuid, respectively.
ReplicaInfo has the following subclasses, each describing replicas in a particular state:
- ReplicaBeingWritten corresponds to the RBW state: the replica is being written to the DataNode through a data pipeline.
- ReplicaUnderRecovery corresponds to the RUR state: the replica is undergoing recovery.
- ReplicaWaitingToBeRecovered corresponds to the RWR state: the replica is waiting for recovery.
- FinalizedReplica corresponds to the FINALIZED state: the replica has finished being written and has been committed.
3.3 FsVolumeImpl
FsVolumeImpl is the element type of FsVolumeList; each FsVolumeImpl corresponds to one data directory, so it is described here.
3.3.1 Fields
// Reference to the owning FsDatasetImpl; used here mainly for locking
private final FsDatasetImpl dataset;
// storageID of the StorageDirectory backing this volume
private final String storageID;
private final StorageType storageType;
// All BlockPoolSlices under this FsVolumeImpl, as a blockPoolId -> BlockPoolSlice mapping
private final Map<String, BlockPoolSlice> bpSlices = new ConcurrentHashMap<String, BlockPoolSlice>();
// Refers to the base StorageLocation used to construct this volume
// (i.e., does not include STORAGE_DIR_CURRENT in
// <location>/STORAGE_DIR_CURRENT/)
private final StorageLocation storageLocation;
// The current/ directory under this storage directory
private final File currentDir; // <StorageDirectory>/current
// Disk usage of this storage directory
private final DF usage;
private final ReservedSpaceCalculator reserved;
private CloseableReferenceCount reference = new CloseableReferenceCount();
// Disk space reserved for blocks (RBW or Re-replicating) open for write.
private AtomicLong reservedForReplicas;
// Recently recorded reserved disk space for this storage directory
private long recentReserved = 0;
private final Configuration conf;
// Capacity configured. This is useful when we want to
// limit the visible capacity for tests. If negative, then we just
// query from the filesystem.
protected volatile long configuredCapacity;
private final FileIoProvider fileIoProvider;
private final DataNodeVolumeMetrics metrics;
/**
* Per-volume worker pool that processes new blocks to cache.
* The maximum number of workers per volume is bounded (configurable via
* dfs.datanode.fsdatasetcache.max.threads.per.volume) to limit resource
* contention.
*/
protected ThreadPoolExecutor cacheExecutor;
3.3.2 Constructor
// The constructor mainly initializes the fields described above
FsVolumeImpl(FsDatasetImpl dataset, String storageID, StorageDirectory sd,
FileIoProvider fileIoProvider, Configuration conf, DF usage)
throws IOException {
if (sd.getStorageLocation() == null) {
throw new IOException("StorageLocation specified for storage directory " +
sd + " is null");
}
this.dataset = dataset;
this.storageID = storageID;
this.reservedForReplicas = new AtomicLong(0L);
this.storageLocation = sd.getStorageLocation();
this.currentDir = sd.getCurrentDir();
this.storageType = storageLocation.getStorageType();
this.configuredCapacity = -1;
this.usage = usage;
if (this.usage != null) {
reserved = new ReservedSpaceCalculator.Builder(conf)
.setUsage(this.usage).setStorageType(storageType).build();
boolean fixedSizeVolume = conf.getBoolean(
DFSConfigKeys.DFS_DATANODE_FIXED_VOLUME_SIZE_KEY,
DFSConfigKeys.DFS_DATANODE_FIXED_VOLUME_SIZE_DEFAULT);
if (fixedSizeVolume) {
cachedCapacity = this.usage.getCapacity();
}
} else {
reserved = null;
LOG.warn("Setting reserved to null as usage is null");
cachedCapacity = -1;
}
if (currentDir != null) {
File parent = currentDir.getParentFile();
cacheExecutor = initializeCacheExecutor(parent);
this.metrics = DataNodeVolumeMetrics.create(conf, parent.getPath());
} else {
cacheExecutor = null;
this.metrics = null;
}
this.conf = conf;
this.fileIoProvider = fileIoProvider;
}
3.3.3 Adding an FsVolumeImpl to the FsVolumeList
This is first done in the FsDatasetImpl constructor:
// Add and activate the storage volumes
for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
addVolume(storage.getStorageDir(idx));
}
addVolume creates the corresponding FsVolumeImpl instance, obtains a reference to it, and then calls activateVolume to activate the volume:
private void addVolume(Storage.StorageDirectory sd) throws IOException {
final StorageLocation storageLocation = sd.getStorageLocation();
// If IOException raises from FsVolumeImpl() or getVolumeMap(), there is
// nothing needed to be rolled back to make various data structures, e.g.,
// storageMap and asyncDiskService, consistent.
// Create the FsVolumeImpl instance via the builder pattern
FsVolumeImpl fsVolume = new FsVolumeImplBuilder()
.setDataset(this)
.setStorageID(sd.getStorageUuid())
.setStorageDirectory(sd)
.setFileIoProvider(datanode.getFileIoProvider())
.setConf(this.conf)
.build();
// Obtain a reference to the new volume
FsVolumeReference ref = fsVolume.obtainReference();
ReplicaMap tempVolumeMap = new ReplicaMap(datasetRWLock);
// Populate the temporary replica map for this volume:
// replicas are first loaded from the replica cache; if that fails, the volume is scanned
// and the map is filled using a ForkJoin (RecursiveAction) task
fsVolume.getVolumeMap(tempVolumeMap, ramDiskReplicaTracker);
// Activate the volume
activateVolume(tempVolumeMap, sd, storageLocation.getStorageType(), ref);
LOG.info("Added volume - " + storageLocation + ", StorageType: " +
storageLocation.getStorageType());
}
activateVolume makes the volume available to serve requests:
/**
* Activate a volume to serve requests.
* @throws IOException if the storage UUID already exists.
*/
private void activateVolume(
ReplicaMap replicaMap,
Storage.StorageDirectory sd, StorageType storageType,
FsVolumeReference ref) throws IOException {
try (AutoCloseableLock lock = datasetWriteLock.acquire()) {
// Check whether storageMap already contains a storage with this volume's UUID
DatanodeStorage dnStorage = storageMap.get(sd.getStorageUuid());
if (dnStorage != null) {
final String errorMsg = String.format(
"Found duplicated storage UUID: %s in %s.",
sd.getStorageUuid(), sd.getVersionFile());
LOG.error(errorMsg);
throw new IOException(errorMsg);
}
// Merge all of the volume's replicas into volumeMap
volumeMap.mergeAll(replicaMap);
// Register the DatanodeStorage for this volume in storageMap
storageMap.put(sd.getStorageUuid(),
new DatanodeStorage(sd.getStorageUuid(),
DatanodeStorage.State.NORMAL,
storageType));
asyncDiskService.addVolume((FsVolumeImpl) ref.getVolume());
// Add the reference to this volume to the FsVolumeList
volumes.addVolume(ref);
}
}
4 BlockPoolSlice
BlockPoolSlice is referenced from FsVolumeImpl, where it is kept in a map:
private final Map<String, BlockPoolSlice> bpSlices
= new ConcurrentHashMap<String, BlockPoolSlice>();
The BlockPoolSlice class manages all of the blocks that one block pool stores under one storage directory. Every block pool has a block pool directory under each storage directory; in other words, a BlockPoolSlice manages all of the blocks in that block pool directory.
A block pool directory contains a current directory and a tmp directory; the current directory in turn contains the finalized, rbw and lazypersist subdirectories.
The finalized directory holds all replicas in the FINALIZED state; the rbw directory holds replicas in the RBW (being written), RWR (waiting to be recovered) and RUR (under recovery) states; and the tmp directory holds replicas in the TEMPORARY state.
When a new replica is created by a client write request, it is placed in the rbw directory.
When a new replica is created during block replication or cluster balancing, it is placed in the tmp directory.
Once a replica finishes writing and is committed, it is moved into the finalized directory.
When the DataNode restarts, every replica in tmp is deleted, replicas in rbw are loaded in the RWR state, and replicas in finalized are loaded in the FINALIZED state.
The lazypersist directory was introduced in HDFS 2.6 to support writing transient block replicas in memory and lazily persisting them to disk; it is where those lazily persisted replicas are stored.
The finalized subdirectory has a somewhat special structure: it contains both subdirectories and files, and it stores two kinds of files, the block data files and their checksum (meta) files.
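Putting this together, a block pool directory on disk typically looks like the following (an illustrative layout; the block pool id, subdirectory fan-out and block ids will of course differ per cluster):

<dataDir>/current/BP-<random>-<nn-ip>-<creation-time>/
    current/
        VERSION
        finalized/subdir0/subdir0/blk_1073741825
                                  blk_1073741825_1001.meta
        rbw/
        lazypersist/
    tmp/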
4.1 BlockPoolSlice Constructor
The BlockPoolSlice constructor first creates the current, finalized, lazypersist, rbw and tmp directories and assigns their references to the corresponding fields. It then initializes the dfsUsage field, which tracks the disk usage of the block pool directory. Finally, it registers a shutdown hook that saves the disk usage information when the DataNode process exits.
/**
* Create a block pool slice
* @param bpid Block pool Id
* @param volume {@link FsVolumeImpl} to which this BlockPool belongs to
* @param bpDir directory corresponding to the BlockPool
* @param conf configuration
* @param timer include methods for getting time
* @throws IOException
*/
BlockPoolSlice(String bpid, FsVolumeImpl volume, File bpDir,
Configuration conf, Timer timer) throws IOException {
// blockPoolId
this.bpid = bpid;
// The volume this block pool slice belongs to
this.volume = volume;
// FileIoProvider abstracts the file I/O operations the DataNode performs and invokes
// profiling (statistics) and fault-injection (testing) hooks before and after each operation
this.fileIoProvider = volume.getFileIoProvider();
// The current directory
this.currentDir = new File(bpDir, DataStorage.STORAGE_DIR_CURRENT);
// The finalized directory
this.finalizedDir = new File(
currentDir, DataStorage.STORAGE_DIR_FINALIZED);
// The lazypersist directory
this.lazypersistDir = new File(currentDir, DataStorage.STORAGE_DIR_LAZY_PERSIST);
if (!this.finalizedDir.exists()) {
if (!this.finalizedDir.mkdirs()) {
throw new IOException("Failed to mkdirs " + this.finalizedDir);
}
}
// File I/O buffer size
this.ioFileBufferSize = DFSUtilClient.getIoFileBufferSize(conf);
// Whether duplicate replicas should be deleted (default: true)
this.deleteDuplicateReplicas = conf.getBoolean(
DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION,
DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION_DEFAULT);
// How long a cached dfsUsed value stays valid (default: 10 minutes)
this.cachedDfsUsedCheckTime =
conf.getLong(
DFSConfigKeys.DFS_DN_CACHED_DFSUSED_CHECK_INTERVAL_MS,
DFSConfigKeys.DFS_DN_CACHED_DFSUSED_CHECK_INTERVAL_DEFAULT_MS);
// Maximum IPC request payload the server will accept
this.maxDataLength = conf.getInt(
CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH,
CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
this.timer = timer;
// Files that were being written when the datanode was last shutdown
// are now moved back to the data directory. It is possible that
// in the future, we might want to do some sort of datanode-local
// recovery for these blocks. For example, crc validation.
// The tmp directory
this.tmpDir = new File(bpDir, DataStorage.STORAGE_DIR_TMP);
if (tmpDir.exists()) {
fileIoProvider.fullyDelete(volume, tmpDir);
}
// The rbw directory
this.rbwDir = new File(currentDir, DataStorage.STORAGE_DIR_RBW);
// create the rbw and tmp directories if they don't exist.
fileIoProvider.mkdirs(volume, rbwDir);
fileIoProvider.mkdirs(volume, tmpDir);
// Root directory for the replica cache; defaults to the current directory
String cacheDirRoot = conf.get(
DFSConfigKeys.DFS_DATANODE_REPLICA_CACHE_ROOT_DIR_KEY);
if (cacheDirRoot != null && !cacheDirRoot.isEmpty()) {
this.replicaCacheDir = new File(cacheDirRoot,
currentDir.getCanonicalPath());
if (!this.replicaCacheDir.exists()) {
if (!this.replicaCacheDir.mkdirs()) {
throw new IOException("Failed to mkdirs " + this.replicaCacheDir);
}
}
} else {
this.replicaCacheDir = currentDir;
}
// Expiry time for cached replica files
this.replicaCacheExpiry = conf.getTimeDuration(
DFSConfigKeys.DFS_DATANODE_REPLICA_CACHE_EXPIRY_TIME_KEY,
DFSConfigKeys.DFS_DATANODE_REPLICA_CACHE_EXPIRY_TIME_DEFAULT,
TimeUnit.MILLISECONDS);
// Use cached value initially if available. Or the following call will
// block until the initial du command completes.
// Disk space used by this block pool.
// dfsUsage is a GetSpaceUsed implementation (typically DU),
// a utility that measures disk usage by shelling out to the operating system,
// essentially running `du -sk` on the directory
this.dfsUsage = new FSCachingGetSpaceUsed.Builder().setBpid(bpid)
.setVolume(volume)
.setPath(bpDir)
.setConf(conf)
.setInitialUsed(loadDfsUsed())
.build();
// Initialize the ForkJoin pool used for adding replicas
if (addReplicaThreadPool == null) {
// initialize add replica fork join pool
initializeAddReplicaPool(conf, (FsDatasetImpl) volume.getDataset());
}
// Make the dfs usage to be saved during shutdown.
shutdownHook = new Runnable() {
@Override
public void run() {
if (!dfsUsedSaved) {
saveDfsUsed();
addReplicaThreadPool.shutdownNow();
}
}
};
ShutdownHookManager.get().addShutdownHook(shutdownHook,
SHUTDOWN_HOOK_PRIORITY);
}
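The setInitialUsed(loadDfsUsed()) call and the shutdown hook above implement a simple optimization: the last computed dfsUsed value is written to a cache file on shutdown and read back on startup, so a restarted DataNode does not have to block on a fresh du of the whole block pool unless the cached value is too old (cachedDfsUsedCheckTime). The sketch below is a hypothetical standalone illustration of that idea, not the actual loadDfsUsed()/saveDfsUsed() code; the cache file name and format are assumptions:

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;

class CachedDfsUsedSketch {
  static final String CACHE_FILE = "dfsUsed";   // assumed cache file name

  // Persist the latest usage together with the time it was measured.
  static void save(File dir, long usedBytes) throws IOException {
    String line = usedBytes + " " + System.currentTimeMillis();
    Files.write(new File(dir, CACHE_FILE).toPath(),
        line.getBytes(StandardCharsets.UTF_8));
  }

  // Return the cached usage, or -1 if it is missing or older than maxAgeMs,
  // in which case the caller must fall back to a real du.
  static long load(File dir, long maxAgeMs) throws IOException {
    File f = new File(dir, CACHE_FILE);
    if (!f.exists()) {
      return -1;
    }
    String[] parts = new String(Files.readAllBytes(f.toPath()),
        StandardCharsets.UTF_8).trim().split("\\s+");
    long used = Long.parseLong(parts[0]);
    long measuredAt = Long.parseLong(parts[1]);
    return System.currentTimeMillis() - measuredAt <= maxAgeMs ? used : -1;
  }
}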
The main methods are as follows: