2021SC@SDUSC
Hadoop yarn源码分析(九) Container源码解析 2021SC@SDUSC
一、Container简介
Container是yarn中一个虚拟出来的概念。Container是yarn中一个动态资源分配的概念,拥有一定的内存,核数,由RM分配给AM或Map Task或Reduce Task,之后,Application便可以在以Container为基础的容器中运行。
Container的启动是由ContainersLauncher服务来完成的,它是Containers的启动程序,它维护了一个线程池,而且以并行的方式完成Container的相关操作,比如启动(start)或杀死(kill)Container。必须在ResourceLocalizationService启动之后,才能启动ContainersLauncher服务,因为它依赖于前者在本地文件系统上创建的系统目录。
ContainerExecutor是在底层操作系统上启动container的机制的抽象类,所有的executor必须继承自ContainerExecutor。ContainerExecutor可与底层操作系统进行交互, 安全存放Container需要的文件和目录, 进而以一种安全的方式启动和清除Container对应的进程。
二、Container属性
2.1 ContainersLauncher属性
ContainersLauncher基本属性
//org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher.java
// Context; handle() queries its getApplications() map to find the Application that owns a container.
private Context context;
// ContainerExecutor handed to every ContainerLaunch created by handle().
private ContainerExecutor exec;
// Dispatcher handed to every ContainerLaunch created by handle()
// (presumably the NodeManager event dispatcher, not a scheduler — confirm).
private Dispatcher dispatcher;
// The following two collaborators are likewise passed through to each ContainerLaunch.
private ContainerManagerImpl containerManager;
private LocalDirsHandlerService dirsHandler;
// Cached thread pool that executes the submitted launches; threads are named "ContainersLauncher #<n>".
@VisibleForTesting
public ExecutorService containerLauncher =
HadoopExecutors.newCachedThreadPool(
new ThreadFactoryBuilder()
.setNameFormat("ContainersLauncher #%d")
.build());
// Launches tracked per ContainerId; the HashMap is wrapped in a synchronized map.
@VisibleForTesting
public final Map<ContainerId, ContainerLaunch> running =
Collections.synchronizedMap(new HashMap<ContainerId, ContainerLaunch>());
serviceInit初始化方法
/**
 * Initializes the service.
 *
 * <p>Looks up the local-filesystem {@code FileContext} for {@code conf} before
 * delegating to the superclass. The returned FileContext is discarded, so the
 * visible effect of the call is that an unsupported local filesystem is
 * reported here as a {@code YarnRuntimeException}.
 *
 * @param conf configuration passed to the lookup and to the superclass
 * @throws Exception whatever the superclass initialization throws
 */
@Override
protected void serviceInit(Configuration conf) throws Exception {
  try {
    // TODO Is this required?
    FileContext.getLocalFSFileContext(conf);
  } catch (UnsupportedFileSystemException unsupported) {
    throw new YarnRuntimeException("Failed to start ContainersLauncher",
        unsupported);
  }
  super.serviceInit(conf);
}
2.2 ContainerExecutor属性
//org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.java
//通配符
protected static final String WILDCARD = "*";
public static final String TOKEN_FILE_NAME_FMT = "%s.tokens";
//创建启动脚本时要使用的权限,700
public static final FsPermission TASK_LAUNCH_SCRIPT_PERMISSION =
FsPermission.createImmutable((short)0700);
//将调试信息写入相对路径
public static final String DIRECTORY_CONTENTS = "directory.info";
//配置信息
private Configuration conf;
//ContainerId存储路径
private final ConcurrentMap<ContainerId, Path> pidFiles =
new ConcurrentHashMap<>();
//可重入读写锁
private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
白名单变量,用户可以自定义设置的环境变量, 当用户指定的时候, 不再使用NodeManager环境的默认值
private String[] whitelistVars;
//退出的暂停时间
private int exitCodeFileTimeout =
YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT;
三、Container基本方法
3.1 ContainersLauncher方法
3.1.1 handle()方法
在该方法中,通过对传入event的type进行判断,从而对Container执行各种操作。
LAUNCH_CONTAINER : 启动Container
RELAUNCH_CONTAINER : 重新启动Container
RECOVER_CONTAINER : 异常恢复Container/ 重新提交.
RECOVER_PAUSED_CONTAINER : 恢复暂停的容器
CLEANUP_CONTAINER : 清理容器
CLEANUP_CONTAINER_FOR_REINIT : 清理容器,重新进行初始化
SIGNAL_CONTAINER : 向Container发送指令
PAUSE_CONTAINER:暂停Container
RESUME_CONTAINER:恢复运行已暂停的Container
/**
 * Dispatches a {@link ContainersLauncherEvent} according to its type.
 *
 * <ul>
 *   <li>LAUNCH / RELAUNCH / RECOVER: build the matching launch object, submit
 *       it to the {@code containerLauncher} pool and record it in
 *       {@code running}.</li>
 *   <li>RECOVER_PAUSED: build and submit the launch object.</li>
 *   <li>CLEANUP / CLEANUP_FOR_REINIT: delegate to {@code cleanup}.</li>
 *   <li>SIGNAL / PAUSE / RESUME: look the container up in {@code running} and
 *       forward the request; if it is not tracked there is nothing to do.</li>
 * </ul>
 *
 * Failures while signaling, pausing or resuming are logged and not rethrown.
 *
 * @param event the launcher event; its container supplies the ContainerId
 */
@Override
public void handle(ContainersLauncherEvent event) {
  // Each call handles a single event; the launch work itself runs on the
  // containerLauncher thread pool.
  Container container = event.getContainer();
  ContainerId containerId = container.getContainerId();
  // Branch on the event type.
  switch (event.getType()) {
    // Launch a container.
    case LAUNCH_CONTAINER:
      Application app =
          context.getApplications().get(
              containerId.getApplicationAttemptId().getApplicationId());
      ContainerLaunch launch =
          new ContainerLaunch(context, getConfig(), dispatcher, exec, app,
              event.getContainer(), dirsHandler, containerManager);
      containerLauncher.submit(launch);
      running.put(containerId, launch);
      break;
    // Relaunch a container.
    case RELAUNCH_CONTAINER:
      app = context.getApplications().get(
          containerId.getApplicationAttemptId().getApplicationId());
      ContainerRelaunch relaunch =
          new ContainerRelaunch(context, getConfig(), dispatcher, exec, app,
              event.getContainer(), dirsHandler, containerManager);
      containerLauncher.submit(relaunch);
      running.put(containerId, relaunch);
      break;
    // Recover a container: resubmit it as a RecoveredContainerLaunch.
    case RECOVER_CONTAINER:
      app = context.getApplications().get(
          containerId.getApplicationAttemptId().getApplicationId());
      launch = new RecoveredContainerLaunch(context, getConfig(), dispatcher,
          exec, app, event.getContainer(), dirsHandler, containerManager);
      containerLauncher.submit(launch);
      running.put(containerId, launch);
      break;
    // Recover a paused container.
    // NOTE(review): unlike the other launch cases, this launch is not added
    // to `running` — confirm that this is intended.
    case RECOVER_PAUSED_CONTAINER:
      app = context.getApplications().get(
          containerId.getApplicationAttemptId().getApplicationId());
      launch = new RecoverPausedContainerLaunch(context, getConfig(),
          dispatcher, exec, app, event.getContainer(), dirsHandler,
          containerManager);
      containerLauncher.submit(launch);
      break;
    // Clean up the container.
    case CLEANUP_CONTAINER:
      cleanup(event, containerId, true);
      break;
    // Clean up the container ahead of a re-initialization.
    case CLEANUP_CONTAINER_FOR_REINIT:
      cleanup(event, containerId, false);
      break;
    // Send a signal command to the container.
    case SIGNAL_CONTAINER:
      SignalContainersLauncherEvent signalEvent =
          (SignalContainersLauncherEvent) event;
      ContainerLaunch runningContainer = running.get(containerId);
      if (runningContainer == null) {
        // Container is not tracked as running, so there is nothing to signal.
        LOG.info("Container " + containerId + " not running, nothing to signal.");
        return;
      }
      try {
        runningContainer.signalContainer(signalEvent.getCommand());
      } catch (IOException e) {
        // Pass the exception to the logger so the cause and stack trace are
        // not lost.
        LOG.warn("Got exception while signaling container " + containerId
            + " with command " + signalEvent.getCommand(), e);
      }
      break;
    // Pause the container.
    case PAUSE_CONTAINER:
      ContainerLaunch launchedContainer = running.get(containerId);
      if (launchedContainer == null) {
        // Container not launched. So nothing needs to be done.
        return;
      }
      try {
        launchedContainer.pauseContainer();
      } catch (Exception e) {
        LOG.info("Got exception while pausing container: " +
            StringUtils.stringifyException(e));
      }
      break;
    // Resume the container.
    case RESUME_CONTAINER:
      ContainerLaunch launchCont = running.get(containerId);
      if (launchCont == null) {
        // Container not launched. So nothing needs to be done.
        return;
      }
      try {
        launchCont.resumeContainer();
      } catch (Exception e) {
        LOG.info("Got exception while resuming container: " +
            StringUtils.stringifyException(e));
      }
      break;
  }
}
3.1.2 Container启动方法
构建一个ContainerLaunch对象,然后交由containerLauncher线程池启动
// Look up the Application that owns this container via the ContainerId.
Application app =
context.getApplications().get(
containerId.getApplicationAttemptId().getApplicationId());
// Build the ContainerLaunch object and hand it to the thread pool for execution.
ContainerLaunch launch =
new ContainerLaunch(context, getConfig(), dispatcher, exec, app,
event.getContainer(), dirsHandler, containerManager);
containerLauncher.submit(launch);
// Record the launch under its ContainerId in the running map.
running.put(containerId, launch);
3.2 ContainerExecutor方法
3.2.1 setConf()方法
为参数conf赋值,并加载白名单whitelistVars列表
/**
 * Stores the configuration and, when it is non-null, reloads the settings
 * derived from it: the environment-variable whitelist (split on commas) and
 * the exit-code file timeout.
 *
 * @param conf the configuration to use; may be {@code null}, in which case
 *     only the reference is stored
 */
public void setConf(Configuration conf) {
  this.conf = conf;
  if (conf == null) {
    return;
  }
  // Comma-separated list of whitelisted environment variable names.
  String whitelist = conf.get(YarnConfiguration.NM_ENV_WHITELIST,
      YarnConfiguration.DEFAULT_NM_ENV_WHITELIST);
  whitelistVars = whitelist.split(",");
  exitCodeFileTimeout = conf.getInt(
      YarnConfiguration.NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT,
      YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT);
}
3.2.2 getRunCommandForOther()方法
返回命令行以在OS shell中执行给定命令。
/**
 * Builds the argument vector that runs {@code command} through {@code bash}.
 *
 * <p>When the scheduling-priority key is present in {@code config}, the
 * command is prefixed with {@code nice -n <priority>}; otherwise no prefix is
 * added.
 *
 * @param command the command to hand to bash
 * @param config configuration consulted for the scheduling priority
 * @return the command line as a String array
 */
protected String[] getRunCommandForOther(String command,
    Configuration config) {
  List<String> argv = new ArrayList<>();
  // Only an explicitly configured priority produces the "nice" prefix.
  String configuredPriority =
      config.get(YarnConfiguration.NM_CONTAINER_EXECUTOR_SCHED_PRIORITY);
  if (configuredPriority != null) {
    int niceness = config.getInt(
        YarnConfiguration.NM_CONTAINER_EXECUTOR_SCHED_PRIORITY,
        YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_SCHED_PRIORITY);
    argv.add("nice");
    argv.add("-n");
    argv.add(Integer.toString(niceness));
  }
  argv.add("bash");
  argv.add(command);
  return argv.toArray(new String[argv.size()]);
}
3.2.3 DelayedProcessKiller类
继承Thread类,在ContainerExecutor中定义,此类将在指定的延迟后向目标容器发送信号。
run()方法
/**
 * Sleeps for {@code delay}, then asks the container executor to send
 * {@code signal} to process {@code pid} of the target container.
 *
 * <p>If the sleep is interrupted, no signal is sent and the interrupt status
 * is restored. An {@link IOException} from signaling is logged as a warning
 * and reported to the container as a diagnostics update.
 */
@Override
public void run() {
  try {
    Thread.sleep(delay);
    // Delay elapsed: deliver the signal through the container executor.
    containerExecutor.signalContainer(new ContainerSignalContext.Builder()
        .setContainer(container)
        .setUser(user)
        .setPid(pid)
        .setSignal(signal)
        .build());
  } catch (InterruptedException e) {
    // Restore the flag on the thread that was actually interrupted; this is
    // also correct if run() is invoked directly rather than via start().
    Thread.currentThread().interrupt();
  } catch (IOException e) {
    // Record the failure in the log and in the container's diagnostics.
    String message = "Exception when user " + user + " killing task " + pid
        + " in DelayedProcessKiller: " + StringUtils.stringifyException(e);
    LOG.warn(message);
    container.handle(new ContainerDiagnosticsUpdateEvent(
        container.getContainerId(), message));
  }
}
}
resolveSymLinks()方法:根据资源映射解析出需要创建的符号链接(链接目标 → 链接名),并将通配符路径展开为目录下的所有文件。
/**
 * Builds the map of symlinks to create from a resource map.
 *
 * <p>Each resource path maps to a list of link names. A link name whose last
 * path component equals {@code WILDCARD} is expanded: every entry returned by
 * {@code readDirAsUser} for the resource directory is linked under its own
 * file name. Any other link name maps the resource path to that name.
 *
 * @param resources resource path mapped to the link names requested for it
 * @param user the user passed to {@code readDirAsUser} for wildcard expansion
 * @return map from link target to link name
 */
private Map<Path, Path> resolveSymLinks(Map<Path,
    List<String>> resources, String user) {
  Map<Path, Path> resolved = new HashMap<>();
  for (Map.Entry<Path, List<String>> entry : resources.entrySet()) {
    Path target = entry.getKey();
    for (String linkName : entry.getValue()) {
      boolean isWildcard = new Path(linkName).getName().equals(WILDCARD);
      if (!isWildcard) {
        resolved.put(target, new Path(linkName));
        continue;
      }
      // Wildcard: link every entry found in the resource directory.
      for (File child : readDirAsUser(user, target)) {
        Path childTarget = new Path(child.toString());
        Path childLink = new Path(child.getName());
        resolved.put(childTarget, childLink);
      }
    }
  }
  return resolved;
}