MesosSchedulerDriver的代码在src/sched/sched.cpp里面实现。
Driver->run()调用start():
首先检测Mesos-Master的leader,
然后创建一个线程。
SchedulerProcess的initialize()函数里面主要注册各类消息的处理函数,最后开始检测Master的leader:
-
virtual void initialize()
-
{
-
install<Event>(&SchedulerProcess::receive);
-
-
// TODO(benh): Get access to flags so that we can decide whether
-
// or not to make ZooKeeper verbose.
-
install<FrameworkRegisteredMessage>(
-
&SchedulerProcess::registered,
-
&FrameworkRegisteredMessage::framework_id,
-
&FrameworkRegisteredMessage::master_info);
-
-
install<FrameworkReregisteredMessage>(
-
&SchedulerProcess::reregistered,
-
&FrameworkReregisteredMessage::framework_id,
-
&FrameworkReregisteredMessage::master_info);
-
-
install<ResourceOffersMessage>(
-
&SchedulerProcess::resourceOffers,
-
&ResourceOffersMessage::offers,
-
&ResourceOffersMessage::pids);
-
-
install<RescindResourceOfferMessage>(
-
&SchedulerProcess::rescindOffer,
-
&RescindResourceOfferMessage::offer_id);
-
-
install<StatusUpdateMessage>(
-
&SchedulerProcess::statusUpdate,
-
&StatusUpdateMessage::update,
-
&StatusUpdateMessage::pid);
-
-
install<LostSlaveMessage>(
-
&SchedulerProcess::lostSlave,
-
&LostSlaveMessage::slave_id);
-
-
install<ExitedExecutorMessage>(
-
&SchedulerProcess::lostExecutor,
-
&ExitedExecutorMessage::executor_id,
-
&ExitedExecutorMessage::slave_id,
-
&ExitedExecutorMessage::status);
-
-
install<ExecutorToFrameworkMessage>(
-
&SchedulerProcess::frameworkMessage,
-
&ExecutorToFrameworkMessage::slave_id,
-
&ExecutorToFrameworkMessage::executor_id,
-
&ExecutorToFrameworkMessage::data);
-
-
install<FrameworkErrorMessage>(
-
&SchedulerProcess::error,
-
&FrameworkErrorMessage::message);
-
-
// Start detecting masters.
-
detector->detect()
-
.onAny(defer(self(), &SchedulerProcess::detected, lambda::_1));
-
}
|
在前面的文章《Mesos源码分析(6): Mesos Master的初始化》中提到,
Allocator的initialize函数中,传入的OfferCallback是Master::offer。
每过allocation_interval,Allocator都会计算每个framework的offer,然后依次调用Master::offer,将资源offer给相应的framework。
在Master::offer函数中,会生成ResourceOffersMessage,并且发送给Framework。
对应到这里,当Driver收到ResourceOffersMessage消息的时候,会调用SchedulerProcess::resourceOffers:
-
void resourceOffers(
-
const UPID& from,
-
const vector<Offer>& offers,
-
const vector<string>& pids)
-
{
-
……
-
VLOG(2) << "Received " << offers.size() << " offers";
-
……
-
scheduler->resourceOffers(driver, offers);
-
-
VLOG(1) << "Scheduler::resourceOffers took " << stopwatch.elapsed();
-
}
|
最终调用了Framework的resourceOffers。
Test Framework的resourceOffers函数,根据得到的offers,创建一系列tasks,然后调用driver的launchTasks函数:
-
// Scheduler callback of the test framework.
//
// For every offer, keeps creating tasks while fewer than `totalTasks`
// have been launched and the offer's remaining resources still contain
// the per-task requirement, then launches the collected tasks on that
// offer through `driver` (an empty task list is passed as well when
// nothing fits).
virtual void resourceOffers(SchedulerDriver* driver,
                            const vector<Offer>& offers)
{
  for (const Offer& offer : offers) {
    cout << "Received offer " << offer.id() << " with " << offer.resources()
         << endl;

    // Per-task requirement; being a function-local static it is parsed
    // only the first time execution reaches this line.
    static const Resources TASK_RESOURCES = Resources::parse(
        "cpus:" + stringify(CPUS_PER_TASK) +
        ";mem:" + stringify(MEM_PER_TASK)).get();

    // What is still unclaimed within this offer.
    Resources available = offer.resources();

    // Launch tasks.
    vector<TaskInfo> batch;

    for (;;) {
      // Stop once enough tasks were launched ...
      if (!(tasksLaunched < totalTasks)) {
        break;
      }

      // ... or the rest of the offer cannot hold another task.
      if (!available.flatten().contains(TASK_RESOURCES)) {
        break;
      }

      int taskId = tasksLaunched++;

      cout << "Launching task " << taskId << " using offer "
           << offer.id() << endl;

      // The textual id is used both in the task name and as the task id.
      const string taskIdText = lexical_cast<string>(taskId);

      TaskInfo task;
      task.set_name("Task " + taskIdText);
      task.mutable_task_id()->set_value(taskIdText);
      task.mutable_slave_id()->MergeFrom(offer.slave_id());
      task.mutable_executor()->MergeFrom(executor);

      // Pick the concrete resources for this task out of what is left.
      Option<Resources> picked =
        available.find(TASK_RESOURCES.flatten(role));

      CHECK_SOME(picked);

      task.mutable_resources()->MergeFrom(picked.get());
      available -= picked.get();

      batch.push_back(task);
    }

    driver->launchTasks(offer.id(), batch);
  }
}
|
SchedulerProcess的launchTasks函数实现如下:
-
void launchTasks(const vector<OfferID>& offerIds,
-
const vector<TaskInfo>& tasks,
-
const Filters& filters)
-
{
-
Offer::Operation operation;
-
operation.set_type(Offer::Operation::LAUNCH);
-
-
Offer::Operation::Launch* launch = operation.mutable_launch();
-
foreach (const TaskInfo& task, tasks) {
-
launch->add_task_infos()->CopyFrom(task);
-
}
-
-
acceptOffers(offerIds, {operation}, filters);
-
}
-
-
void acceptOffers(
-
const vector<OfferID>& offerIds,
-
const vector<Offer::Operation>& operations,
-
const Filters& filters)
-
{
-
// TODO(jieyu): Move all driver side verification to master since
-
// we are moving towards supporting pure launguage scheduler.
-
-
if (!connected) {
-
VLOG(1) << "Ignoring accept offers message as master is disconnected";
-
-
// NOTE: Reply to the framework with TASK_LOST messages for each
-
// task launch. See details from notes in launchTasks.
-
foreach (const Offer::Operation& operation, operations) {
-
if (operation.type() != Offer::Operation::LAUNCH) {
-
continue;
-
}
-
-
foreach (const TaskInfo& task, operation.launch().task_infos()) {
-
StatusUpdate update = protobuf::createStatusUpdate(
-
framework.id(),
-
None(),
-
task.task_id(),
-
TASK_LOST,
-
TaskStatus::SOURCE_MASTER,
-
None(),
-
"Master disconnected",
-
TaskStatus::REASON_MASTER_DISCONNECTED);
-
-
statusUpdate(UPID(), update, UPID());
-
}
-
}
-
return;
-
}
-
-
Call call;
-
CHECK(framework.has_id());
-
call.mutable_framework_id()->CopyFrom(framework.id());
-
call.set_type(Call::ACCEPT);
-
-
Call::Accept* accept = call.mutable_accept();
-
-
// Setting accept.operations.
-
foreach (const Offer::Operation& _operation, operations) {
-
Offer::Operation* operation = accept->add_operations();
-
operation->CopyFrom(_operation);
-
}
-
-
// Setting accept.offer_ids.
-
foreach (const OfferID& offerId, offerIds) {
-
accept->add_offer_ids()->CopyFrom(offerId);
-
-
if (!savedOffers.contains(offerId)) {
-
// TODO(jieyu): A duplicated offer ID could also cause this
-
// warning being printed. Consider refine this message here
-
// and in launchTasks as well.
-
LOG(WARNING) << "Attempting to accept an unknown offer " << offerId;
-
} else {
-
// Keep only the slave PIDs where we run tasks so we can send
-
// framework messages directly.
-
foreach (const Offer::Operation& operation, operations) {
-
if (operation.type() != Offer::Operation::LAUNCH) {
-
continue;
-
}
-
-
foreach (const TaskInfo& task, operation.launch().task_infos()) {
-
const SlaveID& slaveId = task.slave_id();
-
-
if (savedOffers[offerId].contains(slaveId)) {
-
savedSlavePids[slaveId] = savedOffers[offerId][slaveId];
-
} else {
-
LOG(WARNING) << "Attempting to launch task " << task.task_id()
-
<< " with the wrong slave id " << slaveId;
-
}
-
}
-
}
-
}
-
-
// Remove the offer since we saved all the PIDs we might use.
-
savedOffers.erase(offerId);
-
}
-
-
// Setting accept.filters.
-
accept->mutable_filters()->CopyFrom(filters);
-
-
CHECK_SOME(master);
-
send(master.get().pid(), call);
-
}
|
最终通过acceptOffers向Mesos-Master的leader发送类型为Call::ACCEPT的消息,其中携带了launchTasks生成的LAUNCH操作。