Source 1: The scheduler's assignment method, FairScheduler.assignTasks()
1. First check whether, without violating the load-balancing constraints, the TaskTracker can take another task: loadMgr.canAssignMap(tracker, runnableMaps, totalMapSlots).
2. Sort the jobs by deficit and hand the TaskTracker a task from the job with the largest deficit.
public synchronized List<Task> assignTasks(TaskTrackerStatus tracker)
throws IOException {
if (!initialized) // Don't try to assign tasks if we haven't yet started up
return null;
// Reload allocations file if it hasn't been loaded in a while
poolMgr.reloadAllocsIfNecessary();
// Compute total runnable maps and reduces
int runnableMaps = 0;
int runnableReduces = 0;
for (JobInProgress job: infos.keySet()) {
runnableMaps += runnableTasks(job, TaskType.MAP);
runnableReduces += runnableTasks(job, TaskType.REDUCE);
}
ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();
// Compute total map/reduce slots
// In the future we can precompute this if the Scheduler becomes a
// listener of tracker join/leave events.
int totalMapSlots = getTotalSlots(TaskType.MAP, clusterStatus);
int totalReduceSlots = getTotalSlots(TaskType.REDUCE, clusterStatus);
// Scan to see whether any job needs to run a map, then a reduce
ArrayList<Task> tasks = new ArrayList<Task>();
TaskType[] types = new TaskType[] {TaskType.MAP, TaskType.REDUCE};
for (TaskType taskType: types) {
boolean canAssign = (taskType == TaskType.MAP) ?
loadMgr.canAssignMap(tracker, runnableMaps, totalMapSlots) :
loadMgr.canAssignReduce(tracker, runnableReduces, totalReduceSlots);
if (canAssign) {
// Figure out the jobs that need this type of task
List<JobInProgress> candidates = new ArrayList<JobInProgress>();
for (JobInProgress job: infos.keySet()) {
if (job.getStatus().getRunState() == JobStatus.RUNNING &&
neededTasks(job, taskType) > 0) {
candidates.add(job);
}
}
// Sort jobs by deficit (for Fair Sharing) or submit time (for FIFO)
Comparator<JobInProgress> comparator = useFifo ?
new FifoJobComparator() : new DeficitComparator(taskType);
Collections.sort(candidates, comparator);
for (JobInProgress job: candidates) {
Task task = (taskType == TaskType.MAP ?
taskSelector.obtainNewMapTask(tracker, job) :
taskSelector.obtainNewReduceTask(tracker, job));
if (task != null) {
// Update the JobInfo for this job so we account for the launched
// tasks during this update interval and don't try to launch more
// tasks than the job needed on future heartbeats
JobInfo info = infos.get(job);
if (taskType == TaskType.MAP) {
info.runningMaps++;
info.neededMaps--;
} else {
info.runningReduces++;
info.neededReduces--;
}
tasks.add(task);
if (!assignMultiple)
return tasks;
break;
}
}
}
}
// If no tasks were found, return null
return tasks.isEmpty() ? null : tasks;
}
Source 2: Sorting jobs, FairScheduler::DeficitComparator.compare()
Compares two jobs by deficit:
1. If one job has not yet met its minimum share guarantee while the other has, the needy job sorts first.
2. If both have met, or both have missed, their minimum share guarantees, the job with the larger deficit sorts first.
public int compare(JobInProgress j1, JobInProgress j2) {
// Put needy jobs ahead of non-needy jobs (where needy means must receive
// new tasks to meet slot minimum), comparing among jobs of the same type
// by deficit so as to put jobs with higher deficit ahead.
JobInfo j1Info = infos.get(j1);
JobInfo j2Info = infos.get(j2);
long deficitDif;
boolean j1Needy, j2Needy;
if (taskType == TaskType.MAP) {
j1Needy = j1.runningMaps() < Math.floor(j1Info.minMaps);
j2Needy = j2.runningMaps() < Math.floor(j2Info.minMaps);
deficitDif = j2Info.mapDeficit - j1Info.mapDeficit;
} else {
j1Needy = j1.runningReduces() < Math.floor(j1Info.minReduces);
j2Needy = j2.runningReduces() < Math.floor(j2Info.minReduces);
deficitDif = j2Info.reduceDeficit - j1Info.reduceDeficit;
}
if (j1Needy && !j2Needy)
return -1;
else if (j2Needy && !j1Needy)
return 1;
else // Both needy or both non-needy; compare by deficit
return (int) Math.signum(deficitDif);
}
}
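The two rules above can be condensed into a standalone sketch. SimpleJob is a hypothetical stand-in for the JobInProgress/JobInfo pair, keeping only the fields the comparator reads:

```java
import java.util.*;

public class DeficitOrderDemo {
    // Minimal stand-in for JobInProgress + JobInfo (hypothetical fields).
    static class SimpleJob {
        final String name;
        final int runningMaps, minMaps;  // needy if runningMaps < minMaps
        final long mapDeficit;
        SimpleJob(String name, int runningMaps, int minMaps, long mapDeficit) {
            this.name = name; this.runningMaps = runningMaps;
            this.minMaps = minMaps; this.mapDeficit = mapDeficit;
        }
    }

    // Same two rules as DeficitComparator.compare(): needy jobs sort first,
    // then larger deficit sorts first.
    static final Comparator<SimpleJob> BY_DEFICIT = (j1, j2) -> {
        boolean j1Needy = j1.runningMaps < j1.minMaps;
        boolean j2Needy = j2.runningMaps < j2.minMaps;
        if (j1Needy && !j2Needy) return -1;
        if (j2Needy && !j1Needy) return 1;
        return (int) Math.signum(j2.mapDeficit - j1.mapDeficit);
    };

    static List<String> order(List<SimpleJob> jobs) {
        List<SimpleJob> sorted = new ArrayList<>(jobs);
        sorted.sort(BY_DEFICIT);
        List<String> names = new ArrayList<>();
        for (SimpleJob j : sorted) names.add(j.name);
        return names;
    }

    public static void main(String[] args) {
        List<SimpleJob> jobs = Arrays.asList(
            new SimpleJob("a", 5, 2, 9000),  // not needy, large deficit
            new SimpleJob("b", 1, 4, 100),   // needy -> sorts first
            new SimpleJob("c", 3, 3, 5000)); // not needy, medium deficit
        System.out.println(order(jobs));     // [b, a, c]
    }
}
```

Job "b" wins despite its tiny deficit because meeting minimum share guarantees takes priority over deficit size.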
Source 3: Updating job information, UpdateThread.run()
public void run() {
while (running) {
try {
Thread.sleep(UPDATE_INTERVAL);
update();
} catch (Exception e) {
LOG.error("Failed to update fair share calculations", e);
}
}
}
update() performs the following steps, in the order the code calls them:
*1 Update deficits, using the time elapsed since the last update
*2 Update job runnability
*3 Update task counts
*4 Update weights
*5 Update minimum shares
*6 Update fair shares
protected void update() {
// Use more granular locking so that clusterStatus can be fetched from the JobTracker.
ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();
// Got clusterStatus hence acquiring scheduler lock now
// Remove non-running jobs
synchronized(this){
List<JobInProgress> toRemove = new ArrayList<JobInProgress>();
for (JobInProgress job: infos.keySet()) {
int runState = job.getStatus().getRunState();
if (runState == JobStatus.SUCCEEDED || runState == JobStatus.FAILED
|| runState == JobStatus.KILLED) {
toRemove.add(job);
}
}
for (JobInProgress job: toRemove) {
infos.remove(job);
poolMgr.removeJob(job);
}
// Update running jobs with deficits since last update, and compute new
// slot allocations, weight, shares and task counts
long now = clock.getTime();
long timeDelta = now - lastUpdateTime;
updateDeficits(timeDelta);
updateRunnability();
updateTaskCounts();
updateWeights();
updateMinSlots();
updateFairShares(clusterStatus);
lastUpdateTime = now;
}
}
Source 4: Updating job runnability, FairScheduler.updateRunnability()
Every user and every pool has a limit on the number of jobs it may run. A job is runnable (i.e., eligible for scheduling) only if running it would exceed neither limit.
private void updateRunnability() {
// Start by marking everything as not runnable
for (JobInfo info: infos.values()) {
info.runnable = false;
}
// Create a list of sorted jobs in order of start time and priority
List<JobInProgress> jobs = new ArrayList<JobInProgress>(infos.keySet());
Collections.sort(jobs, new FifoJobComparator());
// Mark jobs as runnable in order of start time and priority, until
// user or pool limits have been reached.
Map<String, Integer> userJobs = new HashMap<String, Integer>();
Map<String, Integer> poolJobs = new HashMap<String, Integer>();
for (JobInProgress job: jobs) {
if (job.getStatus().getRunState() == JobStatus.RUNNING) {
String user = job.getJobConf().getUser();
String pool = poolMgr.getPoolName(job);
int userCount = userJobs.containsKey(user) ? userJobs.get(user) : 0;
int poolCount = poolJobs.containsKey(pool) ? poolJobs.get(pool) : 0;
if (userCount < poolMgr.getUserMaxJobs(user) &&
poolCount < poolMgr.getPoolMaxJobs(pool)) {
infos.get(job).runnable = true;
userJobs.put(user, userCount + 1);
poolJobs.put(pool, poolCount + 1);
}
}
}
}
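The counting logic can be isolated in a small sketch. Here each job is reduced to a hypothetical (user, pool) pair, and the input array is assumed to be pre-sorted the way FifoJobComparator would sort it:

```java
import java.util.*;

public class RunnabilityDemo {
    // Returns the indices of the jobs that would be marked runnable, given
    // per-user and per-pool job caps (absent entries mean "no limit").
    static List<Integer> runnableIndices(String[][] jobs,
                                         Map<String, Integer> userMax,
                                         Map<String, Integer> poolMax) {
        Map<String, Integer> userJobs = new HashMap<>();
        Map<String, Integer> poolJobs = new HashMap<>();
        List<Integer> runnable = new ArrayList<>();
        for (int i = 0; i < jobs.length; i++) {
            String user = jobs[i][0], pool = jobs[i][1];
            int u = userJobs.getOrDefault(user, 0);
            int p = poolJobs.getOrDefault(pool, 0);
            // A job runs only if neither its user's nor its pool's cap is hit.
            if (u < userMax.getOrDefault(user, Integer.MAX_VALUE)
                && p < poolMax.getOrDefault(pool, Integer.MAX_VALUE)) {
                runnable.add(i);
                userJobs.put(user, u + 1);
                poolJobs.put(pool, p + 1);
            }
        }
        return runnable;
    }

    public static void main(String[] args) {
        String[][] jobs = { {"alice", "prod"}, {"alice", "prod"},
                            {"bob", "prod"},   {"alice", "adhoc"} };
        // alice may run at most 2 jobs; pool "prod" may run at most 2 jobs.
        // Job 2 is blocked by the pool cap, job 3 by the user cap.
        System.out.println(runnableIndices(jobs,
            Map.of("alice", 2), Map.of("prod", 2))); // [0, 1]
    }
}
```

Because the jobs are walked in sorted order, higher-priority and earlier jobs consume the caps first, which is exactly why the sort matters.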
The sort order matters when computing runnability; the comparator compares job priority first, then job start time.
public class FifoJobComparator implements Comparator<JobInProgress> {
public int compare(JobInProgress j1, JobInProgress j2) {
int res = j1.getPriority().compareTo(j2.getPriority());
if (res == 0) {
if (j1.getStartTime() < j2.getStartTime()) {
res = -1;
} else {
res = (j1.getStartTime() == j2.getStartTime() ? 0 : 1);
}
}
if (res == 0) {
res = j1.hashCode() - j2.hashCode();
}
return res;
}
}
Source 5: Updating task counts, FairScheduler.updateTaskCounts()
A job's total map count (totalMaps) breaks down into finished maps (finishedMaps), running maps (runningMaps), and maps still waiting to be scheduled (totalMaps - runningMaps - finishedMaps).
private void updateTaskCounts() {
for (Map.Entry<JobInProgress, JobInfo> entry: infos.entrySet()) {
JobInProgress job = entry.getKey();
JobInfo info = entry.getValue();
if (job.getStatus().getRunState() != JobStatus.RUNNING)
continue; // Job is still in PREP state and tasks aren't initialized
// Count maps
int totalMaps = job.numMapTasks;
int finishedMaps = 0;
int runningMaps = 0;
for (TaskInProgress tip: job.getMapTasks()) {
if (tip.isComplete()) {
finishedMaps += 1;
} else if (tip.isRunning()) {
runningMaps += tip.getActiveTasks().size();
}
}
info.runningMaps = runningMaps;
info.neededMaps = (totalMaps - runningMaps - finishedMaps
+ taskSelector.neededSpeculativeMaps(job));
// Count reduces
int totalReduces = job.numReduceTasks;
int finishedReduces = 0;
int runningReduces = 0;
for (TaskInProgress tip: job.getReduceTasks()) {
if (tip.isComplete()) {
finishedReduces += 1;
} else if (tip.isRunning()) {
runningReduces += tip.getActiveTasks().size();
}
}
info.runningReduces = runningReduces;
if (enoughMapsFinishedToRunReduces(finishedMaps, totalMaps)) {
info.neededReduces = (totalReduces - runningReduces - finishedReduces
+ taskSelector.neededSpeculativeReduces(job));
} else {
info.neededReduces = 0;
}
// If the job was marked as not runnable due to its user or pool having
// too many active jobs, set the neededMaps/neededReduces to 0. We still
// count runningMaps/runningReduces however so we can give it a deficit.
if (!info.runnable) {
info.neededMaps = 0;
info.neededReduces = 0;
}
}
}
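As a quick check of the arithmetic (with speculative tasks assumed to be 0 for simplicity):

```java
public class TaskCountDemo {
    // neededMaps as computed in updateTaskCounts, speculative tasks included.
    static int neededMaps(int totalMaps, int runningMaps, int finishedMaps,
                          int speculative) {
        return totalMaps - runningMaps - finishedMaps + speculative;
    }

    public static void main(String[] args) {
        // 10 maps total: 4 done, 3 running -> 3 still waiting to be scheduled.
        System.out.println(neededMaps(10, 3, 4, 0)); // 3
    }
}
```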
Source 6: Updating weights, FairScheduler.updateWeights()
First, a raw weight is computed for each job.
Then each job's weight is normalized within its pool: info.mapWeight *= (poolWeight / mapWeightSum), where poolWeight is the pool's weight from the Hadoop configuration file and mapWeightSum is the sum of the raw weights of the jobs in that pool.
private void updateWeights() {
// First, calculate raw weights for each job
for (Map.Entry<JobInProgress, JobInfo> entry: infos.entrySet()) {
JobInProgress job = entry.getKey();
JobInfo info = entry.getValue();
info.mapWeight = calculateRawWeight(job, TaskType.MAP);
info.reduceWeight = calculateRawWeight(job, TaskType.REDUCE);
}
// Now calculate job weight sums for each pool
Map<String, Double> mapWeightSums = new HashMap<String, Double>();
Map<String, Double> reduceWeightSums = new HashMap<String, Double>();
for (Pool pool: poolMgr.getPools()) {
double mapWeightSum = 0;
double reduceWeightSum = 0;
for (JobInProgress job: pool.getJobs()) {
if (isRunnable(job)) {
if (runnableTasks(job, TaskType.MAP) > 0) {
mapWeightSum += infos.get(job).mapWeight;
}
if (runnableTasks(job, TaskType.REDUCE) > 0) {
reduceWeightSum += infos.get(job).reduceWeight;
}
}
}
mapWeightSums.put(pool.getName(), mapWeightSum);
reduceWeightSums.put(pool.getName(), reduceWeightSum);
}
// And normalize the weights based on pool sums and pool weights
// to share fairly across pools (proportional to their weights)
for (Map.Entry<JobInProgress, JobInfo> entry: infos.entrySet()) {
JobInProgress job = entry.getKey();
JobInfo info = entry.getValue();
String pool = poolMgr.getPoolName(job);
double poolWeight = poolMgr.getPoolWeight(pool);
double mapWeightSum = mapWeightSums.get(pool);
double reduceWeightSum = reduceWeightSums.get(pool);
if (mapWeightSum == 0)
info.mapWeight = 0;
else
info.mapWeight *= (poolWeight / mapWeightSum);
if (reduceWeightSum == 0)
info.reduceWeight = 0;
else
info.reduceWeight *= (poolWeight / reduceWeightSum);
}
}
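The normalization step can be sketched on its own. After scaling, the jobs' weights sum to the pool's configured weight while preserving their relative ratios:

```java
import java.util.*;

public class WeightNormDemo {
    // Normalize raw job weights within a pool so they sum to the pool's
    // configured weight, mirroring info.mapWeight *= poolWeight / weightSum.
    static double[] normalize(double[] rawWeights, double poolWeight) {
        double sum = 0;
        for (double w : rawWeights) sum += w;
        double[] out = new double[rawWeights.length];
        if (sum == 0) return out; // same zero-sum guard as updateWeights()
        for (int i = 0; i < rawWeights.length; i++) {
            out[i] = rawWeights[i] * (poolWeight / sum);
        }
        return out;
    }

    public static void main(String[] args) {
        // Two jobs with raw weights 1.0 and 3.0 in a pool of weight 2.0:
        // they end up with 0.5 and 1.5, preserving their 1:3 ratio.
        System.out.println(Arrays.toString(normalize(new double[]{1.0, 3.0}, 2.0)));
        // [0.5, 1.5]
    }
}
```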
Raw weight calculation
A job's base weight is 1.0. If the Hadoop configuration enables size-based weighting, the weight becomes Math.log1p(runnableTasks(job, taskType)) / Math.log(2), where runnableTasks(job, taskType) is the number of the job's tasks that are running plus those still waiting to be scheduled. The weight is then scaled by a factor derived from the job's priority, and if a user-supplied weight-adjuster class is configured, it gets a final chance to adjust the weight.
private double calculateRawWeight(JobInProgress job, TaskType taskType) {
if (!isRunnable(job)) {
return 0;
} else {
double weight = 1.0;
if (sizeBasedWeight) {
// Set weight based on runnable tasks
weight = Math.log1p(runnableTasks(job, taskType)) / Math.log(2);
}
weight *= getPriorityFactor(job.getPriority());
if (weightAdjuster != null) {
// Run weight through the user-supplied weightAdjuster
weight = weightAdjuster.adjustWeight(job, taskType, weight);
}
return weight;
}
}
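The size-based formula is just log2(1 + runnableTasks). A tiny worked example:

```java
public class RawWeightDemo {
    // Size-based raw weight: log2(1 + runnableTasks), as in calculateRawWeight.
    static double sizeBasedWeight(int runnableTasks) {
        return Math.log1p(runnableTasks) / Math.log(2);
    }

    public static void main(String[] args) {
        // 7 runnable tasks -> log2(8) ≈ 3.0. The logarithm favors larger jobs
        // while preventing a huge job from completely dwarfing small ones:
        // 1023 runnable tasks still yield a weight of only ≈ 10.
        System.out.println(sizeBasedWeight(7));
    }
}
```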
Source 7: Updating minimum shares, FairScheduler.updateMinSlots()
For each pool:
1. Get the pool's slot allocation from the configuration file.
2. Compute the total weight of the jobs in the pool.
3. Each job's minimum share is the pool's slot allocation times the job's fraction of the total weight, rounded down.
4. Hand out the minimum shares:
int share = (int) Math.floor(oldSlots * weight / totalWeight);
slotsLeft = giveMinSlots(job, type, slotsLeft, share);
If a full pass leaves slotsLeft unchanged, the strategy switches: the round-down in step 3 becomes a round-up, and before that pass the jobs are sorted by weight (ties broken by deficit).
giveMinSlots guarantees that a job never receives more slots than its number of runnable tasks.
private void updateMinSlots() {
// Clear old minSlots
for (JobInfo info: infos.values()) {
info.minMaps = 0;
info.minReduces = 0;
}
// For each pool, distribute its task allocation among jobs in it that need
// slots. This is a little tricky since some jobs in the pool might not be
// able to use all the slots, e.g. they might have only a few tasks left.
// To deal with this, we repeatedly split up the available task slots
// between the jobs left, give each job min(its alloc, # of slots it needs),
// and redistribute any slots that are left over between jobs that still
// need slots on the next pass. If, in total, the jobs in our pool don't
// need all its allocation, we leave the leftover slots for general use.
PoolManager poolMgr = getPoolManager();
for (Pool pool: poolMgr.getPools()) {
for (final TaskType type: TaskType.values()) {
Set<JobInProgress> jobs = new HashSet<JobInProgress>(pool.getJobs());
int slotsLeft = poolMgr.getAllocation(pool.getName(), type);
// Keep assigning slots until none are left
while (slotsLeft > 0) {
// Figure out total weight of jobs that still need slots
double totalWeight = 0;
for (Iterator<JobInProgress> it = jobs.iterator(); it.hasNext();) {
JobInProgress job = it.next();
if (isRunnable(job) &&
runnableTasks(job, type) > minTasks(job, type)) {
totalWeight += weight(job, type);
} else {
it.remove();
}
}
if (totalWeight == 0) // No jobs that can use more slots are left
break;
// Assign slots to jobs, using the floor of their weight divided by
// total weight. This ensures that all jobs get some chance to take
// a slot. Then, if no slots were assigned this way, we do another
// pass where we use ceil, in case some slots were still left over.
int oldSlots = slotsLeft; // Copy slotsLeft so we can modify it
for (JobInProgress job: jobs) {
double weight = weight(job, type);
int share = (int) Math.floor(oldSlots * weight / totalWeight);
slotsLeft = giveMinSlots(job, type, slotsLeft, share);
}
if (slotsLeft == oldSlots) {
// No tasks were assigned; do another pass using ceil, giving the
// extra slots to jobs in order of weight then deficit
List<JobInProgress> sortedJobs = new ArrayList<JobInProgress>(jobs);
Collections.sort(sortedJobs, new Comparator<JobInProgress>() {
public int compare(JobInProgress j1, JobInProgress j2) {
double dif = weight(j2, type) - weight(j1, type);
if (dif == 0) // Weights are equal, compare by deficit
dif = deficit(j2, type) - deficit(j1, type);
return (int) Math.signum(dif);
}
});
for (JobInProgress job: sortedJobs) {
double weight = weight(job, type);
int share = (int) Math.ceil(oldSlots * weight / totalWeight);
slotsLeft = giveMinSlots(job, type, slotsLeft, share);
}
if (slotsLeft > 0) {
LOG.warn("Had slotsLeft = " + slotsLeft + " after the final "
+ "loop in updateMinSlots. This probably means some fair "
+ "scheduler weights are being set to NaN or Infinity.");
}
break;
}
}
}
}
}
The minimum-share assignment routine; it guarantees that the slots given to a job never exceed its runnable task count.
private int giveMinSlots(JobInProgress job, TaskType type,
int slotsLeft, int slotsToGive) {
int runnable = runnableTasks(job, type);
int curMin = minTasks(job, type);
slotsToGive = Math.min(Math.min(slotsLeft, runnable - curMin), slotsToGive);
slotsLeft -= slotsToGive;
JobInfo info = infos.get(job);
if (type == TaskType.MAP)
info.minMaps += slotsToGive;
else
info.minReduces += slotsToGive;
return slotsLeft;
}
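The multi-pass redistribution is easier to see in a condensed, array-based re-implementation (a sketch, not the real code; jobs are identified by index and assumed runnable):

```java
import java.util.*;

public class MinSlotsDemo {
    // Simplified version of the floor pass + ceil fallback in updateMinSlots.
    // Job i has weight[i] and runnable[i] tasks; returns minSlots per job.
    static int[] distribute(int poolSlots, double[] weight, int[] runnable) {
        int n = weight.length;
        int[] min = new int[n];
        int slotsLeft = poolSlots;
        while (slotsLeft > 0) {
            double totalWeight = 0;
            for (int i = 0; i < n; i++)
                if (runnable[i] > min[i]) totalWeight += weight[i];
            if (totalWeight == 0) break;            // nobody can use more slots
            int oldSlots = slotsLeft;
            for (int i = 0; i < n; i++) {
                if (runnable[i] <= min[i]) continue;
                int share = (int) Math.floor(oldSlots * weight[i] / totalWeight);
                slotsLeft = give(i, min, runnable, slotsLeft, share);
            }
            if (slotsLeft == oldSlots) {            // floor pass assigned nothing
                for (int i = 0; i < n; i++) {       // ceil pass (unsorted here)
                    if (runnable[i] <= min[i]) continue;
                    int share = (int) Math.ceil(oldSlots * weight[i] / totalWeight);
                    slotsLeft = give(i, min, runnable, slotsLeft, share);
                }
                break;
            }
        }
        return min;
    }

    // giveMinSlots: never exceed slotsLeft or the job's remaining runnable tasks.
    static int give(int i, int[] min, int[] runnable, int slotsLeft, int toGive) {
        toGive = Math.min(Math.min(slotsLeft, runnable[i] - min[i]), toGive);
        min[i] += toGive;
        return slotsLeft - toGive;
    }

    public static void main(String[] args) {
        // Pool of 10 slots, equal weights, but job 0 only has 2 runnable tasks:
        // it is capped at 2, and the leftover flows to job 1 on the next pass.
        int[] min = distribute(10, new double[]{1.0, 1.0}, new int[]{2, 20});
        System.out.println(Arrays.toString(min)); // [2, 8]
    }
}
```

The example shows why the loop exists at all: a naive one-shot split would give each job 5 slots and strand 3 of them on the small job.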
Source 8: Updating fair shares, FairScheduler.updateFairShares()
1. Get the cluster's total slot count.
2. Split the slots among jobs in proportion to their cluster-wide weights; this divides the jobs into two classes, those whose fair share falls below their minimum share and those whose fair share meets or exceeds it.
3. Jobs in the first class are simply given their minimum share.
4. Only after every first-class job has been satisfied are the remaining slots divided among the second class.
private void updateFairShares(ClusterStatus clusterStatus) {
// Clear old fairShares
for (JobInfo info: infos.values()) {
info.mapFairShare = 0;
info.reduceFairShare = 0;
}
// Assign new shares, based on weight and minimum share. This is done
// as follows. First, we split up the available slots between all
// jobs according to weight. Then if there are any jobs whose minSlots is
// larger than their fair allocation, we give them their minSlots and
// remove them from the list, and start again with the amount of slots
// left over. This continues until all jobs' minSlots are less than their
// fair allocation, and at this point we know that we've met everyone's
// guarantee and we've split the excess capacity fairly among jobs left.
for (TaskType type: TaskType.values()) {
// Select only jobs that still need this type of task
HashSet<JobInfo> jobsLeft = new HashSet<JobInfo>();
for (Entry<JobInProgress, JobInfo> entry: infos.entrySet()) {
JobInProgress job = entry.getKey();
JobInfo info = entry.getValue();
if (isRunnable(job) && runnableTasks(job, type) > 0) {
jobsLeft.add(info);
}
}
double slotsLeft = getTotalSlots(type, clusterStatus);
while (!jobsLeft.isEmpty()) {
double totalWeight = 0;
for (JobInfo info: jobsLeft) {
double weight = (type == TaskType.MAP ?
info.mapWeight : info.reduceWeight);
totalWeight += weight;
}
boolean recomputeSlots = false;
double oldSlots = slotsLeft; // Copy slotsLeft so we can modify it
for (Iterator<JobInfo> iter = jobsLeft.iterator(); iter.hasNext();) {
JobInfo info = iter.next();
double minSlots = (type == TaskType.MAP ?
info.minMaps : info.minReduces);
double weight = (type == TaskType.MAP ?
info.mapWeight : info.reduceWeight);
double fairShare = weight / totalWeight * oldSlots;
if (minSlots > fairShare) {
// Job needs more slots than its fair share; give it its minSlots,
// remove it from the list, and set recomputeSlots = true to
// remember that we must loop again to redistribute unassigned slots
if (type == TaskType.MAP)
info.mapFairShare = minSlots;
else
info.reduceFairShare = minSlots;
slotsLeft -= minSlots;
iter.remove();
recomputeSlots = true;
}
}
if (!recomputeSlots) {
// All minimums are met. Give each job its fair share of excess slots.
for (JobInfo info: jobsLeft) {
double weight = (type == TaskType.MAP ?
info.mapWeight : info.reduceWeight);
double fairShare = weight / totalWeight * oldSlots;
if (type == TaskType.MAP)
info.mapFairShare = fairShare;
else
info.reduceFairShare = fairShare;
}
break;
}
}
}
}
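The same iterative scheme, stripped to arrays for one task type (a sketch under the assumption that all jobs are runnable and need this task type):

```java
import java.util.*;

public class FairShareDemo {
    // Simplified updateFairShares: jobs whose minSlots exceed their
    // weight-proportional share get exactly minSlots and drop out; the
    // rest split what remains in proportion to weight.
    static double[] fairShares(double totalSlots, double[] weight, double[] minSlots) {
        int n = weight.length;
        double[] share = new double[n];
        Set<Integer> left = new HashSet<>();
        for (int i = 0; i < n; i++) left.add(i);
        double slotsLeft = totalSlots;
        while (!left.isEmpty()) {
            double totalWeight = 0;
            for (int i : left) totalWeight += weight[i];
            boolean recompute = false;
            double oldSlots = slotsLeft;
            for (Iterator<Integer> it = left.iterator(); it.hasNext();) {
                int i = it.next();
                double fair = weight[i] / totalWeight * oldSlots;
                if (minSlots[i] > fair) {          // guarantee beats fair share
                    share[i] = minSlots[i];
                    slotsLeft -= minSlots[i];
                    it.remove();
                    recompute = true;
                }
            }
            if (!recompute) {                      // all guarantees met: split rest
                for (int i : left) share[i] = weight[i] / totalWeight * oldSlots;
                break;
            }
        }
        return share;
    }

    public static void main(String[] args) {
        // 100 slots, equal weights, but job 0 is guaranteed 60 slots:
        // it takes its 60, and jobs 1 and 2 split the remaining 40 evenly.
        double[] s = fairShares(100, new double[]{1, 1, 1}, new double[]{60, 0, 0});
        System.out.println(Arrays.toString(s)); // [60.0, 20.0, 20.0]
    }
}
```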
Source 9: Updating deficits, FairScheduler.updateDeficits()
A job's deficit is the share it should have received minus the share it actually received, accumulated over time. info.mapFairShare holds the share (slot count) the fair-sharing algorithm says the job deserves, while info.runningMaps is what it actually holds (the number of slots it occupies).
private void updateDeficits(long timeDelta) {
for (JobInfo info: infos.values()) {
info.mapDeficit +=
(info.mapFairShare - info.runningMaps) * timeDelta;
info.reduceDeficit +=
(info.reduceFairShare - info.runningReduces) * timeDelta;
}
}
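One update cycle of the accumulation, as a worked example (deficit truncated to a long here for simplicity):

```java
public class DeficitDemo {
    // Deficit grows by (fairShare - runningTasks) * elapsedMillis each update,
    // so its unit is slot-milliseconds.
    static long accumulate(long deficit, double fairShare, int running, long dtMs) {
        return deficit + (long) ((fairShare - running) * dtMs);
    }

    public static void main(String[] args) {
        // A job owed 4 slots but running only 1 task accrues deficit at
        // 3 slot-ms per ms; after a 500 ms interval it is owed 1500 slot-ms.
        // A job holding more than its fair share accrues negative deficit.
        System.out.println(accumulate(0, 4.0, 1, 500)); // 1500
    }
}
```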