/**
 * struct drm_gpu_scheduler - scheduler instance for one hardware ring
 *
 * @ops: backend operations provided by the driver.
 * @hw_submission_limit: the max size of the hardware queue.
 * @timeout: the time (in jiffies, see drm_sched_init()) after which a job is
 *           removed from the scheduler.
 * @name: name of the ring for which this scheduler is being used.
 * @sched_rq: priority wise array of run queues.
 * @wake_up_worker: the wait queue on which the scheduler sleeps until a job
 *                  is ready to be scheduled.
 * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
 *                 waits on this wait queue until all the scheduled jobs are
 *                 finished.
 * @hw_rq_count: the number of jobs currently in the hardware queue.
 * @job_id_count: used to assign a unique id to each job.
 * @thread: the kthread on which the scheduler runs.
 * @ring_mirror_list: the list of jobs which are currently in the job queue.
 * @job_list_lock: lock to protect the ring_mirror_list.
 * @hang_limit: once the number of hangs caused by a job crosses this limit
 *              the job is marked guilty and is no longer considered for
 *              scheduling.
 *
 * One scheduler is implemented for each hardware ring.
 */
/* Scheduler definition: each scheduler owns one array of run queues. */
struct drm_gpu_scheduler {
const struct drm_sched_backend_ops *ops;
uint32_t hw_submission_limit;
long timeout;
const char *name;
/* One run queue per priority level (DRM_SCHED_PRIORITY_MAX == 5 entries). */
struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_MAX];
wait_queue_head_t wake_up_worker;
wait_queue_head_t job_scheduled;
atomic_t hw_rq_count;
atomic64_t job_id_count;
/* The scheduler's main thread; drm_sched_init() starts it on drm_sched_main(). */
struct task_struct *thread;
struct list_head ring_mirror_list;
spinlock_t job_list_lock;
int hang_limit;
};
/**
 * enum drm_sched_priority - scheduling priority levels
 *
 * The values from DRM_SCHED_PRIORITY_MIN up to (but not including)
 * DRM_SCHED_PRIORITY_MAX index the per-priority run queue array of a
 * scheduler. DRM_SCHED_PRIORITY_INVALID and DRM_SCHED_PRIORITY_UNSET are
 * negative sentinels and must never be used as an index.
 */
enum drm_sched_priority {
	DRM_SCHED_PRIORITY_MIN,
	DRM_SCHED_PRIORITY_LOW = DRM_SCHED_PRIORITY_MIN,
	DRM_SCHED_PRIORITY_NORMAL,
	DRM_SCHED_PRIORITY_HIGH_SW,
	DRM_SCHED_PRIORITY_HIGH_HW,
	DRM_SCHED_PRIORITY_KERNEL,
	DRM_SCHED_PRIORITY_MAX,		/* == 5: number of priority levels */
	DRM_SCHED_PRIORITY_INVALID = -1,
	DRM_SCHED_PRIORITY_UNSET = -2
};
hw_submission_limit 字段用来设定对应 hardware run queue 中最多支持的任务数。当前已提交的任务数小于这个上限时,认为这个调度器处于就绪状态。
/**
 * struct drm_sched_backend_ops - driver callbacks used by the scheduler
 *
 * Define the backend operations called by the scheduler,
 * these functions should be implemented in driver side.
 */
struct drm_sched_backend_ops {
/**
 * @dependency: Called when the scheduler is considering scheduling
 * this job next, to get another struct dma_fence for this job to
 * block on. Once it returns NULL, run_job() may be called.
 */
struct dma_fence *(*dependency)(struct drm_sched_job *sched_job,
struct drm_sched_entity *s_entity);
/**
 * @run_job: Called to execute the job once all of the dependencies
 * have been resolved. This may be called multiple times, if
 * timedout_job() has happened and drm_sched_job_recovery()
 * decides to try it again.
 *
 * The returned fence may be NULL: drm_sched_main() then calls
 * drm_sched_process_job() directly instead of registering it as a
 * callback on the returned fence.
 */
struct dma_fence *(*run_job)(struct drm_sched_job *sched_job);
/**
 * @timedout_job: Called when a job has taken too long to execute,
 * to trigger GPU recovery.
 */
void (*timedout_job)(struct drm_sched_job *sched_job);
/**
 * @free_job: Called once the job's finished fence has been signaled
 * and it's time to clean it up.
 */
void (*free_job)(struct drm_sched_job *sched_job);
};
在 Linux 内核中,`struct drm_sched_backend_ops` 结构体是 DRM 调度器后端所定义的函数指针。它定义和实现了图形设备驱动的任务调度和管理功能,从而提高了驱动程序的性能和效率。
下面是在 Linux 内核下,`struct drm_sched_backend_ops` 结构体中四个函数的详细解释:
1. struct dma_fence* (*dependency)(struct drm_sched_job *sched_job, struct drm_sched_entity *s_entity)
dependency 函数在调度器准备调度某个任务时被调用,用于获取该任务还需要等待的下一个 dma_fence(直接汉译:DMA 栅栏)。在 DRM 调度器中,任务可能会因为等待其他任务而被阻塞。调度器会反复调用 `dependency`,每次得到一个需要等待的 `dma_fence`;当它返回 NULL 时,表示该任务的所有依赖都已满足,此时才可以调用 `run_job`。
2. struct dma_fence* (*run_job)(struct drm_sched_job *sched_job)
run_job 函数在任务的所有依赖都已解决之后被调用,用于执行该任务,并返回一个 dma_fence(也可能返回 NULL)。如果发生了超时(`timedout_job`)并且恢复流程决定重试,该函数可能会被多次调用。
3. void (*timedout_job)(struct drm_sched_job *sched_job)
timedout_job 函数是在任务执行时间超过指定持续时间时调用的。如果在指定的时间内任务未能完成,该函数就会触发 GPU 恢复(recovery)机制,用来处理任务长时间无法完成(例如 GPU 挂起)的情况。
4. `void (*free_job)(struct drm_sched_job *sched_job)`
`free_job` 函数在任务完成后,用于释放所有相关资源。任务完成后,这个函数负责释放任务相关的内存和共享资源,从而确保系统的稳定性和安全性。
综上所述,这四个函数分别用于任务的依赖关系处理、任务执行、超时处理和资源释放等方面,是实现 DRM 调度器后端的关键函数。通过 `struct drm_sched_backend_ops` 结构体定义的这些函数,可以更加有效地管理驱动程序的任务调度和执行。
如何初始化一个调度器的实例呢
/**
* drm_sched_init - Init a gpu scheduler instance
*
* @sched: scheduler instance
* @ops: backend operations for this scheduler
* @hw_submission: number of hw submissions that can be in flight
* @hang_limit: number of times to allow a job to hang before dropping it
* @timeout: timeout value in jiffies for the scheduler
* @name: name used for debugging
*
* Return 0 on success, otherwise error code.
*/
int drm_sched_init(struct drm_gpu_scheduler *sched,
const struct drm_sched_backend_ops *ops,
unsigned hw_submission,
unsigned hang_limit,
long timeout,
const char *name)
{
int i;
sched->ops = ops;
sched->hw_submission_limit = hw_submission;
sched->name = name;
sched->timeout = timeout;
sched->hang_limit = hang_limit;
for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_MAX; i++)
drm_sched_rq_init(sched, &sched->sched_rq[i]);
init_waitqueue_head(&sched->wake_up_worker);
init_waitqueue_head(&sched->job_scheduled);
INIT_LIST_HEAD(&sched->ring_mirror_list);
spin_lock_init(&sched->job_list_lock);
atomic_set(&sched->hw_rq_count, 0);
atomic64_set(&sched->job_id_count, 0);
/* Each scheduler will run on a seperate kernel thread */
sched->thread = kthread_run(drm_sched_main, sched, sched->name);//启动一个线程
if (IS_ERR(sched->thread)) {
DRM_ERROR("Failed to create scheduler for %s.\n", name);
return PTR_ERR(sched->thread);
}
return 0;
}
EXPORT_SYMBOL(drm_sched_init);
调度器调度的是什么:Hardware Run Queue,或者Hardware Ring Buffer。简而言之,就是一些硬件Command Ring类似的单元实际上是有限的资源,在同一时刻只能执行固定个数的任务,需要通过软件的手段调度这个硬件资源,让相关的任务排队,并均匀分配。
在drm_sched模块中,有以下映射关系:
- Hardware Run Queue与调度器一一对应
- 每个调度器分为多个优先级run queue
- 每个run queue中对调度实体entity进行调度
- 每个调度实体包含一个job queue
简单来说,DRM任务调度器接收调度实体,根据优先级按照轮转的方式执行调度实体中的任务。这个过程发生在一个内核线程中:每一个调度器在初始化时都会创建自己的内核线程,内核线程的函数为 drm_sched_main。从框架角度来看,框架使用 drm_sched_job 表示一个任务。
调度器由 drm_gpu_scheduler 结构体表示,上层驱动代码可以通过注册一个 drm_sched_backend_ops 来特化一个调度器的行为。这个 ops 只包括几个回调函数,因此调度器本身的实现是比较固定的。
/**
 * drm_sched_main - main scheduler thread
 *
 * @param: scheduler instance
 *
 * Body of the kernel thread started by drm_sched_init(). Until the thread is
 * asked to stop it repeatedly: sleeps on @wake_up_worker, takes one job from
 * the selected entity, hands it to the driver's run_job() callback and
 * arranges for drm_sched_process_job() to run for that job.
 *
 * Returns 0.
 */
static int drm_sched_main(void *param)// kernel thread entry point
{
struct sched_param sparam = {.sched_priority = 1};
struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
int r;
/* Run this thread with the SCHED_FIFO policy at priority 1. */
sched_setscheduler(current, SCHED_FIFO, &sparam);
while (!kthread_should_stop()) {
struct drm_sched_entity *entity = NULL;
struct drm_sched_fence *s_fence;
struct drm_sched_job *sched_job;
struct dma_fence *fence;
/*
 * Sleep until either the scheduler is not blocked and an entity could
 * be selected, or the thread has been asked to stop. The selected
 * entity is stored as a side effect of evaluating the condition.
 */
wait_event_interruptible(sched->wake_up_worker,
(!drm_sched_blocked(sched) &&
(entity = drm_sched_select_entity(sched))) || // is there an entity to schedule?
kthread_should_stop());
/* Woken without an entity (e.g. stop request): re-check the loop condition. */
if (!entity)
continue;
/* Take the next job out of the selected entity. */
sched_job = drm_sched_entity_pop_job(entity);
if (!sched_job)
continue;
/* Keep the job's scheduler fence at hand for the steps below. */
s_fence = sched_job->s_fence;
/* One more job is now counted as being in the hardware queue. */
atomic_inc(&sched->hw_rq_count);
/* NOTE(review): presumably adds the job to ring_mirror_list and arms the timeout - confirm in drm_sched_job_begin(). */
drm_sched_job_begin(sched_job);
/* Hand the job to the driver's run_job() callback. */
fence = sched->ops->run_job(sched_job);
/*
 * Called right after run_job(): the job has only been submitted at this
 * point, so this marks it as scheduled, not as finished.
 */
drm_sched_fence_scheduled(s_fence);
if (fence) {
/*
 * Keep a reference to the fence returned by run_job() in
 * s_fence->parent and register drm_sched_process_job() as a callback
 * on that fence.
 */
s_fence->parent = dma_fence_get(fence);
r = dma_fence_add_callback(fence, &s_fence->cb,
drm_sched_process_job);
/*
 * -ENOENT: the callback was not installed (the dma_fence API reports
 * this for an already signaled fence), so invoke it directly.
 */
if (r == -ENOENT)
drm_sched_process_job(fence, &s_fence->cb);
else if (r)
DRM_ERROR("fence add callback failed (%d)\n",
r);
/* Drop the local reference; s_fence->parent holds its own. */
dma_fence_put(fence);
} else {
/* run_job() returned no fence: run the completion path right away. */
drm_sched_process_job(NULL, &s_fence->cb);
}
/* Wake up anyone waiting on job_scheduled, then start the next iteration. */
wake_up(&sched->job_scheduled);
}
return 0;
}
drm_sched_main 实际上就是上面提到的内核线程的执行函数。
函数开头可以看到将这个内核线程的调度策略设置成了 SCHED_FIFO,并将优先级设置成 1。随后函数进入了任务处理循环。可以看到 drm_gpu_scheduler 中提前准备好了一个名为 wake_up_worker 的 wait queue,函数首先在这个 wait queue 上以如下条件进行等待:
- (cleanup_job = drm_sched_get_cleanup_job(sched)) 不为NULL(注意:这个条件只出现在较新版本的内核中,上面列出的代码里没有)
- !drm_sched_blocked(sched) && (entity = drm_sched_select_entity(sched))
- kthread_should_stop()
在上面任意一个条件满足时,内核线程即被唤醒,然后继续执行任务。在较新版本的内核中,如果cleanup_job不为NULL,则进行如下操作:
if (cleanup_job) {
sched->ops->free_job(cleanup_job);
/* queue timeout for next job */
drm_sched_start_timeout(sched);
}
随后函数检查entity是否为NULL,不为NULL则根据entity拿到下一个应该执行的job:
sched_job = drm_sched_entity_pop_job(entity);
if (!sched_job)
continue;
随后函数执行以下语句:
atomic_inc(&sched->hw_rq_count);
drm_sched_job_begin(sched_job);
这两个操作分别是:
- 增加 hw_rq_count 计数器,该计数器表示已经压入 hardware run queue 的任务的个数
- 将任务从调度实体的队列中拿下,并放到 ring_mirror_list 中,表示它正在被 hardware run queue 执行。同时,使能 timeout 定时器,防止 job 超时
准备工作做完后,直接调用上层驱动代码注册的 run_job 回调函数,执行任务:
fence = sched->ops->run_job(sched_job); /* 调用硬件执行job任务 */
drm_sched_fence_scheduled(s_fence);
注意这里的先后顺序:run_job 仅仅是将任务压到 hardware run queue,此时还需要等待硬件执行,所以这里直接触发的是 scheduled 的 fence,而不是 finished。注意 run_job 的返回值也是一个 fence,这个 fence 被触发时,即表明 hardware run queue 上该 job 被硬件执行完毕。因此,函数需要在该 fence 上注册回调函数,当 job 在硬件上完成后,调用 drm_sched_process_job 函数,进行:
- 计数器更新,hw_rq_count以及num_jobs
- 触发finished fence
- 唤醒内核线程工作(有job处理完毕表示hardware run queue有新空位了)
最后,函数唤醒 job_scheduled 等待队列,表示有新 job 被推到 hardware run queue 上了。
参考链接: