Linux CPUFreq模块
CPUFreq
简介
CPU的硬件特性决定了这个CPU的最高和最低工作频率,它们在sysfs文件系统中用cpuinfo_xxx_freq来表示;我们可以在这个范围内再次定义出一个软件的调节范围,它们用scaling_xxx_freq来表示;根据具体的硬件平台的不同,我们还需要提供一个频率表,这个频率表规定了cpu可以工作的频率值,其受限于cpuinfo_xxx_freq;通过上述数值,cpuFreq系统可以根据当前cpu的负载情况从频率表中选择一个合适的频率供cpu使用,从而实现性能与功耗的要求。
选择合适的频率需要使用governor来实现,对于硬件的操作使用scaling_driver来完成。CPUFreq将一些与平台和具体的调频策略无关的代码抽象出来,形成CPUFreq Core。内核通过cpufreq_policy记录当前cpu的一些硬件信息以及软件信息,包括对应的governor;内核中的其它模块需要得到通知信息时,则通过cpufreq notifiers来完成。
模块架构
- cpufreq core
- cpufreq governor
- cpufreq driver
代码
CPUFreq 核心数据结构
/*
 * Per-policy state kept by the cpufreq core: the CPUs that share one clock,
 * the frequency limits, the attached governor and the bookkeeping used
 * while a frequency transition is in flight.
 */
struct cpufreq_policy {
/* CPUs sharing clock, require sw coordination */
cpumask_var_t cpus; /* Online CPUs only */
cpumask_var_t related_cpus; /* Online + Offline CPUs */
cpumask_var_t real_cpus; /* Related and present */
unsigned int shared_type; /* ACPI: ANY or ALL affected CPUs
should set cpufreq */
unsigned int cpu; /* CPU managing this policy, must be online */
struct clk *clk;
struct cpufreq_cpuinfo cpuinfo;/* Hardware limits: min/max frequency the CPU supports and the transition latency */
unsigned int min; /* Minimum frequency this policy may use, in kHz */
unsigned int max; /* in kHz */
unsigned int cur; /* in kHz, only needed if cpufreq governors are used */
unsigned int restore_freq; /* = policy->cur before transition */
unsigned int suspend_freq; /* freq to set during suspend */
/*
 * Only takes CPUFREQ_POLICY_POWERSAVE or CPUFREQ_POLICY_PERFORMANCE and is
 * only meaningful when the driver implements the ->setpolicy() callback; the
 * driver then derives the operating frequency/state from this value. When
 * the driver implements the ->target() style callbacks instead, the
 * frequency is chosen by the governor.
 */
unsigned int policy;
unsigned int last_policy; /* policy before unplug */
struct cpufreq_governor *governor; /* Governor attached to this policy */
void *governor_data;/* Private data/context of the governor currently used by this policy */
char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
struct work_struct update; /* if update_policy() needs to be called, but you're in IRQ context */
/*
 * Saves the original (min, max) so they can be restored after the policy
 * limits were temporarily overridden, e.g. by thermal protection.
 */
struct cpufreq_user_policy user_policy;
struct cpufreq_frequency_table *freq_table;
enum cpufreq_table_sorting freq_table_sorted;
struct list_head policy_list;
struct kobject kobj;/* kobject representing this policy in sysfs */
struct completion kobj_unregister;
/*
* The rules for this semaphore:
* - Any routine that wants to read from the policy structure will
* do a down_read on this semaphore.
* - Any routine that will write to the policy structure and/or may take away
* the policy altogether (eg. CPU hotplug), will hold this lock in write
* mode before doing so.
*/
struct rw_semaphore rwsem;
/*
* Fast switch flags:
* - fast_switch_possible should be set by the driver if it can
* guarantee that frequency can be changed on any CPU sharing the
* policy and that the change will affect all of the policy CPUs then.
* - fast_switch_enabled is to be set by governors that support fast
* frequency switching with the help of cpufreq_enable_fast_switch().
*/
bool fast_switch_possible;
bool fast_switch_enabled;
/*
* Preferred average time interval between consecutive invocations of
* the driver to set the frequency for this policy. To be set by the
* scaling driver (0, which is the default, means no preference).
*/
unsigned int transition_delay_us;
/*
* Remote DVFS flag (Not added to the driver structure as we don't want
* to access another structure from scheduler hotpath).
*
* Should be set if CPUs can do DVFS on behalf of other CPUs from
* different cpufreq policies.
*/
bool dvfs_possible_from_any_cpu;
/* Cached frequency lookup from cpufreq_driver_resolve_freq. */
unsigned int cached_target_freq;
int cached_resolved_idx;
/* Synchronization for frequency transitions */
bool transition_ongoing; /* Tracks transition status */
spinlock_t transition_lock;
wait_queue_head_t transition_wait;
struct task_struct *transition_task; /* Task which is doing the transition */
/* cpufreq-stats */
struct cpufreq_stats *stats;
/* For cpufreq driver's internal use */
void *driver_data;
};
/*
 * Descriptor of a cpufreq governor: its name plus the callbacks the core
 * invokes on a policy (init/exit, start/stop, limits) and the optional
 * setspeed show/store handlers.
 */
struct cpufreq_governor {
char name[CPUFREQ_NAME_LEN];
int (*init)(struct cpufreq_policy *policy);
void (*exit)(struct cpufreq_policy *policy);
int (*start)(struct cpufreq_policy *policy);
void (*stop)(struct cpufreq_policy *policy);
void (*limits)(struct cpufreq_policy *policy);
ssize_t (*show_setspeed) (struct cpufreq_policy *policy,
char *buf);
int (*store_setspeed) (struct cpufreq_policy *policy,
unsigned int freq);
/* For governors which change frequency dynamically by themselves */
bool dynamic_switching;
/* Node used to link the governor into the list of registered governors */
struct list_head governor_list;
struct module *owner;
};
/*
 * Descriptor a platform scaling driver fills in and registers with the
 * cpufreq core. It carries the driver name, flags and the callbacks the core
 * uses to initialize a policy, validate limits and change the frequency.
 */
struct cpufreq_driver {
char name[CPUFREQ_NAME_LEN];
u8 flags;
void *driver_data;
/* needed by all drivers */
/* Called by the core so the driver can perform its required initialization */
int (*init)(struct cpufreq_policy *policy);
/* Called by the core to check that the policy parameters are supported by the driver */
int (*verify)(struct cpufreq_policy *policy);
/* define one out of two */
/*
 * Implement ->setpolicy() when the frequency is not selected by a governor;
 * the system then only supports the CPUFREQ_POLICY_POWERSAVE and
 * CPUFREQ_POLICY_PERFORMANCE policies. Otherwise implement the target
 * callbacks below (note that ->target() is already deprecated in 4.19),
 * which set the frequency requested by the governor.
 */
int (*setpolicy)(struct cpufreq_policy *policy);
/*
* On failure, should always restore frequency to policy->restore_freq
* (i.e. old freq).
*/
int (*target)(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation); /* Deprecated */
int (*target_index)(struct cpufreq_policy *policy,
unsigned int index);
unsigned int (*fast_switch)(struct cpufreq_policy *policy,
unsigned int target_freq);
/*
* Caches and returns the lowest driver-supported frequency greater than
* or equal to the target frequency, subject to any driver limitations.
* Does not set the frequency. Only to be implemented for drivers with
* target().
*/
unsigned int (*resolve_freq)(struct cpufreq_policy *policy,
unsigned int target_freq);
/*
* Intermediate-frequency support: whether an intermediate frequency is
* used, and the switch to it.
*
* Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION
* unset.
*
* get_intermediate should return a stable intermediate frequency
* platform wants to switch to and target_intermediate() should set CPU
* to to that frequency, before jumping to the frequency corresponding
* to 'index'. Core will take care of sending notifications and driver
* doesn't have to handle them in target_intermediate() or
* target_index().
*
* Drivers can return '0' from get_intermediate() in case they don't
* wish to switch to intermediate frequency for some target frequency.
* In that case core will directly call ->target_index().
*/
unsigned int (*get_intermediate)(struct cpufreq_policy *policy,
unsigned int index);
int (*target_intermediate)(struct cpufreq_policy *policy,
unsigned int index);
/* should be defined, if possible */
unsigned int (*get)(unsigned int cpu);
/* optional */
int (*bios_limit)(int cpu, unsigned int *limit);
int (*exit)(struct cpufreq_policy *policy);
void (*stop_cpu)(struct cpufreq_policy *policy);
int (*suspend)(struct cpufreq_policy *policy);
int (*resume)(struct cpufreq_policy *policy);
/* Will be called after the driver is fully initialized */
void (*ready)(struct cpufreq_policy *policy);
struct freq_attr **attr;
/* platform specific boost support code */
bool boost_enabled;
int (*set_boost)(int state);
};
CPUFreq governor核心数据结构
/* Groups together the helper variables needed to compute the CPU load. */
/* Per cpu structures demand based switching*/
struct cpu_dbs_info {
u64 prev_cpu_idle;
u64 prev_update_time;
u64 prev_cpu_nice;
/*
* Used to keep track of load in the previous interval. However, when
* explicitly set to zero, it is used as a flag to ensure that we copy
* the previous load to the current interval only once, upon the first
* wake-up from idle.
*/
unsigned int prev_load;
struct update_util_data update_util;
/* Back-pointer to the policy-wide dbs data this CPU belongs to */
struct policy_dbs_info *policy_dbs;
};
cpufreq notifiers
- CPUFreq的通知系统使用了内核的标准通知接口。它对外提供了两个通知事件:policy通知和transition通知。
- policy通知用于通知其它模块cpu的policy需要改变,每次policy改变时,该通知链上的回调将会用不同的事件参数被调用3次,分别是:
- CPUFREQ_ADJUST 只要有需要,所有的被通知者可以在此时修改policy的限制信息,比如温控系统可能会修改最大允许运行的频率;
- CPUFREQ_INCOMPATIBLE 只是为了避免硬件错误的情况下,可以在该通知中修改policy的限制信息;
- CPUFREQ_NOTIFY 真正切换policy前,该通知会发往所有的被通知者;
- transition通知链用于在驱动实施调整cpu的频率时,用于通知相关的注册者。每次调整频率时,该通知会发出两次通知事件:
- CPUFREQ_PRECHANGE 调整前的通知;
- CPUFREQ_POSTCHANGE 完成调整后的通知;
- 当检测到因系统进入suspend而造成频率被改变时,以下通知消息会被发出:CPUFREQ_RESUMECHANGE;
CPUFreq Core层
初始化
/* Exported so that other kernel code (including modules) can reference it. */
struct kobject *cpufreq_global_kobject;
EXPORT_SYMBOL(cpufreq_global_kobject);

/*
 * cpufreq_core_init - one-time initialization of the cpufreq core.
 *
 * Returns -ENODEV when cpufreq is disabled, 0 otherwise. The numbered
 * markers match the notes that follow this listing.
 */
static int __init cpufreq_core_init(void)
{
	struct kobject *parent;

	/* 1: nothing to set up when cpufreq has been switched off */
	if (cpufreq_disabled())
		return -ENODEV;

	/* 2: create the global "cpufreq" kobject below the cpu subsystem root */
	parent = &cpu_subsys.dev_root->kobj;
	cpufreq_global_kobject = kobject_create_and_add("cpufreq", parent);
	BUG_ON(!cpufreq_global_kobject);

	/* 3: hook the core into the syscore operations */
	register_syscore_ops(&cpufreq_syscore_ops);

	return 0;
}
module_param(off, int, 0444);	/* 4 */
core_initcall(cpufreq_core_init);
- cpufreq_disabled()返回off的值,表示cpufreq模块是否被禁用;off默认是0(即cpufreq启用),module_param使得可以在启动时通过参数给off赋值(例如内核命令行cpufreq.off=1);
- cpu_subsys是在系统中已经构建好的全局变量,dev_root记录了其在bus中对应的device结构,每个设备都有对应的struct device;上述在/sys/devices/system/cpu目录下 创建一个名为cpufreq的kobject;
- 注册一个syscore回调函数(cpufreq_syscore_ops):在关机且cpu0以外的cpu都已停止时关掉cpufreq,确保cpufreq没有持有任何锁和信号量;具体做法是先停止每一个governor,再调用cpufreq driver的suspend函数,最后设置cpufreq_suspended = true;
注册cpufreq_governor
/*
 * cpufreq_register_governor - add a governor to the list of known governors.
 * @governor: governor descriptor to register.
 *
 * Returns 0 on success, -EINVAL for a NULL @governor, -ENODEV when cpufreq
 * is disabled and -EBUSY when a governor with the same name is already
 * registered.
 */
int cpufreq_register_governor(struct cpufreq_governor *governor)
{
	int err = -EBUSY;

	if (!governor)
		return -EINVAL;

	if (cpufreq_disabled())
		return -ENODEV;

	mutex_lock(&cpufreq_governor_mutex);

	/* Only link the governor in when its name is not taken yet. */
	if (!find_governor(governor->name)) {
		list_add(&governor->governor_list, &cpufreq_governor_list);	/* 1 */
		err = 0;
	}

	mutex_unlock(&cpufreq_governor_mutex);

	return err;
}
- 将governor添加到cpufreq_governor_list链表上(若同名governor已经存在,则返回-EBUSY);
注册cpufreq_driver驱动
/**
* cpufreq_register_driver - register a CPU Frequency driver
* @driver_data: A struct cpufreq_driver containing the values#
* submitted by the CPU Frequency driver.
*
* Registers a CPU Frequency driver to this core code. This code
* returns zero on success, -EEXIST when another driver got here first
* (and isn't unregistered in the meantime).
*
* Other error returns visible below: -ENODEV when cpufreq is disabled or no
* CPU could be initialized for a non-sticky driver, -EINVAL when the
* callback set is inconsistent. The numbered markers match the notes that
* follow this listing.
*/
int cpufreq_register_driver(struct cpufreq_driver *driver_data)
{
unsigned long flags;
int ret;
if (cpufreq_disabled())
return -ENODEV;
/*
 * 1: ->verify() and ->init() are mandatory; exactly one of ->setpolicy()
 * or the ->target()/->target_index() family must be provided; and
 * ->get_intermediate()/->target_intermediate() must come as a pair.
 */
if (!driver_data || !driver_data->verify || !driver_data->init ||
!(driver_data->setpolicy || driver_data->target_index ||
driver_data->target) ||
(driver_data->setpolicy && (driver_data->target_index ||
driver_data->target)) ||
(!!driver_data->get_intermediate != !!driver_data->target_intermediate))
return -EINVAL;
pr_debug("trying to register driver %s\n", driver_data->name);
/* Protect against concurrent CPU online/offline. */
cpus_read_lock();
write_lock_irqsave(&cpufreq_driver_lock, flags);
/* 2: only one driver may be registered at a time */
if (cpufreq_driver) {
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
ret = -EEXIST;
goto out;
}
cpufreq_driver = driver_data;
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
/* 3: ->setpolicy() drivers are flagged as CPUFREQ_CONST_LOOPS */
if (driver_data->setpolicy)
driver_data->flags |= CPUFREQ_CONST_LOOPS;
/* 4: expose the boost sysfs file when boost is supported */
if (cpufreq_boost_supported()) {
ret = create_boost_sysfs_file();
if (ret)
goto err_null_driver;
}
/* 5: registering the interface runs cpufreq_add_dev() for every CPU device */
ret = subsys_interface_register(&cpufreq_interface);
if (ret)
goto err_boost_unreg;
if (!(cpufreq_driver->flags & CPUFREQ_STICKY) &&
list_empty(&cpufreq_policy_list)) {
/* if all ->init() calls failed, unregister */
ret = -ENODEV;
pr_debug("%s: No CPU initialized for driver %s\n", __func__,
driver_data->name);
goto err_if_unreg;
}
/* 6: install the CPU hotplug online/offline callbacks */
ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
"cpufreq:online",
cpuhp_cpufreq_online,
cpuhp_cpufreq_offline);
if (ret < 0)
goto err_if_unreg;
/* A non-negative return is remembered in hp_online; report plain success. */
hp_online = ret;
ret = 0;
pr_debug("driver %s up and running\n", driver_data->name);
goto out;
/* Error unwinding: undo the steps above in reverse order. */
err_if_unreg:
subsys_interface_unregister(&cpufreq_interface);
err_boost_unreg:
remove_boost_sysfs_file();
err_null_driver:
write_lock_irqsave(&cpufreq_driver_lock, flags);
cpufreq_driver = NULL;
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
out:
cpus_read_unlock();
return ret;
}
- cpufreq_driver驱动中必须要实现的函数功能;
- cpufreq_driver驱动要求只能实现一次驱动注册;
- 告诉内核loops_per_jiffy或者其他的内核constants不会因为CPU频率的变化而变化;
- 查看驱动注册是否支持boost,如果支持就在cpufreq下建立boost节点;
- 遍历cpu,给每个cpu设立一个cpufreq_policy;通过该步骤,每个cpu目录下会有一个cpufreq的节点,它是指向cpufreq目录下对应policy的链接;
- cpu hotplug时候的回调函数;后面的两个参数为回调函数;
注册cpufreq_driver驱动:subsys_interface_register() 用来建立cpufreq_policy 包括参数的设置以及governor的选择
- subsys_interface_register(struct subsys_interface *sif):调用相应的子系统接口的add_dev函数,循环,将子系统的设备进行某项操作:
// 调用相应的子系统接口的add_dev函数,循环
int subsys_interface_register(struct subsys_interface *sif)
{
struct bus_type *subsys;
struct subsys_dev_iter iter;
struct device *dev;
if (!sif || !sif->subsys)
return -ENODEV;
subsys = bus_get(sif->subsys);
if (!subsys)
return -EINVAL;
mutex_lock(&subsys->p->mutex);
list_add_tail(&sif->node, &subsys->p->interfaces);
if (sif->add_dev) {
subsys_dev_iter_init(&iter, subsys, NULL, NULL);
while ((dev = subsys_dev_iter_next(&iter)))
sif->add_dev(dev, sif);//调用add_dev函数
subsys_dev_iter_exit(&iter);
}
mutex_unlock(&subsys->p->mutex);
return 0;
}
EXPORT_SYMBOL_GPL(subsys_interface_register);
- cpufreq_interface
/*
 * Subsystem interface that binds cpufreq to the cpu subsystem:
 * cpufreq_add_dev()/cpufreq_remove_dev() are the per-device callbacks.
 */
static struct subsys_interface cpufreq_interface = {
.name = "cpufreq",
.subsys = &cpu_subsys,
.add_dev = cpufreq_add_dev,
.remove_dev = cpufreq_remove_dev,
};
- subsys_interface_register(struct subsys_interface *sif),调用子系统接口的add_dev函数
/**
 * cpufreq_add_dev - the cpufreq interface for a CPU device.
 * @dev: CPU device.
 * @sif: Subsystem interface structure pointer (not used)
 *
 * Brings cpufreq up for the CPU if it is online and, when a policy exists
 * for it, links that policy into the CPU's device directory.
 *
 * Returns the error from cpufreq_online() if that fails, 0 otherwise.
 */
static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
{
	unsigned cpu = dev->id;
	struct cpufreq_policy *policy;

	dev_dbg(dev, "%s: adding CPU%u\n", __func__, cpu);

	if (cpu_online(cpu)) {
		int err = cpufreq_online(cpu);	/* 2.2 */

		if (err)
			return err;
	}

	/* Create sysfs link on CPU registration */
	policy = per_cpu(cpufreq_cpu_data, cpu);
	if (!policy)
		return 0;

	/* 2.1: tie the policy to the "cpufreq" entry under this CPU */
	add_cpu_dev_symlink(policy, cpu);
	return 0;
}
2.1 add_cpu_dev_symlink():在cpu设备目录下创建名为cpufreq、指向policy kobject的符号链接,从而将policy与该cpu关联
/*
 * add_cpu_dev_symlink - link a policy into a CPU's device directory.
 * @policy: policy to link to.
 * @cpu: CPU whose device gets the "cpufreq" link.
 *
 * Does nothing when the CPU has no device or when the CPU was already set
 * in policy->real_cpus. A failure to create the link is only logged.
 */
static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)
{
	struct device *cpu_dev = get_cpu_device(cpu);
	int err;

	if (!cpu_dev)
		return;

	/* 2.1.1: mark the CPU in real_cpus; bail out if it was marked before */
	if (cpumask_test_and_set_cpu(cpu, policy->real_cpus))
		return;

	dev_dbg(cpu_dev, "%s: Adding symlink\n", __func__);

	/* sysfs_create_link(kobj, target, name) */
	err = sysfs_create_link(&cpu_dev->kobj, &policy->kobj, "cpufreq");
	if (err)
		dev_err(cpu_dev, "cpufreq symlink creation failed\n");
}
2.1.1 cpumask_test_and_set_cpu():原子地检测cpu是否已在cpumask中并将其置位,返回置位之前的值
/**
 * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0
 *
 * test_and_set_bit wrapper for cpumasks.
 */
static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
{
	unsigned int bit = cpumask_check(cpu);

	return test_and_set_bit(bit, cpumask_bits(cpumask));
}
2.2 static int cpufreq_online(unsigned int cpu):将cpufreq的policy与sysfs用户接口,governor以及驱动相互关联
static int cpufreq_online(unsigned int cpu)
{
struct cpufreq_policy *policy;
bool new_policy;
unsigned long flags;
unsigned int j;
int ret;
pr_debug("%s: bringing CPU%u online\n", __func__, cpu);
/* Check if this CPU already has a policy to manage it */
policy = per_cpu(cpufreq_cpu_data, cpu);
if (policy) {
WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus));
if (!policy_is_inactive(policy))// 判断policy的cpus是否为空,即是在线的cpu是否为空;
return cpufreq_add_policy_cpu(policy, cpu);// policy已经分配好 1. 暂停governor(首先判断cpufreq_driver是否有target或者target_index函数,只有有,才支持governor调频) 2.设置cpu到policy->cpus;3.启动governor
/* This is the only online CPU for the policy. Start over. 当policy的cpus为空时*/
new_policy = false;
down_write(&policy->rwsem);
policy->cpu = cpu;
policy->governor = NULL;
up_write(&policy->rwsem);
} else { // policy还没有分配,就分配内存空间
new_policy = true;
policy = cpufreq_policy_alloc(cpu);//分配内存,在cpufreq下创建节点,设立管理cpu为当前cpu
if (!policy)
return -ENOMEM;
}
cpumask_copy(policy->cpus, cpumask_of(cpu));//同一簇的cpu使用同一个policy
/* call driver. From then on the cpufreq must be able
* to accept all calls to ->verify and ->setpolicy for this CPU
*/
ret = cpufreq_driver->init(policy);//调用驱动初始化policy
if (ret) {
pr_debug("initialization failed\n");
goto out_free_policy;
}
ret = cpufreq_table_validate_and_sort(policy);// 根据频率进行policy排序
if (ret)
goto out_exit_policy;
down_write(&policy->rwsem);
if (new_policy) {
/* related_cpus should at least include policy->cpus. */
cpumask_copy(policy->related_cpus, policy->cpus);// releated_cpus为同一个簇中的CPU
}
/*
* affected cpus must always be the one, which are online. We aren't
* managing offline cpus here.
*/
cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);//cpus为当前簇中在线的cpu
if (new_policy) {
policy->user_policy.min = policy->min;
policy->user_policy.max = policy->max;
for_each_cpu(j, policy->related_cpus) { // cpufreq_cpu_date就是指向policy
per_cpu(cpufreq_cpu_data, j) = policy;
add_cpu_dev_symlink(policy, j);// 为cpuj 创建policy节点
}
} else {
policy->min = policy->user_policy.min;
policy->max = policy->user_policy.max;
}
if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
policy->cur = cpufreq_driver->get(policy->cpu);
if (!policy->cur) {
pr_err("%s: ->get() failed\n", __func__);
goto out_destroy_policy;
}
}
/*
* Sometimes boot loaders set CPU frequency to a value outside of
* frequency table present with cpufreq core. In such cases CPU might be
* unstable if it has to run on that frequency for long duration of time
* and so its better to set it to a frequency which is specified in
* freq-table. This also makes cpufreq stats inconsistent as
* cpufreq-stats would fail to register because current frequency of CPU
* isn't found in freq-table.
*
* Because we don't want this change to effect boot process badly, we go
* for the next freq which is >= policy->cur ('cur' must be set by now,
* otherwise we will end up setting freq to lowest of the table as 'cur'
* is initialized to zero).
*
* We are passing target-freq as "policy->cur - 1" otherwise
* __cpufreq_driver_target() would simply fail, as policy->cur will be
* equal to target-freq.
*/
if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK)
&& has_target()) {
/* Are we running at unknown frequency ? */
ret = cpufreq_frequency_table_get_index(policy, policy->cur);
if (ret == -EINVAL) {
/* Warn user and fix it */
pr_warn("%s: CPU%d: Running at unlisted freq: %u KHz\n",
__func__, policy->cpu, policy->cur);
ret = __cpufreq_driver_target(policy, policy->cur - 1,
CPUFREQ_RELATION_L);
/*
* Reaching here after boot in a few seconds may not
* mean that system will remain stable at "unknown"
* frequency for longer duration. Hence, a BUG_ON().
*/
BUG_ON(ret);
pr_warn("%s: CPU%d: Unlisted initial frequency changed to: %u KHz\n",
__func__, policy->cpu, policy->cur);
}
}
if (new_policy) {
ret = cpufreq_add_dev_interface(policy);//为policy创立sysfs系统节点
if (ret)
goto out_destroy_policy;
cpufreq_stats_create_table(policy);//创建频率表等
write_lock_irqsave(&cpufreq_driver_lock, flags);
list_add(&policy->policy_list, &cpufreq_policy_list);//加入链表
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
}
ret = cpufreq_init_policy(policy);//获得governor
if (ret) {
pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n",
__func__, cpu, ret);
/* cpufreq_policy_free() will notify based on this */
new_policy = false;
goto out_destroy_policy;
}
up_write(&policy->rwsem);
kobject_uevent(&policy->kobj, KOBJ_ADD);
/* Callback for handling stuff after policy is ready */
if (cpufreq_driver->ready)
cpufreq_driver->ready(policy);
pr_debug("initialization complete\n");
return 0;
out_destroy_policy:
for_each_cpu(j, policy->real_cpus)
remove_cpu_dev_symlink(policy, get_cpu_device(j));
up_write(&policy->rwsem);
out_exit_policy:
if (cpufreq_driver->exit)
cpufreq_driver->exit(policy);
out_free_policy:
cpufreq_policy_free(policy);
return ret;
}
注册通知链
/**
 * cpufreq_register_notifier - register a driver with cpufreq
 * @nb: notifier function to register
 * @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER
 *
 * Add a driver to one of two lists: either a list of drivers that
 * are notified about clock rate changes (once before and once after
 * the transition), or a list of drivers that are notified about
 * changes in cpufreq policy.
 *
 * This function may sleep, and has the same return conditions as
 * blocking_notifier_chain_register.
 */
int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list)
{
	int ret;

	if (cpufreq_disabled())
		return -EINVAL;

	/* Two notifier chains exist; @list selects which one to join. */
	if (list == CPUFREQ_TRANSITION_NOTIFIER) {
		mutex_lock(&cpufreq_fast_switch_lock);

		if (cpufreq_fast_switch_count > 0) {
			/* A positive count blocks transition notifiers. */
			ret = -EBUSY;
		} else {
			ret = srcu_notifier_chain_register(
					&cpufreq_transition_notifier_list, nb);
			/* Each successful registration decrements the count. */
			if (!ret)
				cpufreq_fast_switch_count--;
		}

		mutex_unlock(&cpufreq_fast_switch_lock);
	} else if (list == CPUFREQ_POLICY_NOTIFIER) {
		ret = blocking_notifier_chain_register(
				&cpufreq_policy_notifier_list, nb);
	} else {
		ret = -EINVAL;
	}

	return ret;
}
EXPORT_SYMBOL(cpufreq_register_notifier);
- policy通知用于通知其它模块cpu的policy需要改变,每次policy改变时,该通知链上的回调将会用不同的事件参数被调用3次,分别是:
- CPUFREQ_ADJUST 只要有需要,所有的被通知者可以在此时修改policy的限制信息,比如温控系统可能会修改最大允许运行的频率;
- CPUFREQ_INCOMPATIBLE 只是为了避免硬件错误的情况下,可以在该通知中修改policy的限制信息;
- CPUFREQ_NOTIFY 真正切换policy前,该通知会发往所有的被通知者;
- transition通知链用于在驱动实施调整cpu的频率时,用于通知相关的注册者。每次调整频率时,该通知会发出两次通知事件:
- CPUFREQ_PRECHANGE 调整前的通知;
- CPUFREQ_POSTCHANGE 完成调整后的通知;
CPUFreq driver层
简介
cpufreq driver主要完成平台相关的CPU频率/电压的控制;主要是定义一个struct cpufreq_driver变量,填充必要的字段,并根据平台的特性,实现其中的回调函数;然后注册到系统中去。
cpufreq_driver的 init函数
代码
从device tree中获取对应的clock,regulator配置最小最大频率等;注册cpufreq_driver驱动;加载该模块时候会执行该函数
cpufreq_driver的verify函数
代码
确定policy->min和policy->max之间至少有一个有效的频率
cpufreq_driver的get函数
代码
获得当前cpu的频率
cpufreq_driver的target系列函数
代码
实际调频调压的操作者:1. 调压调频设置参数,主要是进一步调用对应的cpu的调频调压函数,进一步实现设置频率以及电压;2. 修改policy的对应参数,修改时间,对应的涉及cpu的参数;3. 发出通知链告诉调频过程;
CPUFreq_governor
简介
- 公共逻辑代码位置:cpufreq_governor.c
- 检测系统的负载情况,根据当前的负载,选择可供使用的频率
常见的governor
- Performance: 性能优先的governor,直接将cpu频率设置为policy->{min,max}中的最大值。一般会被选做默认的governor以节省系统启动时间,之后再切换;
- Powersave: 功耗优先的governor,直接将cpu频率设置为policy->{min,max}中的最小值;
- Userspace: 由用户空间程序通过scaling_setspeed文件修改频率。一般用作调试;
- Ondemand:根据CPU的当前使用率,动态的调节CPU频率;
- interactive: 交互式动态调节CPU频率,与Ondemand类似,由谷歌开发并广泛使用于手机平板等设备上;
- schedutil:利用负载回调机制,schedutil将自己的调频策略注册到hook中,在负载变化时会调用相应的调频策略甚至执行调频动作;优点是可以将scheduler与调频建立更加紧密的联系,同时提高了性能和功耗的表现;
schedutil调频策略 cpufreq_schedutil.c kernel5.0
- 注释
- sysfs: 用户接口,一些用户参数设置;暴露参数;
- scheduler负载跟踪;
- schedutil:实际的目标调频的获得;
schedutil的初始化以及启动
/*
 * sugov_init - attach the schedutil governor to a policy.
 * @policy: policy the governor is being initialized for.
 *
 * Allocates the per-policy governor state, creates its kthread and either
 * shares the existing global tunables or allocates fresh ones and exposes
 * them through a kobject.
 *
 * Returns 0 on success, -EBUSY when the policy already carries governor
 * data, or another negative errno on failure (everything set up so far is
 * undone through the labels at the end). The numbered markers match the
 * notes that accompany this listing.
 */
static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)	/* 1: already bound to a governor */
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);	/* 2: enable fast switching */

	sg_policy = sugov_policy_alloc(policy);	/* 3: allocate governor state */
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	/* 4: create the kthread (original note: also binds it to the allowed CPUs) */
	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	/* 5: global tunables already exist, so share them */
	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		/* 6: attach this policy's hook to the shared attribute set */
		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);	/* 7: none yet, create them */
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	/*
	 * 8: initial rate limit. Original note: uses the policy's transition
	 * delay when set, otherwise falls back to the cpuinfo value.
	 */
	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	/* 9: create the sysfs node for the tunables */
	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	/*
	 * NOTE(review): whether sugov_tunables_free() is still required after
	 * kobject_put() depends on the release method of
	 * sugov_tunables_ktype, which is not shown here - confirm.
	 */
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	/* 10: every failure path ends here and reports the error */
	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}
static int sugov_start(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
unsigned int cpu;
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
sg_policy->limits_changed = false;
sg_policy->cached_raw_freq = 0;
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);//检测driver flag是否与给定的相同---1
for_each_cpu(cpu, policy->cpus) {//创建一个sugov_cpu结构体(遍历cpu_policy),每个cpu的单独信息,记录cpu及对应的sg_policy--2
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
}
if (policy_is_shared(policy))//根据标志 选择函数-----------------------3
uu = sugov_update_shared;
else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
uu = sugov_update_single_perf;
else
uu = sugov_update_single_freq;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);//结构体变量--------------4
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);//注册回调函数----5
}
return 0;
}
static void sugov_limits(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
if (!policy->fast_switch_enabled) {
mutex_lock(&sg_policy->work_lock);
cpufreq_policy_apply_limits(policy);//根据sg_policy设定的数值更改policy的max以及min----1
mutex_unlock(&sg_policy->work_lock);
}
sg_policy->limits_changed = true;//设置标志位-------------------2
}
schedutil调频的触发时机
- CFS负载变化或者RT/DL任务状态更新就可以启动调频,通过调用cpufreq_update_util()函数即可实现,再由sugov_start()中注册的回调sugov_update_shared、sugov_update_single_freq或者sugov_update_single_perf进行调频;
/**
 * cpufreq_update_util - Take a note about CPU utilization changes.
 * @rq: Runqueue to carry out the update for.
 * @flags: Update reason flags.
 *
 * This function is called by the scheduler on the CPU whose utilization is
 * being updated.
 *
 * It can only be called from RCU-sched read-side critical sections.
 *
 * The way cpufreq is currently arranged requires it to evaluate the CPU
 * performance state (frequency/voltage) on a regular basis to prevent it from
 * being stuck in a completely inadequate performance level for too long.
 * That is not guaranteed to happen if the updates are only triggered from CFS
 * and DL, though, because they may not be coming in if only RT tasks are
 * active all the time (or there are RT tasks only).
 *
 * As a workaround for that issue, this function is called periodically by the
 * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
 * but that really is a band-aid.  Going forward it should be replaced with
 * solutions targeted more specifically at RT tasks.
 */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
	struct update_util_data *hook;

	hook = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
						  cpu_of(rq)));
	if (!hook)
		return;

	/* 1: invoke the callback registered for this CPU */
	hook->func(hook, rq_clock(rq), flags);
}
- 触发的时机:
- DL任务更新;
- 进出rt队列;
- 当cfs运行队列中,cpu的任务负载发生变化时;
schedutil调频的决策和频率切换
- 调频函数在sugov_start()函数中被注册为回调(update_util hook),之后由cpufreq_update_util()调用
- sugov_update_single函数
/*
 * sugov_update_single - utilization update callback for one CPU.
 * @hook: update_util member embedded in the CPU's struct sugov_cpu.
 * @time: timestamp passed in by the caller of the hook.
 * @flags: update reason flags.
 *
 * Computes the next frequency from the CPU's utilization and either
 * switches directly (fast switch) or defers the update.
 */
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long util, max;
unsigned int next_f;
/* Snapshot taken up front so it can be restored further down. */
unsigned int cached_freq = sg_policy->cached_raw_freq;
/* Update the CPU's iowait-boost state based on time and flags. */
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
/* Original note: ignore the influence of DL tasks on the rate limit. */
ignore_dl_rate_limit(sg_cpu, sg_policy);
/* Stop here when it is not yet time for another frequency update. */
if (!sugov_should_update_freq(sg_policy, time))
return;
/* Original note: CPU utilization, taking DL, RT and CFS into account. */
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
/* Recompute util with the iowait boost applied. */
util = sugov_iowait_apply(sg_cpu, time, util, max);
/* Map util to a frequency (original note: based on 1.25 x util). */
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
/* Restore cached freq as next_freq has changed */
sg_policy->cached_raw_freq = cached_freq;
}
/*
* This code runs under rq->lock for the target CPU, so it won't run
* concurrently on two different CPUs for the same target and it is not
* necessary to acquire the lock in the fast switch case.
*/
if (sg_policy->policy->fast_switch_enabled) {
sugov_fast_switch(sg_policy, time, next_f);
} else {
raw_spin_lock(&sg_policy->update_lock);
/* Original note: decides whether an update is needed and queues the work. */
sugov_deferred_update(sg_policy, time, next_f);
raw_spin_unlock(&sg_policy->update_lock);
}
}
- sugov_next_freq_shared()遍历policy对应的util,找最大利用率;
/*
 * sugov_update_shared - utilization update callback for a shared policy.
 * @hook: update_util member embedded in the CPU's struct sugov_cpu.
 * @time: timestamp passed in by the caller of the hook.
 * @flags: update reason flags.
 *
 * Runs entirely under the policy's update_lock. When an update is due the
 * frequency comes from sugov_next_freq_shared() and is applied either by
 * fast switch or by a deferred update.
 */
static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int freq;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	if (!sugov_should_update_freq(sg_policy, time))
		goto unlock;

	freq = sugov_next_freq_shared(sg_cpu, time);
	if (sg_policy->policy->fast_switch_enabled)
		sugov_fast_switch(sg_policy, time, freq);
	else
		sugov_deferred_update(sg_policy, time, freq);

unlock:
	raw_spin_unlock(&sg_policy->update_lock);
}
/*
 * sugov_next_freq_shared - pick the next frequency for a shared policy.
 * @sg_cpu: per-CPU governor state of the CPU that triggered the update.
 * @time: timestamp of the update.
 *
 * Walks every CPU in policy->cpus, keeps the (util, max) pair with the
 * largest util/max ratio and feeds it to get_next_freq().
 */
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long best_util = 0;
	unsigned long best_max = 1;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *other = &per_cpu(sugov_cpu, cpu);
		unsigned long cpu_util;
		unsigned long cpu_max;

		/* ->max is read only after sugov_get_util() has run. */
		cpu_util = sugov_get_util(other);
		cpu_max = other->max;
		cpu_util = sugov_iowait_apply(other, time, cpu_util, cpu_max);

		/* Cross-multiplied form of cpu_util/cpu_max > best_util/best_max. */
		if (cpu_util * best_max > cpu_max * best_util) {
			best_util = cpu_util;
			best_max = cpu_max;
		}
	}

	return get_next_freq(sg_policy, best_util, best_max);
}