linux 内核动态调频,linux cpufreq interactive调频代码实现下

最新推荐文章于 2024-03-27 09:51:41 发布

莈莈

最新推荐文章于 2024-03-27 09:51:41 发布

阅读量543

点赞数

文章标签： linux 内核动态调频

上一篇文章简单介绍了 cpufreq 的初始化过程：该过程最后会调用 cpufreq_init_policy，并在其中启动对应的 governor。

/*
 * cpufreq_init_policy - apply the initial policy and start its governor.
 * @policy: policy being initialised.
 *
 * Takes a copy of @policy, clears policy->governor so that
 * cpufreq_set_policy() sees a governor change and runs the full start
 * sequence, then applies the copy. The resulting policy/governor are
 * recorded in policy->user_policy. If applying the policy fails, the
 * driver's ->exit() hook is invoked when the driver provides one.
 */
static void cpufreq_init_policy(struct cpufreq_policy *policy)
{
	struct cpufreq_policy new_policy;
	int ret = 0;

	memcpy(&new_policy, policy, sizeof(*policy));

	/* assure that the starting sequence is run in cpufreq_set_policy */
	policy->governor = NULL;

	/* set default policy */
	ret = cpufreq_set_policy(policy, &new_policy);
	policy->user_policy.policy = policy->policy;
	policy->user_policy.governor = policy->governor;

	if (ret) {
		pr_debug("setting policy failed\n");
		if (cpufreq_driver->exit)
			cpufreq_driver->exit(policy);
	}
}

* policy : current policy.

* new_policy: policy to be set.

static int cpufreq_set_policy(struct cpufreq_policy *policy,

struct cpufreq_policy *new_policy)

{

int ret = 0, failed = 1;

pr_debug("setting new policy for CPU %u: %u - %u kHz\n", new_policy->cpu,

new_policy->min, new_policy->max);

memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));

if (new_policy->min > policy->user_policy.max

|| new_policy->max < policy->user_policy.min) {

ret = -EINVAL;

goto error_out;

}

/* verify the cpu speed can be set within this limit */

//检测policy对应的cpu 最大最小频率是否符合要求

ret = cpufreq_driver->verify(new_policy);

if (ret)

goto error_out;

/* adjust if necessary - all reasons */

blocking_notifier_call_chain(&cpufreq_policy_notifier_list,

CPUFREQ_ADJUST, new_policy);

/* adjust if necessary - hardware incompatibility*/

blocking_notifier_call_chain(&cpufreq_policy_notifier_list,

CPUFREQ_INCOMPATIBLE, new_policy);

* verify the cpu speed can be set within this limit, which might be

* different to the first one

ret = cpufreq_driver->verify(new_policy);

if (ret)

goto error_out;

/* notification of the new policy */

blocking_notifier_call_chain(&cpufreq_policy_notifier_list,

CPUFREQ_NOTIFY, new_policy);

policy->min = new_policy->min;

policy->max = new_policy->max;

trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);

pr_debug("new min and max freqs are %u - %u kHz\n",

policy->min, policy->max);

if (cpufreq_driver->setpolicy) {//驱动未实现此函数

policy->policy = new_policy->policy;

pr_debug("setting range\n");

ret = cpufreq_driver->setpolicy(new_policy);

} else {

if (new_policy->governor != policy->governor) {//前面设置了policy->governor为NULL,两个肯定不同

/* save old, working values */

struct cpufreq_governor *old_gov = policy->governor;

pr_debug("governor switch\n");

/* end old governor */

if (policy->governor) {

__cpufreq_governor(policy, CPUFREQ_GOV_STOP);

up_write(&policy->rwsem);

__cpufreq_governor(policy,

CPUFREQ_GOV_POLICY_EXIT);

down_write(&policy->rwsem);

}

/* start new governor */

//执行governor init，start，limit函数

policy->governor = new_policy->governor;

if (!__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) {

if (!__cpufreq_governor(policy, CPUFREQ_GOV_START)) {

failed = 0;

} else {

up_write(&policy->rwsem);

__cpufreq_governor(policy,

CPUFREQ_GOV_POLICY_EXIT);

down_write(&policy->rwsem);

}

if (failed) {

/* new governor failed, so re-start old one */

pr_debug("starting governor %s failed\n",

policy->governor->name);

if (old_gov) {

policy->governor = old_gov;

__cpufreq_governor(policy,

CPUFREQ_GOV_POLICY_INIT);

__cpufreq_governor(policy,

CPUFREQ_GOV_START);

}

ret = -EINVAL;

goto error_out;

}

/* might be a policy change, too, so fall through */

}

pr_debug("governor: change or update limits\n");

ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);

}

error_out:

return ret;

}

static int __cpufreq_governor(struct cpufreq_policy *policy,

unsigned int event)

{

int ret;

/* Only must be defined when default governor is known to have latency

restrictions, like e.g. conservative or ondemand.

That this is the case is already ensured in Kconfig

#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE

struct cpufreq_governor *gov = &cpufreq_gov_performance;

#else

struct cpufreq_governor *gov = NULL;

#endif

if (policy->governor->max_transition_latency &&

policy->cpuinfo.transition_latency >

policy->governor->max_transition_latency) {

if (!gov)

return -EINVAL;

else {

printk(KERN_WARNING "%s governor failed, too long"

" transition latency of HW, fallback"

" to %s governor\n",

policy->governor->name,

gov->name);

policy->governor = gov;

}

if (event == CPUFREQ_GOV_POLICY_INIT)

if (!try_module_get(policy->governor->owner))

return -EINVAL;

pr_debug("__cpufreq_governor for CPU %u, event %u\n",

policy->cpu, event);

mutex_lock(&cpufreq_governor_lock);

if ((policy->governor_enabled && event == CPUFREQ_GOV_START)

|| (!policy->governor_enabled

&& (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) {

mutex_unlock(&cpufreq_governor_lock);

return -EBUSY;

}

if (event == CPUFREQ_GOV_STOP)

policy->governor_enabled = false;

else if (event == CPUFREQ_GOV_START)

policy->governor_enabled = true;

mutex_unlock(&cpufreq_governor_lock);

//执行governor主体函数

//初始化时先采用performance模式，以最高频率运行，保证开机速度，

//之后会通过transition notify切换到interactive

ret = policy->governor->governor(policy, event);

if (!ret) {

if (event == CPUFREQ_GOV_POLICY_INIT)

policy->governor->initialized++;

else if (event == CPUFREQ_GOV_POLICY_EXIT)

policy->governor->initialized--;

} else {

/* Restore original values */

mutex_lock(&cpufreq_governor_lock);

if (event == CPUFREQ_GOV_STOP)

policy->governor_enabled = true;

else if (event == CPUFREQ_GOV_START)

policy->governor_enabled = false;

mutex_unlock(&cpufreq_governor_lock);

}

if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) ||

((event == CPUFREQ_GOV_POLICY_EXIT) && !ret))

module_put(policy->governor->owner);

return ret;

}

// Data structures used by the interactive governor. The governor samples the

// CPU workload from a timer and uses the result to choose the CPU frequency;

// the key pieces are the timers below and struct cpufreq_interactive_tunables.

// Per-CPU state of the interactive governor (one instance per possible CPU).

struct cpufreq_interactive_cpuinfo {

// The two timers: cpu_timer is the load-sampling timer, cpu_slack_timer is

// the companion slack timer.

struct timer_list cpu_timer;

struct timer_list cpu_slack_timer;

spinlock_t load_lock; /* protects the next 4 fields */

u64 time_in_idle; // cumulative CPU idle time recorded at the last load update

u64 time_in_idle_timestamp;// timestamp at which time_in_idle was last updated

u64 cputime_speedadj; // accumulated (active time * current frequency)

u64 cputime_speedadj_timestamp; // per the article: set equal to time_in_idle_timestamp each time the sampling timer is (re)started -- set outside the code shown here

u64 last_evaluated_jiffy; // jiffies value at the last timer evaluation / governor start

struct cpufreq_policy *policy;

struct cpufreq_frequency_table *freq_table; // frequency table of the policy's CPU, assigned at governor start

spinlock_t target_freq_lock; /*protects target freq */

unsigned int target_freq;// target frequency chosen by the last evaluation

unsigned int floor_freq;

unsigned int max_freq;

unsigned int min_freq;

u64 floor_validate_time;

u64 local_fvtime; /* per-cpu floor_validate_time */

u64 hispeed_validate_time; /* cluster hispeed_validate_time */

u64 local_hvtime; /* per-cpu hispeed_validate_time */

u64 max_freq_hyst_start_time;

struct rw_semaphore enable_sem;

bool reject_notification;

int governor_enabled;// non-zero while the governor is started on this CPU

struct cpufreq_interactive_tunables *cached_tunables;

int first_cpu;

};

/* Per-CPU governor state, indexed by CPU id through per_cpu(). */

static DEFINE_PER_CPU(struct cpufreq_interactive_cpuinfo, cpuinfo);

/* realtime thread handles frequency scaling */

static struct task_struct *speedchange_task;

/* CPUs with a pending frequency change; guarded by speedchange_cpumask_lock. */

static cpumask_t speedchange_cpumask;

static spinlock_t speedchange_cpumask_lock;

static struct mutex gov_lock;

static int set_window_count;

static int migration_register_count;

static struct mutex sched_lock;

/* Target load. Lower values result in higher CPU speeds. */

#define DEFAULT_TARGET_LOAD 90

static unsigned int default_target_loads[] = {DEFAULT_TARGET_LOAD};

/* Default sampling period: 20 * USEC_PER_MSEC microseconds. */

#define DEFAULT_TIMER_RATE (20 * USEC_PER_MSEC)

#define DEFAULT_ABOVE_HISPEED_DELAY DEFAULT_TIMER_RATE

static unsigned int default_above_hispeed_delay[] = {// 20000 us

DEFAULT_ABOVE_HISPEED_DELAY };

struct cpufreq_interactive_tunables {

int usage_count;//tunable引用计数

/* Hi speed to bump to from lo speed when load burst (default max) */

//负载超过go_hispeed_load时，频率就被增大到此数值，默认为policy初始化时最大值

//这个值是个中间值，高负载持续时间超过above_hispeed_delay，cpu频率继续升高

unsigned int hispeed_freq;

/* Go to hi speed when CPU load at or above this value. */

#define DEFAULT_GO_HISPEED_LOAD 99

//高频阈值，默认是99%,超过此值就提高cpu 频率，否则降频

unsigned long go_hispeed_load;

/* Target load. Lower values result in higher CPU speeds. */

spinlock_t target_loads_lock;

//数组，表示cpu期望的负载，cpu需要调整频率，使得当前负载接近这个值

//这个值越小，cpu频率就越高，此值是取得频率值

unsigned int *target_loads;

int ntarget_loads;//target_loads 数组大小

* The minimum amount of time to spend at a frequency before we can ramp

* down.

#define DEFAULT_MIN_SAMPLE_TIME (80 * USEC_PER_MSEC)//80ms采样一次

unsigned long min_sample_time;//最小采样时间80000us

* The sample rate of the timer used to increase frequency

//当CPU不处于idel状态时，timer_rate作为定时器采样速率来计算CPU的workload

unsigned long timer_rate;

* Wait this long before raising speed above hispeed, by default a

* single timer interval.

spinlock_t above_hispeed_delay_lock;

//当CPU频率大于等于hispeed_freq，并且此时workload仍在不停增加(continued high load)，系统将等待一个above_hispeed_delay的时间，再提升cpu频率

unsigned int *above_hispeed_delay;//此变量是个数组，不同的频率范围，delay时间不同

//default_above_hispeed_delays数组中元素的个数

int nabove_hispeed_delay;

/* Non-zero means indefinite speed boost active */

echo 1 > /sys/devices/system/cpu/cpufreq/interactive/boost

此时会立即将所有CPU的频率提高到至少hispeed_freq.写入0时，根据workload降低频率.默认为0.

boostpulse，每次触发boost功能时，立即拉高所有CPU的频率到hispeed_freq并保持在该频率至少boostpulse_duration的时间，在这段时间以后，根据当前的workload，频率才允许被降低。

int boost_val;

/* Duration of a boot pulse in usecs */

//每次超频的持续时间

int boostpulse_duration_val;

/* End time of boost pulse in ktime converted to usecs */

//超频结束时间

u64 boostpulse_endtime;

bool boosted;//超频，echo 1 > /sys/devices/system/cpu/cpufreq/interactive/boost， CPU的频率提高到至少hispeed_freq

* Max additional time to wait in idle, beyond timer_rate, at speeds

* above minimum before wakeup to reduce speed, or -1 if unnecessary.

当CPU处于idel状态，此时使用一个可延时定时器，

会导致CPU不能从idel状态苏醒来响应定时器.

定时器的最大的可延时时间用timer_slack表示，默认值80000 uS.

此处采用默认值

#define DEFAULT_TIMER_SLACK (4 * DEFAULT_TIMER_RATE)

int timer_slack_val;

bool io_is_busy;

/* scheduler input related flags */

bool use_sched_load;

bool use_migration_notif;

* Whether to align timer windows across all CPUs. When

* use_sched_load is true, this flag is ignored and windows

* will always be aligned.

bool align_windows;

* Stay at max freq for at least max_freq_hysteresis before dropping

* frequency.

unsigned int max_freq_hysteresis;

};

/*
 * cpufreq_interactive_init - module init for the interactive governor.
 *
 * Initialises the per-CPU timers and locks, creates the SCHED_FIFO
 * "cfinteractive" kernel thread that performs the actual frequency changes,
 * and registers the governor with the cpufreq core.
 *
 * Returns the result of cpufreq_register_governor(), or the error encoded in
 * the kthread pointer if the thread could not be created.
 */
static int __init cpufreq_interactive_init(void)
{
	unsigned int i;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };

	/* Initalize per-cpu timers */
	for_each_possible_cpu(i) {
		pcpu = &per_cpu(cpuinfo, i);

		/* Load-sampling timer: deferrable; its data is the CPU id. */
		init_timer_deferrable(&pcpu->cpu_timer);
		pcpu->cpu_timer.function = cpufreq_interactive_timer;
		pcpu->cpu_timer.data = i;

		/* Slack timer: a regular (non-deferrable) timer. */
		init_timer(&pcpu->cpu_slack_timer);
		pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;

		spin_lock_init(&pcpu->load_lock);
		spin_lock_init(&pcpu->target_freq_lock);
		init_rwsem(&pcpu->enable_sem);
	}

	spin_lock_init(&speedchange_cpumask_lock);
	mutex_init(&gov_lock);
	mutex_init(&sched_lock);

	speedchange_task =
		kthread_create(cpufreq_interactive_speedchange_task, NULL,
			       "cfinteractive");
	if (IS_ERR(speedchange_task))
		return PTR_ERR(speedchange_task);

	/* Run the speed-change thread as a real-time (SCHED_FIFO) task. */
	sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);
	get_task_struct(speedchange_task);

	/* NB: wake up so the thread does not look hung to the freezer */
	wake_up_process(speedchange_task);

	/* Register the governor with the cpufreq core. */
	return cpufreq_register_governor(&cpufreq_gov_interactive);
}

/*
 * cpufreq_governor_interactive - governor callback invoked by the cpufreq core.
 * @policy: policy the event applies to.
 * @event: CPUFREQ_GOV_POLICY_INIT, _POLICY_EXIT, _START, _STOP or _LIMITS.
 *
 * POLICY_INIT sets up (or shares) the tunables and the sysfs group,
 * POLICY_EXIT tears them down, START/STOP arm/disarm the per-CPU timers and
 * LIMITS clamps the per-CPU target frequency to the new policy range.
 *
 * Returns 0, or a negative errno from tunables allocation / sysfs creation
 * during POLICY_INIT.
 */
static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
		unsigned int event)
{
	int rc;
	unsigned int j;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct cpufreq_frequency_table *freq_table;
	struct cpufreq_interactive_tunables *tunables;
	unsigned long flags;
	int first_cpu;

	/*
	 * With per-policy governors every policy carries its own tunables
	 * (the article notes Qualcomm enables this through the device tree);
	 * otherwise all policies share common_tunables.
	 */
	if (have_governor_per_policy())
		tunables = policy->governor_data;
	else
		tunables = common_tunables;

	BUG_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT));

	switch (event) {
	case CPUFREQ_GOV_POLICY_INIT:
		if (have_governor_per_policy()) {
			WARN_ON(tunables);
		} else if (tunables) {
			/* Shared tunables already exist: just take a reference. */
			tunables->usage_count++;
			policy->governor_data = tunables;
			return 0;
		}

		/* Record the policy's first CPU in every related CPU's state. */
		first_cpu = cpumask_first(policy->related_cpus);
		for_each_cpu(j, policy->related_cpus)
			per_cpu(cpuinfo, j).first_cpu = first_cpu;

		/*
		 * Reuse cached tunables when available (per the article: those
		 * of CPU 0 or of the policy's first CPU), else allocate new ones.
		 */
		tunables = restore_tunables(policy);
		if (!tunables) {
			tunables = alloc_tunable(policy);
			if (IS_ERR(tunables))
				return PTR_ERR(tunables);
		}

		tunables->usage_count = 1;
		policy->governor_data = tunables;
		if (!have_governor_per_policy()) {
			WARN_ON(cpufreq_get_global_kobject());
			common_tunables = tunables;
		}

		/* Create the governor's sysfs attribute group. */
		rc = sysfs_create_group(get_governor_parent_kobj(policy),
				get_sysfs_attr());
		if (rc) {
			kfree(tunables);
			policy->governor_data = NULL;
			if (!have_governor_per_policy()) {
				common_tunables = NULL;
				cpufreq_put_global_kobject();
			}
			return rc;
		}

		/* First initialisation of the governor: register notifiers. */
		if (!policy->governor->initialized) {
			/* idle enter/exit notifications */
			idle_notifier_register(&cpufreq_interactive_idle_nb);
			/* CPU frequency transition notifications */
			cpufreq_register_notifier(&cpufreq_notifier_block,
					CPUFREQ_TRANSITION_NOTIFIER);
		}

		if (tunables->use_sched_load)
			cpufreq_interactive_enable_sched_input(tunables);

		break;

	case CPUFREQ_GOV_POLICY_EXIT:
		if (!--tunables->usage_count) {
			/* Last initialised user: unregister the notifiers. */
			if (policy->governor->initialized == 1) {
				cpufreq_unregister_notifier(&cpufreq_notifier_block,
						CPUFREQ_TRANSITION_NOTIFIER);
				idle_notifier_unregister(&cpufreq_interactive_idle_nb);
			}

			/* Remove the governor's sysfs attribute group. */
			sysfs_remove_group(get_governor_parent_kobj(policy),
					get_sysfs_attr());

			if (!have_governor_per_policy())
				cpufreq_put_global_kobject();
			common_tunables = NULL;
		}

		policy->governor_data = NULL;

		if (tunables->use_sched_load)
			cpufreq_interactive_disable_sched_input(tunables);

		break;

	case CPUFREQ_GOV_START:
		/* The key step here is starting the load-sampling timers. */
		mutex_lock(&gov_lock);

		/* Frequency table of the policy's CPU. */
		freq_table = cpufreq_frequency_get_table(policy->cpu);
		if (!tunables->hispeed_freq)
			tunables->hispeed_freq = policy->max;

		/* Initialise the state of every CPU managed by the policy. */
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			pcpu->policy = policy;
			pcpu->target_freq = policy->cur;
			pcpu->freq_table = freq_table;
			pcpu->floor_freq = pcpu->target_freq;
			pcpu->floor_validate_time =
				ktime_to_us(ktime_get());
			pcpu->local_fvtime = pcpu->floor_validate_time;
			pcpu->hispeed_validate_time =
				pcpu->floor_validate_time;
			pcpu->local_hvtime = pcpu->floor_validate_time;
			pcpu->max_freq = policy->max;
			pcpu->min_freq = policy->min;
			pcpu->reject_notification = true;
			down_write(&pcpu->enable_sem);
			/* Stop any stale timers before (re)arming. */
			del_timer_sync(&pcpu->cpu_timer);
			del_timer_sync(&pcpu->cpu_slack_timer);
			pcpu->last_evaluated_jiffy = get_jiffies_64();
			cpufreq_interactive_timer_start(tunables, j);
			pcpu->governor_enabled = 1;
			up_write(&pcpu->enable_sem);
			pcpu->reject_notification = false;
		}

		mutex_unlock(&gov_lock);
		break;

	case CPUFREQ_GOV_STOP:
		/* Disable the governor on every CPU and stop its timers. */
		mutex_lock(&gov_lock);
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			pcpu->reject_notification = true;
			down_write(&pcpu->enable_sem);
			pcpu->governor_enabled = 0;
			pcpu->target_freq = 0;
			del_timer_sync(&pcpu->cpu_timer);
			del_timer_sync(&pcpu->cpu_slack_timer);
			up_write(&pcpu->enable_sem);
			pcpu->reject_notification = false;
		}

		mutex_unlock(&gov_lock);
		break;

	case CPUFREQ_GOV_LIMITS:
		/* Re-apply the current frequency, then adjust targets/timers. */
		__cpufreq_driver_target(policy,
				policy->cur, CPUFREQ_RELATION_L);
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);

			down_read(&pcpu->enable_sem);
			if (pcpu->governor_enabled == 0) {
				up_read(&pcpu->enable_sem);
				continue;
			}

			/* Clamp the pending target into [policy->min, policy->max]. */
			spin_lock_irqsave(&pcpu->target_freq_lock, flags);
			if (policy->max < pcpu->target_freq)
				pcpu->target_freq = policy->max;
			else if (policy->min > pcpu->target_freq)
				pcpu->target_freq = policy->min;

			spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);

			/* Minimum was lowered: reschedule the timer. */
			if (policy->min < pcpu->min_freq)
				cpufreq_interactive_timer_resched(j, true);
			pcpu->min_freq = policy->min;

			up_read(&pcpu->enable_sem);

			/* Reschedule timer only if policy->max is raised.
			 * Delete the timers, else the timer callback may
			 * return without re-arm the timer when failed
			 * acquire the semaphore. This race may cause timer
			 * stopped unexpectedly.
			 */
			if (policy->max > pcpu->max_freq) {
				pcpu->reject_notification = true;
				down_write(&pcpu->enable_sem);
				del_timer_sync(&pcpu->cpu_timer);
				del_timer_sync(&pcpu->cpu_slack_timer);
				cpufreq_interactive_timer_resched(j, false);
				up_write(&pcpu->enable_sem);
				pcpu->reject_notification = false;
			}

			pcpu->max_freq = policy->max;
		}
		break;
	}
	return 0;
}

/*继续分析定时器操作，定期计算cpu 当前workload，确定调频的目标频率，调整cpu频率

cpu workload简单来讲就是cpu 活跃时间占采样间隔的百分比

参数data是cpu id ,is_notif=false

static void __cpufreq_interactive_timer(unsigned long data, bool is_notif)

{

u64 now;

unsigned int delta_time;

u64 cputime_speedadj;

int cpu_load;

struct cpufreq_interactive_cpuinfo *pcpu =

&per_cpu(cpuinfo, data);

//cpu 调频信息

struct cpufreq_interactive_tunables *tunables =

pcpu->policy->governor_data;

unsigned int new_freq;

unsigned int loadadjfreq;

unsigned int index;

unsigned long flags;

struct cpufreq_govinfo int_info;

u64 max_fvtime;

if (!down_read_trylock(&pcpu->enable_sem))

return;

if (!pcpu->governor_enabled)

goto exit;

spin_lock_irqsave(&pcpu->load_lock, flags);

pcpu->last_evaluated_jiffy = get_jiffies_64();//获取当前系统启动时间

now = update_load(data);//获取开机总时长

if (tunables->use_sched_load) {//这个值为1 ，表示使用调度器计算出来的负载

* Unlock early to avoid deadlock.

* load_change_callback() for thread migration already

* holds rq lock. Then it locks load_lock to avoid racing

* with cpufreq_interactive_timer_resched/start().

* sched_get_busy() will also acquire rq lock. Thus we

* can't hold load_lock when calling sched_get_busy().

* load_lock used in this function protects time

* and load information. These stats are not used when

* scheduler input is available. Thus unlocking load_lock

* early is perfectly OK.

spin_unlock_irqrestore(&pcpu->load_lock, flags);

//cpu工作总输出

cputime_speedadj = (u64)sched_get_busy(data) *

pcpu->policy->cpuinfo.max_freq;

do_div(cputime_speedadj, tunables->timer_rate);//一个定时周期内的cpu 工作量

//cputime_speedadj=cputime_speedadj/tunables->timer_rate

} else {

delta_time = (unsigned int)//两次统计idle时间的间隔

(now - pcpu->cputime_speedadj_timestamp);

cputime_speedadj = pcpu->cputime_speedadj;//cpu活跃时间跟频率的乘积

spin_unlock_irqrestore(&pcpu->load_lock, flags);

if (WARN_ON_ONCE(!delta_time))

goto rearm;

do_div(cputime_speedadj, delta_time);

}

//kernel不支持浮点运算，才会进行转换 *100运算结果不会出现小数

loadadjfreq = (unsigned int)cputime_speedadj * 100;

int_info.cpu = data;

int_info.load = loadadjfreq / pcpu->policy->max;

int_info.sampling_rate_us = tunables->timer_rate;

//notify通过qcom驱动检查当前workload perf_govinfo_notify

atomic_notifier_call_chain(&cpufreq_govinfo_notifier_list,

CPUFREQ_LOAD_CHANGE, &int_info);

spin_lock_irqsave(&pcpu->target_freq_lock, flags);

//cpu负载跟cpu 工作频率乘积当做一段时间内cpu工作总量

cpu_load = loadadjfreq / pcpu->policy->cur;

//是否有开启超频

tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;

cpu_load 大于go_hispeed_load或者开启超频后，

new_freq需要设置为大于等于tunables->hispeed_freq

if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {

if (pcpu->policy->cur < tunables->hispeed_freq &&

cpu_load <= MAX_LOCAL_LOAD) {//当前频率未达到最大频率，可以直接设置为最大频率

new_freq = tunables->hispeed_freq;

} else {//此时需要选择比hispeed_freq更大的频率

new_freq = choose_freq(pcpu, loadadjfreq);

if (new_freq < tunables->hispeed_freq)

new_freq = tunables->hispeed_freq;

}

} else {//choose_freq 设定新的频率,此时需要降频处理

new_freq = choose_freq(pcpu, loadadjfreq);

}

//检测是否有达到改变频率的条件，尤其是两次采样间隔小于调整到对应频率

//的delay时间的话，就跳过调频

if (cpu_load <= MAX_LOCAL_LOAD &&

pcpu->policy->cur >= tunables->hispeed_freq &&

new_freq > pcpu->policy->cur &&

now - pcpu->hispeed_validate_time <

freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) {//不同频率，调频等待的间隔不同

trace_cpufreq_interactive_notyet(

data, cpu_load, pcpu->target_freq,

pcpu->policy->cur, new_freq);

spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);

goto rearm;

}

pcpu->local_hvtime = now;

//取freq table中大于或等于new_freq的频率中最小的一个频率

if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,

new_freq, CPUFREQ_RELATION_L,

&index)) {

spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);

goto rearm;

}

new_freq = pcpu->freq_table[index].frequency;

//此条件下，同样不需要调频

if (!is_notif && new_freq < pcpu->target_freq &&

now - pcpu->max_freq_hyst_start_time <

tunables->max_freq_hysteresis) {

trace_cpufreq_interactive_notyet(data, cpu_load,

pcpu->target_freq, pcpu->policy->cur, new_freq);

spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);

goto rearm;

}

* Do not scale below floor_freq unless we have been at or above the

* floor frequency for the minimum sample time since last validated.

//当new_freq < pcpu->floor_freq，并且两次floor_validate_time的间隔小于min_sample_time，此时不需要更新频率

max_fvtime = max(pcpu->floor_validate_time, pcpu->local_fvtime);

if (!is_notif && new_freq < pcpu->floor_freq &&

pcpu->target_freq >= pcpu->policy->cur) {

if (now - max_fvtime < tunables->min_sample_time) {

trace_cpufreq_interactive_notyet(

data, cpu_load, pcpu->target_freq,

pcpu->policy->cur, new_freq);

spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);

goto rearm;

}

* Update the timestamp for checking whether speed has been held at

* or above the selected frequency for a minimum of min_sample_time,

* if not boosted to hispeed_freq. If boosted to hispeed_freq then we

* allow the speed to drop as soon as the boostpulse duration expires

* (or the indefinite boost is turned off).

//不超频，并且new_freq>hispeed_freq

if (!tunables->boosted || new_freq > tunables->hispeed_freq) {

pcpu->floor_freq = new_freq;

if (pcpu->target_freq >= pcpu->policy->cur ||

new_freq >= pcpu->policy->cur)

pcpu->local_fvtime = now;

}

if (new_freq == pcpu->policy->max)

pcpu->max_freq_hyst_start_time = now;

if (pcpu->target_freq == new_freq) {

trace_cpufreq_interactive_already(

data, cpu_load, pcpu->target_freq,

pcpu->policy->cur, new_freq);

spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);

goto rearm;

}

trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq,

pcpu->policy->cur, new_freq);

pcpu->target_freq = new_freq;//更新本次计算后最终的目标频率,更新之前表示上次选频的目标频率

spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);

spin_lock_irqsave(&speedchange_cpumask_lock, flags);

cpumask_set_cpu(data, &speedchange_cpumask);//当前cpu加入调频cpu列表

spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);

wake_up_process(speedchange_task);//唤醒调频处理线程,这个线程里面进行真正的调频处理

rearm:

if (!timer_pending(&pcpu->cpu_timer))//跳过此次调频操作，重启定时器

cpufreq_interactive_timer_resched(data, false);

exit:

up_read(&pcpu->enable_sem);

return;

}

static u64 update_load(int cpu)

{

struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, cpu);

struct cpufreq_interactive_tunables *tunables =

pcpu->policy->governor_data;

u64 now;

u64 now_idle;

unsigned int delta_idle;

unsigned int delta_time;

u64 active_time;

//now_idle:cpu自启动后总的idle时间,now:总开机时间,当前时间戳

now_idle = get_cpu_idle_time(cpu, &now, tunables->io_is_busy);

delta_idle = (unsigned int)(now_idle - pcpu->time_in_idle);//上次计算workload时idle差值

delta_time = (unsigned int)(now - pcpu->time_in_idle_timestamp);//距离上次计算负载的时间间隔

if (delta_time <= delta_idle)

active_time = 0;

else

active_time = delta_time - delta_idle;//cpu 活跃时间

//一个定时周期的cpu活跃时间跟当前运行频率的乘积，

//每次定时器启动时cputime_speedadj被设置为0，

//此值可以表示定时周期内cpu工作的总输出

pcpu->cputime_speedadj += active_time * pcpu->policy->cur;

pcpu->time_in_idle = now_idle;//idle时间戳

pcpu->time_in_idle_timestamp = now;//time_in_idle_timestamp:每次计算负载的时间戳

return now;

}

choose_freq函数用来选频，使选频后的系统workload小于或等于target load

核心思想是：选择最小的频率来满足target load

loadadjfreq一段时间内工作量

static unsigned int choose_freq(struct cpufreq_interactive_cpuinfo *pcpu,

unsigned int loadadjfreq)

{

unsigned int freq = pcpu->policy->cur;//当前频率

unsigned int prevfreq, freqmin, freqmax;

unsigned int tl;//target load

int index;

freqmin = 0;

freqmax = UINT_MAX;

do {

prevfreq = freq;

//计算当前频率对应的workload

tl = freq_to_targetload(pcpu->policy->governor_data, freq);

* Find the lowest frequency where the computed load is less

* than or equal to the target load.

//从freq_table中获取最优频率对应的index，取大于等于loadadjfreq / tl　(target freq)的最小值

if (cpufreq_frequency_table_target(

pcpu->policy, pcpu->freq_table, loadadjfreq / tl,

CPUFREQ_RELATION_L, &index))

break;

freq = pcpu->freq_table[index].frequency;

if (freq > prevfreq) {//提高频率

/* The previous frequency is too low. */

freqmin = prevfreq;

if (freq >= freqmax) {

* Find the highest frequency that is less

* than freqmax.

if (cpufreq_frequency_table_target(

pcpu->policy, pcpu->freq_table,

freqmax - 1, CPUFREQ_RELATION_H,

&index))

break;

freq = pcpu->freq_table[index].frequency;

if (freq == freqmin) {

* The first frequency below freqmax

* has already been found to be too

* low. freqmax is the lowest speed

* we found that is fast enough.

freq = freqmax;

break;

}

} else if (freq < prevfreq) {

/* The previous frequency is high enough. */

freqmax = prevfreq;

if (freq <= freqmin) {

* Find the lowest frequency that is higher

* than freqmin.

if (cpufreq_frequency_table_target(

pcpu->policy, pcpu->freq_table,

freqmin + 1, CPUFREQ_RELATION_L,

&index))

break;

freq = pcpu->freq_table[index].frequency;

* If freqmax is the first frequency above

* freqmin then we have already found that

* this speed is fast enough.

if (freq == freqmax)

break;

}

/* If same frequency chosen as previous then done. */

} while (freq != prevfreq);

return freq;

}

CPU的频率设置为所有CPU的pcpu->target_freq值中最大的那一个

static int cpufreq_interactive_speedchange_task(void *data)

{

unsigned int cpu;

cpumask_t tmp_mask;

unsigned long flags;

struct cpufreq_interactive_cpuinfo *pcpu;

while (1) {

set_current_state(TASK_INTERRUPTIBLE);

spin_lock_irqsave(&speedchange_cpumask_lock, flags);

if (cpumask_empty(&speedchange_cpumask)) {//没有需要调频的cpu，调度执行其他task

spin_unlock_irqrestore(&speedchange_cpumask_lock,

flags);

schedule();

if (kthread_should_stop())

break;

spin_lock_irqsave(&speedchange_cpumask_lock, flags);

}

set_current_state(TASK_RUNNING);

tmp_mask = speedchange_cpumask;

cpumask_clear(&speedchange_cpumask);//清空cpumask

spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);

for_each_cpu(cpu, &tmp_mask) {//遍历所有需要调频cpu

unsigned int j;

unsigned int max_freq = 0;

struct cpufreq_interactive_cpuinfo *pjcpu;

u64 hvt = ~0ULL, fvt = 0;

pcpu = &per_cpu(cpuinfo, cpu);

if (!down_read_trylock(&pcpu->enable_sem))

continue;

if (!pcpu->governor_enabled) {

up_read(&pcpu->enable_sem);

continue;

}

//如果多个cpu公用一个policy,找到公用policy 的cpu 目标频率最大的值

for_each_cpu(j, pcpu->policy->cpus) {

pjcpu = &per_cpu(cpuinfo, j);

fvt = max(fvt, pjcpu->local_fvtime);

if (pjcpu->target_freq > max_freq) {

max_freq = pjcpu->target_freq;

hvt = pjcpu->local_hvtime;

} else if (pjcpu->target_freq == max_freq) {

hvt = min(hvt, pjcpu->local_hvtime);

}

for_each_cpu(j, pcpu->policy->cpus) {//写法不够简洁

pjcpu = &per_cpu(cpuinfo, j);

pjcpu->floor_validate_time = fvt;

}

//修改管理policy的cpu clock，共用同一个policy的cpu clock会一起改变

if (max_freq != pcpu->policy->cur) {

//调用驱动调频的接口

__cpufreq_driver_target(pcpu->policy,

max_freq,

CPUFREQ_RELATION_H);

for_each_cpu(j, pcpu->policy->cpus) {//更新cpu的hispeed_validate_time

pjcpu = &per_cpu(cpuinfo, j);

pjcpu->hispeed_validate_time = hvt;

}

trace_cpufreq_interactive_setspeed(cpu,

pcpu->target_freq,

pcpu->policy->cur);

up_read(&pcpu->enable_sem);

}

return 0;

}

莈莈

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
linux 内核动态调频,linux cpufreq interactive调频代码实现下

上一篇文章简单介绍了cpufreq初始化过程，最后会调用到cpufreq_init_policy，在这里面会启动对应的governor。static void cpufreq_init_policy(struct cpufreq_policy *policy){struct cpufreq_policy new_policy;int ret = 0;memcpy(&new_policy, ...
复制链接

扫一扫