【Linux CPUFreq模块】

wulaladamowang

已于 2022-04-11 13:04:43 修改

阅读量5.2k

点赞数 18

文章标签： linux android

于 2022-04-03 00:54:46 首次发布

本文链接：https://blog.csdn.net/wulaladamowang/article/details/123931813

版权

CPUFreq

简介

CPU的硬件特性决定了这个CPU的最高和最低工作频率，它们在sysfs文件系统中用cpuinfo_xxx_freq来表示；我们可以在这个范围内再次定义出一个软件的调节范围，它们用scaling_xxx_freq来表示；根据具体的硬件平台的不同，我们还需要提供一个频率表，这个频率表规定了cpu可以工作的频率值，其受限于cpuinfo_xxx_freq;通过上述数值，cpuFreq系统可以根据当前cpu的负载情况从频率表中选择一个合适的频率供cpu使用，从而实现性能与功耗的要求。
选择合适的频率需要使用governor来实现，对于硬件的操作使用scaling_driver来完成。CPUFreq将一些与平台和具体的调频策略无关的代码抽象出来，形成cpufreq core。内核通过cpufreq_policy记录了当前cpu的一些硬件信息以及软件信息，包括对应的governor；内核中的其它模块需要得到通知信息时，则通过cpufreq notifiers来完成。

模块架构

cpufreq core
cpufreq governor
cpufreq driver

代码

CPUFreq 核心数据结构

/*
 * Per-policy cpufreq state: the set of CPUs sharing one clock, the frequency
 * limits, the attached governor and the bookkeeping around them.
 * Field notes marked "article" are the blog author's annotations, translated.
 */
struct cpufreq_policy {
	/* CPUs sharing clock, require sw coordination */
	cpumask_var_t		cpus;	/* Online CPUs only */
	cpumask_var_t		related_cpus; /* Online + Offline CPUs */
	cpumask_var_t		real_cpus; /* Related and present */

	unsigned int		shared_type; /* ACPI: ANY or ALL affected CPUs
						should set cpufreq */
	unsigned int		cpu;    /* CPU managing this policy, must be online */

	struct clk		*clk;
	/*
	 * article: records the minimum and maximum frequency the CPU hardware
	 * supports, plus the transition latency.
	 */
	struct cpufreq_cpuinfo	cpuinfo;

	unsigned int		min;    /* article: lowest frequency this policy may use, in kHz */
	unsigned int		max;    /* in kHz */
	unsigned int		cur;    /* in kHz, only needed if cpufreq governors are used */
	unsigned int		restore_freq; /* = policy->cur before transition */
	unsigned int		suspend_freq; /* freq to set during suspend */

	/*
	 * article: only takes CPUFREQ_POLICY_POWERSAVE or
	 * CPUFREQ_POLICY_PERFORMANCE, and is only meaningful when the scaling
	 * driver implements the ->setpolicy() callback; the driver then derives
	 * the operating frequency/state from this value. When the driver
	 * implements ->target() instead, the frequency is chosen by the
	 * governor.
	 */
	unsigned int		policy;
	unsigned int		last_policy; /* policy before unplug */
	struct cpufreq_governor	*governor; /* governor attached to this policy */
	/*
	 * article: data structure and context of the cpufreq_governor currently
	 * used by this policy.
	 */
	void			*governor_data;
	char			last_governor[CPUFREQ_NAME_LEN]; /* last governor used */

	struct work_struct	update; /* if update_policy() needs to be called, but you're in IRQ context */

	/*
	 * article: when the policy limits have to be changed temporarily (for
	 * example by thermal protection) and restored later, user_policy keeps
	 * the original (min, max) values.
	 */
	struct cpufreq_user_policy user_policy;
	struct cpufreq_frequency_table	*freq_table; /* frequency table (highlighted by the article) */
	enum cpufreq_table_sorting freq_table_sorted;

	struct list_head        policy_list;
	struct kobject		kobj; /* article: kobject representing this policy in sysfs */
	struct completion	kobj_unregister;

	/*
	 * The rules for this semaphore:
	 * - Any routine that wants to read from the policy structure will
	 *   do a down_read on this semaphore.
	 * - Any routine that will write to the policy structure and/or may take away
	 *   the policy altogether (eg. CPU hotplug), will hold this lock in write
	 *   mode before doing so.
	 */
	struct rw_semaphore	rwsem;

	/*
	 * Fast switch flags:
	 * - fast_switch_possible should be set by the driver if it can
	 *   guarantee that frequency can be changed on any CPU sharing the
	 *   policy and that the change will affect all of the policy CPUs then.
	 * - fast_switch_enabled is to be set by governors that support fast
	 *   frequency switching with the help of cpufreq_enable_fast_switch().
	 */
	bool			fast_switch_possible;
	bool			fast_switch_enabled;

	/*
	 * Preferred average time interval between consecutive invocations of
	 * the driver to set the frequency for this policy.  To be set by the
	 * scaling driver (0, which is the default, means no preference).
	 */
	unsigned int		transition_delay_us;

	/*
	 * Remote DVFS flag (Not added to the driver structure as we don't want
	 * to access another structure from scheduler hotpath).
	 *
	 * Should be set if CPUs can do DVFS on behalf of other CPUs from
	 * different cpufreq policies.
	 */
	bool			dvfs_possible_from_any_cpu;

	 /* Cached frequency lookup from cpufreq_driver_resolve_freq. */
	unsigned int cached_target_freq;
	int cached_resolved_idx;

	/* Synchronization for frequency transitions */
	bool			transition_ongoing; /* Tracks transition status */
	spinlock_t		transition_lock;
	wait_queue_head_t	transition_wait;
	struct task_struct	*transition_task; /* Task which is doing the transition */

	/* cpufreq-stats */
	struct cpufreq_stats	*stats;

	/* For cpufreq driver's internal use */
	void			*driver_data;
};

/*
 * Description of one frequency-scaling governor: its name plus the callbacks
 * the core invokes on a policy. Every callback receives the policy it should
 * act on.
 */
struct cpufreq_governor {
	char	name[CPUFREQ_NAME_LEN];	/* governor name; looked up by find_governor() at registration */
	int	(*init)(struct cpufreq_policy *policy);
	void	(*exit)(struct cpufreq_policy *policy);
	int	(*start)(struct cpufreq_policy *policy);
	void	(*stop)(struct cpufreq_policy *policy);
	void	(*limits)(struct cpufreq_policy *policy);
	/* NOTE(review): presumably back the scaling_setspeed sysfs file - confirm */
	ssize_t	(*show_setspeed)	(struct cpufreq_policy *policy,
					 char *buf);
	int	(*store_setspeed)	(struct cpufreq_policy *policy,
					 unsigned int freq);
	/* For governors which change frequency dynamically by themselves */
	bool			dynamic_switching;
	struct list_head	governor_list;	/* link in cpufreq_governor_list */
	struct module		*owner;
};

/*
 * Callbacks and flags a platform scaling driver hands to the cpufreq core.
 * Notes marked "article" are the blog author's annotations, translated.
 */
struct cpufreq_driver {
	char		name[CPUFREQ_NAME_LEN];
	u8		flags;
	void		*driver_data;

	/* needed by all drivers */
	int		(*init)(struct cpufreq_policy *policy); /* article: lets the core run the driver's mandatory initialisation */
	int		(*verify)(struct cpufreq_policy *policy); /* article: called by the core to check that the policy's parameters are supported by the driver */

	/* define one out of two */
	/*
	 * article: implement ->setpolicy() when the frequency is not selected
	 * by a governor; the system then only supports the
	 * CPUFREQ_POLICY_POWERSAVE and CPUFREQ_POLICY_PERFORMANCE modes.
	 * Otherwise the target callbacks below apply the frequency requested by
	 * the governor (->target() itself is already deprecated in 4.19).
	 */
	int		(*setpolicy)(struct cpufreq_policy *policy);

	/*
	 * On failure, should always restore frequency to policy->restore_freq
	 * (i.e. old freq).
	 */
	int		(*target)(struct cpufreq_policy *policy,
				  unsigned int target_freq,
				  unsigned int relation);	/* Deprecated */
	int		(*target_index)(struct cpufreq_policy *policy,
					unsigned int index);
	unsigned int	(*fast_switch)(struct cpufreq_policy *policy,
				       unsigned int target_freq);

	/*
	 * Caches and returns the lowest driver-supported frequency greater than
	 * or equal to the target frequency, subject to any driver limitations.
	 * Does not set the frequency. Only to be implemented for drivers with
	 * target().
	 */
	unsigned int	(*resolve_freq)(struct cpufreq_policy *policy,
					unsigned int target_freq);

	/*
	 * article: whether to go through an intermediate frequency, and
	 * whether the driver switches to that frequency right away.
	 *
	 * Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION
	 * unset.
	 *
	 * get_intermediate should return a stable intermediate frequency
	 * platform wants to switch to and target_intermediate() should set CPU
	 * to that frequency, before jumping to the frequency corresponding
	 * to 'index'. Core will take care of sending notifications and driver
	 * doesn't have to handle them in target_intermediate() or
	 * target_index().
	 *
	 * Drivers can return '0' from get_intermediate() in case they don't
	 * wish to switch to intermediate frequency for some target frequency.
	 * In that case core will directly call ->target_index().
	 */
	unsigned int	(*get_intermediate)(struct cpufreq_policy *policy,
					    unsigned int index);
	int		(*target_intermediate)(struct cpufreq_policy *policy,
					       unsigned int index);

	/* should be defined, if possible */
	unsigned int	(*get)(unsigned int cpu);

	/* optional */
	int		(*bios_limit)(int cpu, unsigned int *limit);

	int		(*exit)(struct cpufreq_policy *policy);
	void		(*stop_cpu)(struct cpufreq_policy *policy);
	int		(*suspend)(struct cpufreq_policy *policy);
	int		(*resume)(struct cpufreq_policy *policy);

	/* Will be called after the driver is fully initialized */
	void		(*ready)(struct cpufreq_policy *policy);

	struct freq_attr **attr;

	/* platform specific boost support code */
	bool		boost_enabled;
	int		(*set_boost)(int state);
};

CPUFreq governor核心数据结构

/* article: bundles the helper variables needed to compute the CPU load. */
/* Per-CPU structure for demand based switching (dbs). */
struct cpu_dbs_info {
	u64 prev_cpu_idle;
	u64 prev_update_time;
	u64 prev_cpu_nice;
	/*
	 * Used to keep track of load in the previous interval. However, when
	 * explicitly set to zero, it is used as a flag to ensure that we copy
	 * the previous load to the current interval only once, upon the first
	 * wake-up from idle.
	 */
	unsigned int prev_load;
	struct update_util_data update_util;
	struct policy_dbs_info *policy_dbs;
};

cpufreq notifiers

CPUFreq的通知系统使用了内核的标准通知接口。它对外提供了两个通知事件：policy通知和transition通知。
policy通知用于通知其它模块cpu的policy需要改变，每次policy改变时，该通知链上的回调将会用不同的事件参数被调用3次，分别是：
- CPUFREQ_ADJUST 只要有需要，所有的被通知者可以在此时修改policy的限制信息，比如温控系统可能会修改最大允许运行的频率；
- CPUFREQ_INCOMPATIBLE 只是为了避免硬件错误的情况下，可以在该通知中修改policy的限制信息；
- CPUFREQ_NOTIFY 真正切换policy前，该通知会发往所有的被通知者；
transition通知链用于在驱动实施调整cpu的频率时，用于通知相关的注册者。每次调整频率时，该通知会发出两次通知事件：
- CPUFREQ_PRECHANGE 调整前的通知；
- CPUFREQ_POSTCHANGE 完成调整后的通知；
当检测到因系统进入suspend而造成频率被改变时，以下通知消息会被发出：CPUFREQ_RESUMECHANGE；

CPUFreq Core层

初始化

struct kobject *cpufreq_global_kobject;
EXPORT_SYMBOL(cpufreq_global_kobject);//将函数或者变量导出到内核空间，使得内核中的所有代码都可以使用
/*
 * Initialise the cpufreq core. The numbered markers (1)-(3) are the
 * article's own step numbers.
 *
 * Returns 0 on success, -ENODEV when cpufreq is disabled.
 */
static int __init cpufreq_core_init(void)
{
	struct kobject *cpu_root;

	/* (1) nothing to set up when cpufreq has been switched off */
	if (cpufreq_disabled())
		return -ENODEV;

	/* (2) create the "cpufreq" kobject below the cpu subsystem root device */
	cpu_root = &cpu_subsys.dev_root->kobj;
	cpufreq_global_kobject = kobject_create_and_add("cpufreq", cpu_root);
	BUG_ON(!cpufreq_global_kobject);

	/* (3) hook the cpufreq syscore operations */
	register_syscore_ops(&cpufreq_syscore_ops);

	return 0;
}

/* (4) expose "off" as an int module parameter with mode 0444 */
module_param(off, int, 0444);
/* run cpufreq_core_init() at the core initcall level */
core_initcall(cpufreq_core_init);

cpufreq_disabled()返回off的值，即cpufreq模块是否被禁用；off默认是0（不禁用），module_param使得可以在启动/模块加载时给off赋值；
cpu_subsys是在系统中已经构建好的全局变量，dev_root记录了其在bus中对应的device结构，每个设备都有对应的struct device;上述在/sys/devices/system/cpu目录下创建一个名为cpufreq的kobject；
注册一个回调函数，在关机时且cpu0之外的cpu都已停止后关掉cpufreq，确保cpufreq没有持有任何锁和信号量：首先停止每一个governor，然后调用cpufreq driver的suspend函数，最后设置cpufreq_suspended = true；

注册cpufreq_governor

/*
 * cpufreq_register_governor - add @governor to the list of known governors.
 *
 * Returns 0 on success, -EINVAL for a NULL @governor, -ENODEV when cpufreq
 * is disabled and -EBUSY when a governor with the same name already exists.
 */
int cpufreq_register_governor(struct cpufreq_governor *governor)
{
	int err = -EBUSY;

	if (!governor)
		return -EINVAL;
	if (cpufreq_disabled())
		return -ENODEV;

	mutex_lock(&cpufreq_governor_mutex);

	/* a name that is already registered keeps the -EBUSY default */
	if (find_governor(governor->name))
		goto unlock;

	/* (1) link the governor into cpufreq_governor_list */
	list_add(&governor->governor_list, &cpufreq_governor_list);
	err = 0;

unlock:
	mutex_unlock(&cpufreq_governor_mutex);
	return err;
}

即将governor添加到cpufreq_governor_list链表上；

注册cpufreq_driver驱动

/**
 * cpufreq_register_driver - register a CPU Frequency driver
 * @driver_data: A struct cpufreq_driver containing the values
 * submitted by the CPU Frequency driver.
 *
 * Registers a CPU Frequency driver to this core code. This code
 * returns zero on success, -EEXIST when another driver got here first
 * (and isn't unregistered in the meantime).
 *
 * Also returns -ENODEV when cpufreq is disabled or no CPU could be
 * initialised for a non-sticky driver, and -EINVAL when the callback set is
 * inconsistent. The numbered markers (1)-(6) are the article's step numbers.
 */
int cpufreq_register_driver(struct cpufreq_driver *driver_data)
{
	unsigned long flags;
	int ret;

	if (cpufreq_disabled())
		return -ENODEV;

	/*
	 * (1) Mandatory callbacks: ->verify() and ->init(), exactly one of
	 * ->setpolicy() or the ->target()/->target_index() family, and
	 * ->get_intermediate()/->target_intermediate() either both set or
	 * both unset.
	 */
	if (!driver_data || !driver_data->verify || !driver_data->init ||
	    !(driver_data->setpolicy || driver_data->target_index ||
		    driver_data->target) ||
	     (driver_data->setpolicy && (driver_data->target_index ||
		    driver_data->target)) ||
	     (!!driver_data->get_intermediate != !!driver_data->target_intermediate))
		return -EINVAL;

	pr_debug("trying to register driver %s\n", driver_data->name);

	/* Protect against concurrent CPU online/offline. */
	cpus_read_lock();

	write_lock_irqsave(&cpufreq_driver_lock, flags);
	if (cpufreq_driver) { /* (2) only one driver may be registered at a time */
		write_unlock_irqrestore(&cpufreq_driver_lock, flags);
		ret = -EEXIST;
		goto out;
	}
	cpufreq_driver = driver_data;
	write_unlock_irqrestore(&cpufreq_driver_lock, flags);

	/*
	 * (3) ->setpolicy() drivers always get CPUFREQ_CONST_LOOPS; per the
	 * article the flag tells the kernel that loops_per_jiffy and similar
	 * constants do not change with the CPU frequency.
	 */
	if (driver_data->setpolicy)
		driver_data->flags |= CPUFREQ_CONST_LOOPS;

	if (cpufreq_boost_supported()) { /* (4) create the boost sysfs file when boost is supported */
		ret = create_boost_sysfs_file();
		if (ret)
			goto err_null_driver;
	}

	/*
	 * (5) Register the cpufreq subsystem interface; per the article this
	 * walks the CPUs and sets up a cpufreq_policy for each of them.
	 */
	ret = subsys_interface_register(&cpufreq_interface);
	if (ret)
		goto err_boost_unreg;

	if (!(cpufreq_driver->flags & CPUFREQ_STICKY) &&
	    list_empty(&cpufreq_policy_list)) {
		/* if all ->init() calls failed, unregister */
		ret = -ENODEV;
		pr_debug("%s: No CPU initialized for driver %s\n", __func__,
			 driver_data->name);
		goto err_if_unreg;
	}

	/* (6) install the CPU hotplug online/offline callbacks */
	ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
						   "cpufreq:online",
						   cpuhp_cpufreq_online,
						   cpuhp_cpufreq_offline);
	if (ret < 0)
		goto err_if_unreg;
	/* a non-negative return is kept in hp_online; success is reported as 0 */
	hp_online = ret;
	ret = 0;

	pr_debug("driver %s up and running\n", driver_data->name);
	goto out;

	/* Error unwinding: undo the steps above in reverse order. */
err_if_unreg:
	subsys_interface_unregister(&cpufreq_interface);
err_boost_unreg:
	remove_boost_sysfs_file();
err_null_driver:
	write_lock_irqsave(&cpufreq_driver_lock, flags);
	cpufreq_driver = NULL;
	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
out:
	cpus_read_unlock();
	return ret;
}

cpufreq_driver驱动中必须要实现的函数功能；
cpufreq_driver驱动要求只能实现一次驱动注册；
告诉内核loops_per_jiffy或者其他的内核constants不会因为CPU频率的变化而变化;
查看驱动注册是否支持boost，如果支持就在cpufreq下建立boost节点；
遍历cpu，给每个cpu设立一个cpufreq_policy；通过该步骤，每个cpu目录下会有一个cpufreq节点，它是指向cpufreq目录下对应policy的链接；
cpu hotplug时候的回调函数;后面的两个参数为回调函数；

注册cpufreq_driver驱动：subsys_interface_register() 用来建立cpufreq_policy 包括参数的设置以及governor的选择

subsys_interface_register(struct subsys_interface *sif):调用相应的子系统接口的add_dev函数，循环，将子系统的设备进行某项操作：

/*
 * subsys_interface_register - attach @sif to its subsystem.
 *
 * The interface is appended to the subsystem's interface list and, when it
 * provides ->add_dev(), that callback is invoked once for every device the
 * subsystem iterator returns.
 *
 * Returns 0 on success, -ENODEV when @sif or its subsystem is missing and
 * -EINVAL when no reference to the subsystem could be taken.
 */
int subsys_interface_register(struct subsys_interface *sif)
{
	struct subsys_dev_iter iter;
	struct bus_type *subsys;
	struct device *dev;

	if (!sif || !sif->subsys)
		return -ENODEV;

	subsys = bus_get(sif->subsys);
	if (!subsys)
		return -EINVAL;

	mutex_lock(&subsys->p->mutex);
	list_add_tail(&sif->node, &subsys->p->interfaces);

	if (!sif->add_dev)
		goto unlock;

	/* offer every device of the subsystem to the new interface */
	subsys_dev_iter_init(&iter, subsys, NULL, NULL);
	for (dev = subsys_dev_iter_next(&iter); dev;
	     dev = subsys_dev_iter_next(&iter))
		sif->add_dev(dev, sif);
	subsys_dev_iter_exit(&iter);

unlock:
	mutex_unlock(&subsys->p->mutex);

	return 0;
}
EXPORT_SYMBOL_GPL(subsys_interface_register);

cpufreq_interface

/*
 * cpufreq's interface on the cpu subsystem: cpufreq_add_dev() and
 * cpufreq_remove_dev() are the add/remove callbacks for CPU devices.
 */
static struct subsys_interface cpufreq_interface = {
	.name		= "cpufreq",
	.subsys		= &cpu_subsys,
	.add_dev	= cpufreq_add_dev,
	.remove_dev	= cpufreq_remove_dev,
};

subsys_interface_register(struct subsys_interface *sif)，调用子系统接口的add_dev函数

/**
 * cpufreq_add_dev - the cpufreq interface for a CPU device.
 * @dev: CPU device.
 * @sif: Subsystem interface structure pointer (not used)
 *
 * Brings cpufreq online for the CPU when that CPU is online, then links the
 * CPU's device to its policy if a policy exists.
 *
 * Returns 0 on success or the error reported by cpufreq_online().
 */
static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
{
	unsigned cpu = dev->id;
	struct cpufreq_policy *policy;

	dev_dbg(dev, "%s: adding CPU%u\n", __func__, cpu);

	if (cpu_online(cpu)) {
		/* step 2.2 of the article */
		int ret = cpufreq_online(cpu);

		if (ret)
			return ret;
	}

	/* Create sysfs link on CPU registration */
	policy = per_cpu(cpufreq_cpu_data, cpu);
	if (!policy)
		return 0;

	/* step 2.1 of the article: "cpufreq" link from the CPU to its policy */
	add_cpu_dev_symlink(policy, cpu);
	return 0;
}

2.1 add_cpu_dev_symlink():在cpu设备目录下建立名为cpufreq、指向policy的符号链接

/*
 * Create the "cpufreq" sysfs link from @cpu's device to @policy's kobject.
 *
 * Does nothing when the CPU has no device or when its bit was already set in
 * policy->real_cpus. A failed link creation is only logged.
 */
static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)
{
	struct device *dev = get_cpu_device(cpu);
	int err;

	/* step 2.1.1 of the article: the test-and-set also records the CPU */
	if (!dev || cpumask_test_and_set_cpu(cpu, policy->real_cpus))
		return;

	dev_dbg(dev, "%s: Adding symlink\n", __func__);

	/* arguments: (kobj, target, name) */
	err = sysfs_create_link(&dev->kobj, &policy->kobj, "cpufreq");
	if (err)
		dev_err(dev, "cpufreq symlink creation failed\n");
}

2.1.1 cpumask_test_and_set_cpu():检测cpu是否已在cpumask中，并将其置入cpumask

/**
 * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * Thin wrapper around test_and_set_bit() for cpumasks.
 *
 * Returns 1 if @cpu was already set in the old bitmap of @cpumask,
 * else returns 0.
 */
static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
{
	unsigned int bit = cpumask_check(cpu);

	return test_and_set_bit(bit, cpumask_bits(cpumask));
}

2.2 static int cpufreq_online(unsigned int cpu):将cpufreq的policy与sysfs用户接口，governor以及驱动相互关联

static int cpufreq_online(unsigned int cpu)
{
	struct cpufreq_policy *policy;
	bool new_policy;
	unsigned long flags;
	unsigned int j;
	int ret;

	pr_debug("%s: bringing CPU%u online\n", __func__, cpu);

	/* Check if this CPU already has a policy to manage it */
	policy = per_cpu(cpufreq_cpu_data, cpu);
	if (policy) {
		WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus));
		if (!policy_is_inactive(policy))// 判断policy的cpus是否为空，即是在线的cpu是否为空；
			return cpufreq_add_policy_cpu(policy, cpu);// policy已经分配好 1. 暂停governor(首先判断cpufreq_driver是否有target或者target_index函数，只有有，才支持governor调频) 2.设置cpu到policy->cpus;3.启动governor

		/* This is the only online CPU for the policy.  Start over. 当policy的cpus为空时*/
		new_policy = false;
		down_write(&policy->rwsem);
		policy->cpu = cpu;
		policy->governor = NULL;
		up_write(&policy->rwsem);
	} else { // policy还没有分配，就分配内存空间
		new_policy = true;
		policy = cpufreq_policy_alloc(cpu);//分配内存，在cpufreq下创建节点，设立管理cpu为当前cpu
		if (!policy)
			return -ENOMEM;
	}

	cpumask_copy(policy->cpus, cpumask_of(cpu));//同一簇的cpu使用同一个policy

	/* call driver. From then on the cpufreq must be able
	 * to accept all calls to ->verify and ->setpolicy for this CPU
	 */
	ret = cpufreq_driver->init(policy);//调用驱动初始化policy
	if (ret) {
		pr_debug("initialization failed\n");
		goto out_free_policy;
	}

	ret = cpufreq_table_validate_and_sort(policy);// 根据频率进行policy排序
	if (ret)
		goto out_exit_policy;

	down_write(&policy->rwsem);

	if (new_policy) {
		/* related_cpus should at least include policy->cpus. */
		cpumask_copy(policy->related_cpus, policy->cpus);// releated_cpus为同一个簇中的CPU
	}

	/*
	 * affected cpus must always be the one, which are online. We aren't
	 * managing offline cpus here.
	 */
	cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);//cpus为当前簇中在线的cpu

	if (new_policy) {
		policy->user_policy.min = policy->min;
		policy->user_policy.max = policy->max;

		for_each_cpu(j, policy->related_cpus) { // cpufreq_cpu_date就是指向policy
			per_cpu(cpufreq_cpu_data, j) = policy;
			add_cpu_dev_symlink(policy, j);// 为cpuj 创建policy节点
		}
	} else {
		policy->min = policy->user_policy.min;
		policy->max = policy->user_policy.max;
	}

	if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
		policy->cur = cpufreq_driver->get(policy->cpu);
		if (!policy->cur) {
			pr_err("%s: ->get() failed\n", __func__);
			goto out_destroy_policy;
		}
	}

	/*
	 * Sometimes boot loaders set CPU frequency to a value outside of
	 * frequency table present with cpufreq core. In such cases CPU might be
	 * unstable if it has to run on that frequency for long duration of time
	 * and so its better to set it to a frequency which is specified in
	 * freq-table. This also makes cpufreq stats inconsistent as
	 * cpufreq-stats would fail to register because current frequency of CPU
	 * isn't found in freq-table.
	 *
	 * Because we don't want this change to effect boot process badly, we go
	 * for the next freq which is >= policy->cur ('cur' must be set by now,
	 * otherwise we will end up setting freq to lowest of the table as 'cur'
	 * is initialized to zero).
	 *
	 * We are passing target-freq as "policy->cur - 1" otherwise
	 * __cpufreq_driver_target() would simply fail, as policy->cur will be
	 * equal to target-freq.
	 */
	if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK)
	    && has_target()) {
		/* Are we running at unknown frequency ? */
		ret = cpufreq_frequency_table_get_index(policy, policy->cur);
		if (ret == -EINVAL) {
			/* Warn user and fix it */
			pr_warn("%s: CPU%d: Running at unlisted freq: %u KHz\n",
				__func__, policy->cpu, policy->cur);
			ret = __cpufreq_driver_target(policy, policy->cur - 1,
				CPUFREQ_RELATION_L);

			/*
			 * Reaching here after boot in a few seconds may not
			 * mean that system will remain stable at "unknown"
			 * frequency for longer duration. Hence, a BUG_ON().
			 */
			BUG_ON(ret);
			pr_warn("%s: CPU%d: Unlisted initial frequency changed to: %u KHz\n",
				__func__, policy->cpu, policy->cur);
		}
	}

	if (new_policy) {
		ret = cpufreq_add_dev_interface(policy);//为policy创立sysfs系统节点
		if (ret)
			goto out_destroy_policy;

		cpufreq_stats_create_table(policy);//创建频率表等

		write_lock_irqsave(&cpufreq_driver_lock, flags);
		list_add(&policy->policy_list, &cpufreq_policy_list);//加入链表
		write_unlock_irqrestore(&cpufreq_driver_lock, flags);
	}

	ret = cpufreq_init_policy(policy);//获得governor
	if (ret) {
		pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n",
		       __func__, cpu, ret);
		/* cpufreq_policy_free() will notify based on this */
		new_policy = false;
		goto out_destroy_policy;
	}

	up_write(&policy->rwsem);

	kobject_uevent(&policy->kobj, KOBJ_ADD);

	/* Callback for handling stuff after policy is ready */
	if (cpufreq_driver->ready)
		cpufreq_driver->ready(policy);

	pr_debug("initialization complete\n");

	return 0;

out_destroy_policy:
	for_each_cpu(j, policy->real_cpus)
		remove_cpu_dev_symlink(policy, get_cpu_device(j));

	up_write(&policy->rwsem);

out_exit_policy:
	if (cpufreq_driver->exit)
		cpufreq_driver->exit(policy);

out_free_policy:
	cpufreq_policy_free(policy);
	return ret;
}

注册通知链

/**
 *	cpufreq_register_notifier - register a driver with cpufreq
 *	@nb: notifier function to register
 *	@list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER
 *
 *	Adds @nb either to the chain notified around clock rate changes (once
 *	before and once after the transition) or to the chain notified about
 *	changes in cpufreq policy.
 *
 *	A transition notifier is refused with -EBUSY while
 *	cpufreq_fast_switch_count is positive; each successful transition
 *	registration decrements that counter.
 *
 *	This function may sleep, and has the same return conditions as
 *	blocking_notifier_chain_register. It returns -EINVAL when cpufreq is
 *	disabled or @list is unknown.
 */
int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list)
{
	int ret;

	if (cpufreq_disabled())
		return -EINVAL;

	/* two notifier chains exist: policy and transition */
	if (list == CPUFREQ_POLICY_NOTIFIER)
		return blocking_notifier_chain_register(
				&cpufreq_policy_notifier_list, nb);

	if (list != CPUFREQ_TRANSITION_NOTIFIER)
		return -EINVAL;

	mutex_lock(&cpufreq_fast_switch_lock);

	if (cpufreq_fast_switch_count > 0) {
		ret = -EBUSY;
	} else {
		ret = srcu_notifier_chain_register(
				&cpufreq_transition_notifier_list, nb);
		if (!ret)
			cpufreq_fast_switch_count--;
	}

	mutex_unlock(&cpufreq_fast_switch_lock);

	return ret;
}
EXPORT_SYMBOL(cpufreq_register_notifier);

policy通知用于通知其它模块cpu的policy需要改变，每次policy改变时，该通知链上的回调将会用不同的事件参数被调用3次，分别是：
- CPUFREQ_ADJUST 只要有需要，所有的被通知者可以在此时修改policy的限制信息，比如温控系统可能会修改最大允许运行的频率；
- CPUFREQ_INCOMPATIBLE 只是为了避免硬件错误的情况下，可以在该通知中修改policy的限制信息；
- CPUFREQ_NOTIFY 真正切换policy前，该通知会发往所有的被通知者；
transition通知链用于在驱动实施调整cpu的频率时，用于通知相关的注册者。每次调整频率时，该通知会发出两次通知事件：
- CPUFREQ_PRECHANGE 调整前的通知；
- CPUFREQ_POSTCHANGE 完成调整后的通知；

CPUFreq driver层

简介

cpufreq driver主要完成平台相关的CPU频率/电压的控制；主要是定义一个struct cpufreq_driver变量，填充必要的字段，并根据平台的特性，实现其中的回调函数；然后注册到系统中去。

cpufreq_driver的 init函数

代码

从device tree中获取对应的clock，regulator配置最小最大频率等；注册cpufreq_driver驱动；加载该模块时候会执行该函数

cpufreq_driver的verify函数

代码

确定policy->min和policy->max之间至少有一个有效的频率

cpufreq_driver的get函数

代码

获得当前cpu的频率

cpufreq_driver的target系列函数

代码

实际调频调压的操作者：1. 调压调频设置参数,主要是进一步调用对应的cpu的调频调压函数，进一步实现设置频率以及电压；2. 修改policy的对应参数，修改时间，对应的涉及cpu的参数；3. 发出通知链告诉调频过程；

CPUFreq_governor

简介

公共逻辑代码位置：cpufreq_governor.c
检测系统的负载情况，根据当前的负载，选择可供使用的频率

常见的governor

Performance: 性能优先的governor，直接将cpu频率设置为policy->{min,max}中的最大值。一般会被选做默认的governor以节省系统启动时间,之后再切换；
Powersave: 功耗优先的governor，直接将cpu频率设置为policy->{min,max}中的最小值；
Userspace: 由用户空间程序通过scaling_setspeed文件修改频率。一般用作调试；
Ondemand：根据CPU的当前使用率，动态的调节CPU频率；
interactive: 交互式动态调节CPU频率，与Ondemand类似，由谷歌开发并广泛使用于手机平板等设备上;
schedutil:利用负载回调机制，schedutil将自己的调频策略注册到hook中，在负载变化时会调用相应的调频策略甚至执行调频动作；优点是可以将scheduler与调频建立更加紧密的联系，同时提高了性能和功耗的表现；

schedutil调频策略 cpufreq_schedutil.c kernel5.0

调频策略示意图（蜗蜗科技）

注释
- sysfs: 用户接口，一些用户参数设置；暴露参数；
- scheduler负载跟踪；
- schedutil：实际的目标调频的获得；

schedutil的初始化以及启动

/*
 * sugov_init - attach the schedutil governor to @policy.
 *
 * Allocates the per-policy governor state and its kthread, then either
 * shares the already existing global tunables or allocates a fresh tunables
 * object and registers its kobject. The numbered markers are the article's
 * step numbers.
 *
 * Returns 0 on success, -EBUSY when the policy already carries governor
 * data, or another negative errno after undoing the partial setup.
 */
static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data) /* (1) governor data already bound */
		return -EBUSY;

	cpufreq_enable_fast_switch(policy); /* (2) enable fast frequency switching */

	sg_policy = sugov_policy_alloc(policy); /* (3) allocate the per-policy state */
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy); /* (4) article: creates the kthread and binds it to the policy's CPUs */
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) { /* (5) global tunables exist already: share them */
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		/* (6) hook this policy into the shared attribute set */
		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy); /* (7) none yet: allocate tunables */
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	/*
	 * (8) default rate limit. article: taken from the policy's transition
	 * delay when set, otherwise derived from cpuinfo.
	 */
	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	/* (9) create the sysfs node named after the governor */
	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	/* (10) every error path ends here: report the failure */
	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
	unsigned int cpu;

	sg_policy->freq_update_delay_ns	= sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time	= 0;
	sg_policy->next_freq			= 0;
	sg_policy->work_in_progress		= false;
	sg_policy->limits_changed		= false;
	sg_policy->cached_raw_freq		= 0;

	sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);//检测driver flag是否与给定的相同---1

	for_each_cpu(cpu, policy->cpus) {//创建一个sugov_cpu结构体（遍历cpu_policy），每个cpu的单独信息，记录cpu及对应的sg_policy--2
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu			= cpu;
		sg_cpu->sg_policy		= sg_policy;
	}

	if (policy_is_shared(policy))//根据标志 选择函数-----------------------3
		uu = sugov_update_shared;
	else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
		uu = sugov_update_single_perf;
	else
		uu = sugov_update_single_freq;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);//结构体变量--------------4

		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);//注册回调函数----5
	}
	return 0;
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);//根据sg_policy设定的数值更改policy的max以及min----1
		mutex_unlock(&sg_policy->work_lock);
	}

	sg_policy->limits_changed = true;//设置标志位-------------------2
}

schedutil调频的触发时机

CFS负载变化或者RT/DL任务状态更新就可以启动调频，通过调用cpufreq_update_util()函数即可实现，该函数会回调已注册的sugov_update_shared、sugov_update_single_freq或者sugov_update_single_perf进行调频;

/**
 * cpufreq_update_util - Take a note about CPU utilization changes.
 * @rq: Runqueue to carry out the update for.
 * @flags: Update reason flags.
 *
 * Called by the scheduler on the CPU whose utilization is being updated, and
 * only from RCU-sched read-side critical sections.
 *
 * cpufreq, as currently arranged, has to re-evaluate the CPU performance
 * state (frequency/voltage) regularly so it does not stay stuck at a
 * completely inadequate level for too long. Updates triggered from CFS and
 * DL alone cannot guarantee that, because they may not arrive while RT tasks
 * are active all the time (or there are RT tasks only).
 *
 * As a workaround the RT sched class calls this function periodically to
 * trigger extra cpufreq updates. That really is a band-aid; going forward it
 * should be replaced with solutions targeted more specifically at RT tasks.
 */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
	struct update_util_data *data =
		rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
						  cpu_of(rq)));

	/* no hook installed for this CPU */
	if (!data)
		return;

	/* (1) invoke the registered callback with the runqueue clock */
	data->func(data, rq_clock(rq), flags);
}

触发的时机：

DL任务更新；
进出rt队列；
当cfs运行队列中，cpu的任务负载发生变化时；

schedutil调频的决策和频率切换

调频函数在sugov_start()函数中被注册为回调
sugov_update_single函数

/*
 * Utilisation-update callback for a policy with a single CPU: computes the
 * next frequency from the CPU's utilisation and either switches directly
 * (fast switch) or hands the change to the deferred path under update_lock.
 */
static void sugov_update_single(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned long util, max;
	unsigned int next_f;
	/* snapshot taken before get_next_freq() runs; restored below if next_f is overridden */
	unsigned int cached_freq = sg_policy->cached_raw_freq;

	sugov_iowait_boost(sg_cpu, time, flags); /* article: updates the CPU's I/O-wait boost state based on time and flags */
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy); /* NOTE(review): helper not shown; by its name it lets deadline changes bypass the rate limit - confirm */

	if (!sugov_should_update_freq(sg_policy, time)) /* too early for another update */
		return;

	util = sugov_get_util(sg_cpu); /* article: CPU utilisation, taking DL, RT and CFS into account */
	max = sg_cpu->max; /* read after sugov_get_util(), as in the original order */
	util = sugov_iowait_apply(sg_cpu, time, util, max); /* article: recompute util with the I/O boost applied */
	next_f = get_next_freq(sg_policy, util, max); /* article: maps util to a frequency, using 1.25 x util */
	/*
	 * Do not reduce the frequency if the CPU has not been idle
	 * recently, as the reduction is likely to be premature then.
	 */
	if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
		next_f = sg_policy->next_freq;

		/* Restore cached freq as next_freq has changed */
		sg_policy->cached_raw_freq = cached_freq;
	}

	/*
	 * This code runs under rq->lock for the target CPU, so it won't run
	 * concurrently on two different CPUs for the same target and it is not
	 * necessary to acquire the lock in the fast switch case.
	 */
	if (sg_policy->policy->fast_switch_enabled) {
		sugov_fast_switch(sg_policy, time, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy, time, next_f); /* article: decides whether an update is really needed and queues it on a workqueue */
		raw_spin_unlock(&sg_policy->update_lock);
	}
}

sugov_next_freq_shared（）遍历policy对应的util，找最大利用率；

/*
 * Utilisation-update callback for a policy shared by several CPUs. The whole
 * evaluation runs under the policy's update_lock; the next frequency comes
 * from sugov_next_freq_shared().
 */
static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int target;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	/* nothing more to do when no update is due yet */
	if (!sugov_should_update_freq(sg_policy, time))
		goto unlock;

	target = sugov_next_freq_shared(sg_cpu, time);

	if (!sg_policy->policy->fast_switch_enabled)
		sugov_deferred_update(sg_policy, time, target);
	else
		sugov_fast_switch(sg_policy, time, target);

unlock:
	raw_spin_unlock(&sg_policy->update_lock);
}


/*
 * Pick the next frequency for a shared policy: walk every CPU in
 * policy->cpus, keep the (util, max) pair with the highest util/max ratio and
 * feed that pair to get_next_freq().
 */
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long best_util = 0, best_max = 1;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *cand = &per_cpu(sugov_cpu, cpu);
		unsigned long cand_util, cand_max;

		/* util is fetched before ->max is read, as in the original order */
		cand_util = sugov_get_util(cand);
		cand_max = cand->max;
		cand_util = sugov_iowait_apply(cand, time, cand_util, cand_max);

		/* cross-multiplied comparison of cand_util/cand_max vs best_util/best_max */
		if (cand_util * best_max > cand_max * best_util) {
			best_util = cand_util;
			best_max = cand_max;
		}
	}

	return get_next_freq(sg_policy, best_util, best_max);
}