目录
2.1.1.3 thermal_cooling_device注册
一、功能介绍
Linux的Thermal机制是基于Zone为单位的热管理机制,核心包括三个部分:获取区域温度的设备thermal_zone_device、区域降温的设备thermal_cooling_device、温控策略thermal_governor。thermal_governor从thermal_zone_device获取区域温度,然后根据当前温度,决定调用哪个降温设备来为该区域降温。
二、linux thermal框架
Linux Thermal框架可以分为Thermal Core、Thermal Governor、Thermal Cooling、Thermal Driver以及Thermal Device Tree五大部分。
Thermal Core:用于和user space、Thermal Governor、Thermal Driver交互。
Thermal Governor:主要包括gov_bang_bang、gov_fair_share、gov_power_allocator、gov_step_wise、gov_user_space等,最常用的为gov_power_allocator.
Thermal Cooling:主要包括cpufreq_cooling、cpuidle_cooling、devfreq_cooling等。
2.1 thermal core
内核将采集区域温度的设备抽象为结构体struct thermal_zone_device,主要成员包括:char type[]设备名称;int temperature当前温度;int last_temperature上次采集温度;struct thermal_governor *governor对应governor;int polling_delay温度采集时间间隔等等。其中struct thermal_zone_device_ops *ops是采集区域温度设备的操作抽象,包括绑定降温设备、获取设备温度等。
内核头文件 include/linux/thermal.h 中定义了 thermal_zone_device & thermal_zone_device_ops、thermal_governor、thermal_cooling_device & thermal_cooling_device_ops 等结构体。
/* Thermal zone device: the kernel abstraction of a device that samples the temperature of one zone. */
struct thermal_zone_device {
int id; // unique identifier of the device
char type[THERMAL_NAME_LENGTH]; // device name
struct device device; // struct device associated with this zone
struct thermal_attr *trip_temp_attrs; // temperature attributes of the trip points
struct thermal_attr *trip_type_attrs; // type attributes of the trip points
struct thermal_attr *trip_hyst_attrs; // hysteresis attributes of the trip points
void *devdata; // private data supplied by the registering driver
int trips; // trip count passed in at registration
unsigned long trips_disabled; /* bitmap for disabled trips */
int passive_delay;
int polling_delay; // interval between temperature samples
int temperature; // most recently sampled temperature
int last_temperature; // previously sampled temperature
int emul_temperature;
int passive;
unsigned int forced_passive; // flag that forces passive cooling mode
atomic_t need_update; // set to 1 at registration, cleared once the zone has been updated
struct thermal_zone_device_ops *ops; // operations of the zone device
struct thermal_zone_params *tzp; // extra parameters, e.g. the governor name
struct thermal_governor *governor; // thermal control policy
void *governor_data;
struct list_head thermal_instances; // thermal instances linking this zone to its cooling devices
struct idr idr; // ID management for the zone's thermal instances
struct mutex lock;
struct list_head node; // node in the global thermal zone list
struct delayed_work poll_queue; // delayed work used to poll the zone temperature
};
/* Per-zone tuning parameters: governor selection, binding table and PID coefficients. */
struct thermal_zone_params {
char governor_name[THERMAL_NAME_LENGTH];
/*
* a boolean to indicate if the thermal to hwmon sysfs interface
* is required. when no_hwmon == false, a hwmon sysfs interface
* will be created. when no_hwmon == true, nothing will be done
*/
bool no_hwmon;
int num_tbps; /* Number of tbp entries */
struct thermal_bind_params *tbp;
/*
* Sustainable power (heat) that this thermal zone can dissipate in
* mW
*/
u32 sustainable_power;
/*
* Proportional parameter of the PID controller when
* overshooting (i.e., when temperature is above the target;
* the PID controller selects it when control_temp - temperature < 0)
*/
s32 k_po;
/*
* Proportional parameter of the PID controller when
* undershooting (i.e., when temperature is at or below the target)
*/
s32 k_pu;
/* Integral parameter of the PID controller */
s32 k_i;
/* Derivative parameter of the PID controller */
s32 k_d;
/* threshold below which the error is no longer accumulated */
s32 integral_cutoff;
/*
* @slope: slope of a linear temperature adjustment curve.
* Used by thermal zone drivers.
*/
int slope;
/*
* @offset: offset of a linear temperature adjustment curve.
* Used by thermal zone drivers (default 0).
*/
int offset;
};
/* Callbacks a thermal zone driver provides to the thermal core. */
struct thermal_zone_device_ops {
// bind a cooling device to this thermal zone
int (*bind) (struct thermal_zone_device *,
struct thermal_cooling_device *);
// unbind a cooling device from this thermal zone
int (*unbind) (struct thermal_zone_device *,
struct thermal_cooling_device *);
// read the current temperature of the zone
int (*get_temp) (struct thermal_zone_device *, int *);
// read the current operating mode of the zone
int (*get_mode) (struct thermal_zone_device *,
enum thermal_device_mode *);
// set the operating mode of the zone
int (*set_mode) (struct thermal_zone_device *,
enum thermal_device_mode);
// read the type of the given trip point
int (*get_trip_type) (struct thermal_zone_device *, int,
enum thermal_trip_type *);
// read the temperature of the given trip point
int (*get_trip_temp) (struct thermal_zone_device *, int, int *);
// set the temperature of the given trip point
int (*set_trip_temp) (struct thermal_zone_device *, int, int);
// read / set the hysteresis of the given trip point
int (*get_trip_hyst) (struct thermal_zone_device *, int, int *);
int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
int (*get_crit_temp) (struct thermal_zone_device *, int *);
int (*set_emul_temp) (struct thermal_zone_device *, int);
// read the temperature trend
int (*get_trend) (struct thermal_zone_device *, int,
enum thermal_trend *);
int (*notify) (struct thermal_zone_device *, int,
enum thermal_trip_type);
};
// The kernel models a thermal control policy as struct thermal_governor.
// Main members: name[] is the policy name; throttle() makes the throttling decision for a zone/trip.
struct thermal_governor {
char name[THERMAL_NAME_LENGTH];
int (*bind_to_tz)(struct thermal_zone_device *tz);
void (*unbind_from_tz)(struct thermal_zone_device *tz);
int (*throttle)(struct thermal_zone_device *tz, int trip);
struct list_head governor_list; // node in the global governor list
};
// A device that carries out the thermal policy is called a cooling device.
// The kernel models it as struct thermal_cooling_device; struct thermal_cooling_device_ops is its operation set.
struct thermal_cooling_device {
int id; // each thermal_cooling_device has its own id
char type[THERMAL_NAME_LENGTH]; // name
struct device device;
struct device_node *np;
void *devdata; // private data supplied at registration
const struct thermal_cooling_device_ops *ops;
bool updated; /* true if the cooling device does not need update */
struct mutex lock; /* protect thermal_instances list */
struct list_head thermal_instances;
struct list_head node; // node in the global cooling device list
};
/* Callbacks a cooling device driver provides to the thermal core. */
struct thermal_cooling_device_ops {
// get the maximum state, i.e. the number of cooling levels
int (*get_max_state) (struct thermal_cooling_device *, unsigned long *);
// get the current state
int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *);
// set the state
int (*set_cur_state) (struct thermal_cooling_device *, unsigned long);
// get the power currently requested by the device
int (*get_requested_power)(struct thermal_cooling_device *,
struct thermal_zone_device *, u32 *);
// convert a state (cooling level) into the corresponding power
int (*state2power)(struct thermal_cooling_device *,
struct thermal_zone_device *, unsigned long, u32 *);
// convert a power value into the corresponding state (cooling level)
int (*power2state)(struct thermal_cooling_device *,
struct thermal_zone_device *, u32, unsigned long *);
};
2.1.1 初始化
2.1.1.1 thermal_governor注册
以step_wise governor为例,
int thermal_gov_step_wise_register(void)
{
// 调用thermal_core.c中的方法
return thermal_register_governor(&thermal_gov_step_wise);
}
/*
 * Thermal core initialisation: register the governors, the thermal class,
 * the netlink interface and the device-tree thermal zones, in that order.
 * Every failure unwinds the steps that already succeeded, in reverse order.
 *
 * Return: 0 on success, otherwise the error code of the step that failed.
 */
static int __init thermal_init(void)
{
	int err;

	/* register every built-in governor */
	err = thermal_register_governors();
	if (err)
		goto fail;

	err = class_register(&thermal_class);
	if (err)
		goto undo_governors;

	err = genetlink_init();
	if (err)
		goto undo_class;

	err = of_parse_thermal_zones();
	if (err)
		goto undo_netlink;

	/* a failing PM notifier is only reported; init still succeeds */
	err = register_pm_notifier(&thermal_pm_nb);
	if (err)
		pr_warn("Thermal: Can not register suspend notifier, return %d\n",
			err);

	return 0;

undo_netlink:
	genetlink_exit();
undo_class:
	class_unregister(&thermal_class);
undo_governors:
	thermal_unregister_governors();
fail:
	idr_destroy(&thermal_tz_idr);
	idr_destroy(&thermal_cdev_idr);
	mutex_destroy(&thermal_idr_lock);
	mutex_destroy(&thermal_list_lock);
	mutex_destroy(&thermal_governor_lock);
	return err;
}
/*
 * Register the built-in governors one after another: step_wise, fair_share,
 * bang_bang, user_space and finally power_allocator (IPA).
 * Registration stops at the first failure.
 *
 * Return: 0 when all governors registered, otherwise the first error.
 */
static int __init thermal_register_governors(void)
{
	int err = thermal_gov_step_wise_register();

	if (!err)
		err = thermal_gov_fair_share_register();
	if (!err)
		err = thermal_gov_bang_bang_register();
	if (!err)
		err = thermal_gov_user_space_register();
	if (!err)
		err = thermal_gov_power_allocator_register();

	return err;
}
// Register a governor. The governor whose name matches DEFAULT_THERMAL_GOVERNOR
// becomes the system default (def_governor), provided no default is set yet.
// Returns -EINVAL for a NULL governor; err starts as -EBUSY and is cleared when the name is not registered yet.
int thermal_register_governor(struct thermal_governor *governor)
{
int err;
const char *name;
struct thermal_zone_device *pos;
if (!governor)
return -EINVAL;
mutex_lock(&thermal_governor_lock);
err = -EBUSY;
if (__find_governor(governor->name) == NULL) {
err = 0;
// link the governor into thermal_governor_list
list_add(&governor->governor_list, &thermal_governor_list);
if (!def_governor && !strncmp(governor->name,
DEFAULT_THERMAL_GOVERNOR, THERMAL_NAME_LENGTH))
def_governor = governor; // name matches the configured default: remember it as def_governor
}
.......
}
2.1.1.2 thermal_zone_device注册
/*
 * Register a thermal zone device (excerpt; elided parts are marked with dots).
 * Allocates the zone, fills it from the arguments, selects a governor, adds it
 * to thermal_tz_list, tries to bind already-registered cooling devices and
 * triggers a first update.
 */
struct thermal_zone_device *thermal_zone_device_register(const char *type,
int trips, int mask, void *devdata,
struct thermal_zone_device_ops *ops,
struct thermal_zone_params *tzp,
int passive_delay, int polling_delay)
{
struct thermal_zone_device *tz;
enum thermal_trip_type trip_type;
int trip_temp;
int result;
int count;
int passive = 0;
struct thermal_governor *governor;
.........................................................................
// allocate zeroed memory for the zone
tz = kzalloc(sizeof(struct thermal_zone_device), GFP_KERNEL);
.........................................................................
// initialise the idr and lock, then obtain an id for the zone
idr_init(&tz->idr);
mutex_init(&tz->lock);
result = get_idr(&thermal_tz_idr, &thermal_idr_lock, &tz->id);
..........................................................................
strlcpy(tz->type, type ? : "", sizeof(tz->type)); // set the name (empty string when type is NULL)
tz->ops = ops; // operation set
tz->tzp = tzp; // parameters
tz->device.class = &thermal_class;
tz->devdata = devdata;
tz->trips = trips;
tz->passive_delay = passive_delay;
tz->polling_delay = polling_delay; // sampling interval
/* A new thermal zone needs to be updated anyway. */
atomic_set(&tz->need_update, 1);
........................................................................
// pick the governor: by name when zone parameters exist, otherwise the default governor
if (tz->tzp)
governor = __find_governor(tz->tzp->governor_name);
else
governor = def_governor;
.........
// link the zone into thermal_tz_list
mutex_lock(&thermal_list_lock);
list_add_tail(&tz->node, &thermal_tz_list);
mutex_unlock(&thermal_list_lock);
/* try to bind the cooling devices that are already registered */
bind_tz(tz);
thermal_zone_device_reset(tz);
/* Update the new thermal zone and mark it as already updated. */
if (atomic_cmpxchg(&tz->need_update, 1, 0))
thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
return tz;
..........
}
2.1.1.3 thermal_cooling_device注册
/*
 * Register a cooling device that has no device-tree node.
 * Forwards to __thermal_cooling_device_register() with a NULL node.
 */
struct thermal_cooling_device *
thermal_cooling_device_register(char *type, void *devdata,
				const struct thermal_cooling_device_ops *ops)
{
	struct device_node *no_node = NULL;

	return __thermal_cooling_device_register(no_node, type, devdata, ops);
}
static struct thermal_cooling_device *
__thermal_cooling_device_register(struct device_node *np,
char *type, void *devdata,
const struct thermal_cooling_device_ops *ops)
{
struct thermal_cooling_device *cdev;
struct thermal_zone_device *pos = NULL;
int result;
if (type && strlen(type) >= THERMAL_NAME_LENGTH)
return ERR_PTR(-EINVAL);
if (!ops || !ops->get_max_state || !ops->get_cur_state ||
!ops->set_cur_state)
return ERR_PTR(-EINVAL);
// 分配内存
cdev = kzalloc(sizeof(struct thermal_cooling_device), GFP_KERNEL);
if (!cdev)
return ERR_PTR(-ENOMEM);
result = get_idr(&thermal_cdev_idr, &thermal_idr_lock, &cdev->id);
if (result) {
kfree(cdev);
return ERR_PTR(result);
}
strlcpy(cdev->type, type ? : "", sizeof(cdev->type));
mutex_init(&cdev->lock);
INIT_LIST_HEAD(&cdev->thermal_instances);
// 初始化成员,将mtk的ops和devdata赋值给thermal_cooling_device
cdev->np = np;
cdev->ops = ops;
cdev->updated = false;
cdev->device.class = &thermal_class;
cdev->device.groups = cooling_device_attr_groups;
cdev->devdata = devdata;
dev_set_name(&cdev->device, "cooling_device%d", cdev->id);
// 注册device
result = device_register(&cdev->device);
if (result) {
release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
kfree(cdev);
return ERR_PTR(result);
}
/* Add 'this' new cdev to the global cdev list */
// 新的thermal_cooling_device加入到thermal_cdev_list链表
mutex_lock(&thermal_list_lock);
list_add(&cdev->node, &thermal_cdev_list);
mutex_unlock(&thermal_list_lock);
/* Update binding information for 'this' new cdev */
// 尝试绑定到已注册的温度采集设备thermal_zone_device
bind_cdev(cdev);
mutex_lock(&thermal_list_lock);
list_for_each_entry(pos, &thermal_tz_list, node)
if (atomic_cmpxchg(&pos->need_update, 1, 0))
thermal_zone_device_update(pos);
mutex_unlock(&thermal_list_lock);
return cdev;
}
/*
 * Try to bind a newly registered cooling device to every thermal zone in
 * thermal_tz_list. A zone that provides ops->bind decides by itself;
 * otherwise the zone's bind-parameter table (tzp->tbp) is consulted.
 * Runs with thermal_list_lock held for the whole walk.
 */
static void bind_cdev(struct thermal_cooling_device *cdev)
{
	struct thermal_zone_device *tz = NULL;
	const struct thermal_zone_params *params;
	int idx, err;

	mutex_lock(&thermal_list_lock);

	list_for_each_entry(tz, &thermal_tz_list, node) {
		/* the zone driver's own bind callback takes precedence */
		if (tz->ops->bind) {
			err = tz->ops->bind(tz, cdev);
			if (err)
				print_bind_err_msg(tz, cdev, err);
			continue;
		}

		/* no callback: fall back to the parameter table, if any */
		params = tz->tzp;
		if (!params || !params->tbp)
			continue;

		for (idx = 0; idx < params->num_tbps; idx++) {
			struct thermal_bind_params *entry = &params->tbp[idx];

			/* skip entries already bound or without a matcher */
			if (entry->cdev || !entry->match)
				continue;
			/* a non-zero match() result means "not this device" */
			if (entry->match(tz, cdev))
				continue;

			entry->cdev = cdev;
			__bind(tz, entry->trip_mask, cdev,
			       entry->binding_limits,
			       entry->weight);
		}
	}

	mutex_unlock(&thermal_list_lock);
}
2.1.2 温度采集设备与降温设备的联系
同一个温度采集设备可以对应多个降温设备,结构体struct thermal_instance用于连接温度采集设备与降温设备:成员struct thermal_zone_device *tz是对应的温度采集设备,struct thermal_cooling_device *cdev是对应的降温设备,int trip是触发等级(对应一个温度)。当温度采集设备采集的温度达到一定值时,调用对应trip等级的降温设备。
/* Links one thermal zone to one cooling device at a given trip level (excerpt). */
struct thermal_instance {
.................................................................
struct thermal_zone_device *tz; // the thermal zone (temperature-sampling device)
struct thermal_cooling_device *cdev; // the cooling device
int trip; // trip level
struct list_head tz_node; // node linking this instance to the thermal zone
struct list_head cdev_node; // node linking this instance to the cooling device
.................................................................
};
以温度采集设备绑定降温设备为例,当温度采集设备注册时会尝试绑定所有已经注册的降温设备。以CPU为例,bind接口对应的是tscpu_bind(),从代码中可以看出如果降温设备的名称为g_bind0--g_bind9中的一个将会绑定CPU温度采集设备和降温设备。tscpu_bind()接口中也定义了各种名称降温设备对应的触发等级。
/*
 * Try to bind a newly registered thermal zone to the cooling devices in
 * thermal_cdev_list (excerpt; the parameter-table path is elided).
 */
static void bind_tz(struct thermal_zone_device *tz)
{
int i, ret;
struct thermal_cooling_device *pos = NULL;
const struct thermal_zone_params *tzp = tz->tzp;
/* nothing to do without zone parameters and without a bind callback */
if (!tzp && !tz->ops->bind)
return;
mutex_lock(&thermal_list_lock);
if (tz->ops->bind) {
// offer every already-registered cooling device to the zone's bind callback
list_for_each_entry(pos, &thermal_cdev_list, node) {
ret = tz->ops->bind(tz, pos);
if (ret)
print_bind_err_msg(tz, pos, ret);
}
goto exit;
}
...........................................................
}
/*
 * Bind callback of the MTK CPU thermal zone (excerpt).
 * Maps the cooling device name (g_bind0, g_bind1, ...) to a trip level and
 * binds the device at that level; devices with other names are ignored.
 * Returns 0 on success or when the name does not match, -EINVAL when the
 * binding itself fails.
 */
static int tscpu_bind(struct thermal_zone_device *thermal, struct thermal_cooling_device *cdev)
{
int table_val = 0;
if (!strcmp(cdev->type, g_bind0)) {
table_val = 0;
tscpu_config_all_tc_hw_protect(trip_temp[0], tc_mid_trip);
} else if (!strcmp(cdev->type, g_bind1)) {
table_val = 1;
tc_mid_trip = trip_temp[1];
tscpu_config_all_tc_hw_protect(trip_temp[0], tc_mid_trip);
} else if (!strcmp(cdev->type, g_bind2)) {
table_val = 2;
} else if (!strcmp(cdev->type, g_bind3)) {
table_val = 3;
} else if (!strcmp(cdev->type, g_bind4)) {
.....................................................
} else {
return 0;
}
// bind the thermal zone and the cooling device using table_val as the trip level
if (mtk_thermal_zone_bind_cooling_device(thermal, table_val, cdev)) {
tscpu_warn("tscpu_bind error binding cooling dev\n");
return -EINVAL;
}
tscpu_printk("tscpu_bind binding OK, %d\n", table_val);
return 0;
}
温度采集设备知道了触发等级和降温设备,还需要知道触发等级对应的温度。thermal_zone_device_ops的get_trip_temp()用于查询触发等级对应的温度,以mtkcpu为例,所有降温设备的触发温度保存在数组trip_temp中,触发等级就是该数组的下标。
/*
 * get_trip_temp callback of the MTK CPU thermal zone: the trip level is used
 * directly as an index into trip_temp[].
 * NOTE(review): trip is not range-checked here - confirm callers only pass
 * valid trip levels.
 */
static int tscpu_get_trip_temp(struct thermal_zone_device *thermal,
			       int trip, int *temp)
{
	const int level_temp = trip_temp[trip];

	*temp = level_temp;
	return 0;
}
2.1.3 cooling device
2.1.3.1 cpu cooling
cpufreq_state2power:根据cpu cooling state换算cpu power。
/*
 * cpufreq_state2power() - convert a CPU cooling state into a power value
 * @cdev:  the cpufreq cooling device
 * @tz:    thermal zone, forwarded to the static power computation
 * @state: cooling state, used as an index into the frequency table
 * @power: output, static plus dynamic power for that state
 *
 * NOTE(review): @state is not range-checked before indexing freq_table -
 * confirm callers guarantee a valid state.
 *
 * Return: 0 on success (with *power = 0 when none of the device's CPUs is
 * online), -EINVAL when the table entry is 0, or the error returned by
 * get_static_power().
 */
static int cpufreq_state2power(struct thermal_cooling_device *cdev,
			       struct thermal_zone_device *tz,
			       unsigned long state, u32 *power)
{
	struct cpufreq_cooling_device *cool = cdev->devdata;
	cpumask_t online;
	unsigned int nr_online, state_freq;
	u32 dyn, stat;
	int err;

	/* CPUs of this cooling device that are currently online */
	cpumask_and(&online, &cool->allowed_cpus, cpu_online_mask);
	nr_online = cpumask_weight(&online);

	/* None of our cpus are online, so no power */
	if (!nr_online) {
		*power = 0;
		return 0;
	}

	/* frequency that belongs to the requested state */
	state_freq = cool->freq_table[state];
	if (state_freq == 0)
		return -EINVAL;

	/* dynamic power scales with the number of online CPUs */
	dyn = cpu_freq_to_power(cool, state_freq) * nr_online;

	err = get_static_power(cool, tz, state_freq, &stat);
	if (err)
		return err;

	*power = stat + dyn;
	return 0;
}
cpufreq_power2state:根据cpu power换算cpu cooling state.
/*
 * cpufreq_power2state() - convert a power budget into a CPU cooling state
 * @cdev:  the cpufreq cooling device
 * @tz:    thermal zone, forwarded to the static power computation
 * @power: power budget granted to this device
 * @state: output, the cooling state matching the budget
 *
 * The static power at the current frequency is subtracted from the budget,
 * the remainder is normalised by the last recorded load, and the result is
 * mapped to a frequency and then to a cooling state.
 *
 * Return: 0 on success, -ENODEV when none of the device's CPUs is online,
 * -EINVAL when the frequency cannot be mapped to a state, or the error
 * returned by get_static_power().
 */
static int cpufreq_power2state(struct thermal_cooling_device *cdev,
			       struct thermal_zone_device *tz, u32 power,
			       unsigned long *state)
{
	struct cpufreq_cooling_device *cool = cdev->devdata;
	unsigned int cpu, freq_now, freq_goal;
	u32 load, norm_power, stat;
	s32 dyn;
	int err;

	cpu = cpumask_any_and(&cool->allowed_cpus, cpu_online_mask);

	/* None of our cpus are online */
	if (cpu >= nr_cpu_ids)
		return -ENODEV;

	/* static power is evaluated at the frequency the CPU runs at now */
	freq_now = cpufreq_quick_get(cpu);
	err = get_static_power(cool, tz, freq_now, &stat);
	if (err)
		return err;

	/* what is left of the budget is available as dynamic power */
	dyn = power - stat;
	if (dyn <= 0)
		dyn = 0;

	/* a recorded load of 0 is treated as 1 to keep the division valid */
	load = cool->last_load;
	if (!load)
		load = 1;

	norm_power = (dyn * 100) / load;
	freq_goal = cpu_power_to_freq(cool, norm_power);

	*state = cpufreq_cooling_get_level(cpu, freq_goal);
	if (*state == THERMAL_CSTATE_INVALID) {
		dev_warn_ratelimited(&cdev->device,
				     "Failed to convert %dKHz for cpu %d into a cdev state\n",
				     freq_goal, cpu);
		return -EINVAL;
	}

	trace_thermal_power_cpu_limit(&cool->allowed_cpus,
				      freq_goal, *state, power);
	return 0;
}
/*
 * Look up the power that corresponds to @freq in the device's em->table.
 * The table is scanned from index max_level - 1 downwards until an entry
 * with a frequency below @freq is found; the power of the entry right above
 * it is returned (entry 0 when no such entry exists).
 */
static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
			     u32 freq)
{
	int idx = cpufreq_cdev->max_level - 1;

	while (idx >= 0 && freq <= cpufreq_cdev->em->table[idx].frequency)
		idx--;

	/* table lookup */
	return cpufreq_cdev->em->table[idx + 1].power;
}
遍历了一下cpufreq_cdev里的em->table,这个table蕴含了freq和power的对应关系,这个table是跟芯片密切相关的,往往在出厂的时候厂商就已经预制好了。
2.1.4 thermal governor 实现原理
Thermal Governor:主要包括gov_bang_bang、gov_fair_share、gov_power_allocator、gov_step_wise、gov_user_space等。目前最常用的为gov_power_allocator。
IPA核心是利用PID控制器,Thermal Zone的温度作为输入,可分配功耗值作为输出,调节Allocator的频率和电压值。PID控制器在Sustainable Power基础上,根据当前温度和Control Temp之间的差值,来调节可分配功耗值的大小,进而调节Cooling设备的状态,也即调整OPP(Voltage和Frequency组合)。PID控制器的参数P、I、D也存在一定的经验值。需要测试几组不同参数,然后看温度控制效果。
Sustainable Power:是在不同OPP情境下,某一个最大OPP的温度保持基本稳定。比其大者,温度上升明显;比其小者温度保持不变或者下降。这可以通过监测不同OPP对应的温度值,得到一个Sustainable Power。
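control_temp:IPA的目标控制温度,对应最后一个passive trip point(trip_max_desired_temperature)。当温度越过第一个passive trip point(trip_switch_on)后,IPA开始介入并进行OPP调节,力求把温度稳定在control_temp附近。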
dynamic-power-coefficient:动态功耗系数
IPA模型作用:
1)根据当前温度和控制温度差值计算出功耗,然后基于PID控制器调节OPP
2)根据当前环境预估下一个场景功耗值,包括动态功耗和静态功耗:动态功耗=c*v*v*f,静态功耗=v*F(temp)。其中的系数需要根据实测得到的数据进行分析,得到最吻合数据的一组算式。
2.1.4.1 IPA实现流程
1)IPA工作流程
实际的控制流程其实分成几部分:
1.各个器件(thermal cooling device)根据自己的performance状态计算自己需求的状态。
2.IPA governor收集各个器件的状态信息,同时收集器件的温度Tdie、Tskin等。
3.IPA governor根据器件的状态信息和温度信息,同时根据前刻的状态信息,计算出此时能够赋给各个cooling device的状态信息
4.各个cooling device得到自己的状态信息,根据governor要求调整自己的状态。
2)实际governor的工作原理
3)ARM文档手册里的图来说明功耗是如何在请求和实际需求之间分配
当需求小于给予时,大家总是能够相安无事的。矛盾总是发生在需求和给予不能相互满足的时候,即requested_power > granted_power时。我们这里可以看到各个actor的request_power,little和big的granted_power是小于其request_power的,不够分了!
这时我们把目光投到了GPU身上:大哥这个时候不用power!有多的拿来分,那可以的,于是多余的granted_power就被按需分配到了big和Little核心上。
每个actor分多少呢? 那就要看他们的需求超过了多少,需求的越多,最后拿到的也就越多。
具体代码实现可参考divvy_up_power函数。
4)IPA代码流程
代码上主要由三部分组成:
1.thermal_core中主要执行Thermal_governor的轮询逻辑,即去轮询thermal_zone的温度,看看是否需要进行温控措施;
2.gov_power_allocator中主要执行thermal_governor的总体计算过程,即统计各个部件中的状态信息,再计算这个时候可以分配的功耗,最终将实际可分配的功耗计算出来下发给设备;
3.各个cooling_device设备中主要执行的是跟自身设备有关的计算逻辑,主要是计算降温设备所需的功耗、状态到功耗(state2power)、功耗到状态(power2state);
5)IPA代码详解
核心结构体,
/**
* struct thermal_trip - representation of a point in temperature domain
* @temperature: temperature value in millicelsius
* @hysteresis: relative hysteresis in millicelsius
* @type: trip point type
* @priv: pointer to driver data associated with this trip
*/
struct thermal_trip {
int temperature;
int hysteresis;
enum thermal_trip_type type;
void *priv;
};
/**
* struct power_allocator_params - parameters for the power allocator governor
* @allocated_tzp: whether we have allocated tzp for this thermal zone and
* it needs to be freed on unbind
* @err_integral: accumulated error in the PID controller.
* @prev_err: error in the previous iteration of the PID controller.
* Used to calculate the derivative term.
* @sustainable_power: Sustainable power (heat) that this thermal zone can
* dissipate
* @trip_switch_on: first passive trip point of the thermal zone. The
* governor switches on when this trip point is crossed.
* If the thermal zone only has one passive trip point,
* @trip_switch_on should be NULL.
* @trip_max_desired_temperature: last passive trip point of the thermal
* zone. The temperature we are
* controlling for.
*/
struct power_allocator_params {
bool allocated_tzp;
s64 err_integral;
s32 prev_err;
u32 sustainable_power; // sustainable power (heat) the zone can dissipate
const struct thermal_trip *trip_switch_on; // trip at which the governor switches on
const struct thermal_trip *trip_max_desired_temperature; // trip holding the control (target) temperature
};
power_allocator_throttle:IPA启动的入口函数
/*
 * power_allocator_throttle() - throttle callback, entry point of IPA
 * @tz:   thermal zone being evaluated; tz->lock must be held
 * @trip: trip point the core is currently processing
 *
 * Only the call for the last passive trip (trip_max_desired_temperature)
 * does any work. Below the switch-on trip the PID controller is reset and
 * the cooling devices are allowed maximum power; otherwise power is
 * allocated against the control temperature.
 *
 * Return: 0 when nothing had to be allocated, otherwise the result of
 * allocate_power().
 */
static int power_allocator_throttle(struct thermal_zone_device *tz,
				    const struct thermal_trip *trip)
{
	struct power_allocator_params *pa = tz->governor_data;
	const struct thermal_trip *switch_on;
	bool update;

	lockdep_assert_held(&tz->lock);

	/*
	 * We get called for every trip point but we only need to do
	 * our calculations once
	 */
	if (trip != pa->trip_max_desired_temperature)
		return 0;

	switch_on = pa->trip_switch_on;

	/* at or above the switch-on trip (or no such trip): control actively */
	if (!switch_on || tz->temperature >= switch_on->temperature) {
		tz->passive = 1;
		return allocate_power(tz,
				      pa->trip_max_desired_temperature->temperature);
	}

	/* still below the switch-on trip: lift all limits and start afresh */
	update = tz->last_temperature >= switch_on->temperature;
	tz->passive = 0;
	reset_pid_controller(pa);
	allow_maximum_power(tz, update);

	return 0;
}
// allocate_power is the core of IPA control: it collects the power requests,
// computes the power budget via the PID controller, divides the budget among
// the actors and applies the result.
// Returns -ENODEV when the zone has no power actor on the control trip,
// -ENOMEM when the scratch arrays cannot be allocated, 0 otherwise.
static int allocate_power(struct thermal_zone_device *tz, int control_temp)
{
struct thermal_instance *instance;
struct power_allocator_params *params = tz->governor_data;
const struct thermal_trip *trip_max_desired_temperature =
params->trip_max_desired_temperature;
/* req_power: power each actor requests, derived from its own current state */
/* max_power: maximum power each actor can be given */
/* granted_power: power each actor is granted in the current iteration */
/* extra_actor_power: scratch array divvy_up_power() uses while redistributing surplus power */
/* weighted_req_power: requested power after applying the cooling device's weight */
/* power_range: power budget produced by the PID controller */
u32 *req_power, *max_power, *granted_power, *extra_actor_power;
u32 *weighted_req_power;
u32 total_req_power, max_allocatable_power, total_weighted_req_power;
u32 total_granted_power, power_range;
int i, num_actors, total_weight, ret = 0;
num_actors = 0;
total_weight = 0;
// 1. walk all thermal instances of the zone and count the power actors
//    bound to the control trip (num_actors) and the sum of their weights (total_weight)
list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
if ((instance->trip == trip_max_desired_temperature) &&
// only cooling devices that are power actors take part
cdev_is_power_actor(instance->cdev)) {
num_actors++;
total_weight += instance->weight;
}
}
if (!num_actors)
return -ENODEV;
/*
* We need to allocate five arrays of the same size:
* req_power, max_power, granted_power, extra_actor_power and
* weighted_req_power. They are going to be needed until this
* function returns. Allocate them all in one go to simplify
* the allocation and deallocation logic.
*/
BUILD_BUG_ON(sizeof(*req_power) != sizeof(*max_power));
BUILD_BUG_ON(sizeof(*req_power) != sizeof(*granted_power));
BUILD_BUG_ON(sizeof(*req_power) != sizeof(*extra_actor_power));
BUILD_BUG_ON(sizeof(*req_power) != sizeof(*weighted_req_power));
req_power = kcalloc(num_actors * 5, sizeof(*req_power), GFP_KERNEL);
if (!req_power)
return -ENOMEM;
/* carve the single allocation into five consecutive arrays of num_actors entries */
max_power = &req_power[num_actors];
granted_power = &req_power[2 * num_actors];
extra_actor_power = &req_power[3 * num_actors];
weighted_req_power = &req_power[4 * num_actors];
i = 0;
total_weighted_req_power = 0;
total_req_power = 0;
max_allocatable_power = 0;
// 2. collect the requested and maximum power of every power actor
list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
int weight;
struct thermal_cooling_device *cdev = instance->cdev;
if (instance->trip != trip_max_desired_temperature)
continue;
if (!cdev_is_power_actor(cdev))
continue;
// ask the cooling device for the power it currently requests
// NOTE(review): on failure this actor is skipped without advancing i, while
// the apply loop below advances i for every power actor - the two loops can
// then index different actors; confirm whether that is intended.
if (cdev->ops->get_requested_power(cdev, &req_power[i]))
continue;
/* with no weights configured every actor gets the neutral weight 1.0 (fixed point) */
if (!total_weight)
weight = 1 << FRAC_BITS;
else
weight = instance->weight;
weighted_req_power[i] = frac_to_int(weight * req_power[i]);
// power of the cooling device at state instance->lower, used as its maximum power
if (cdev->ops->state2power(cdev, instance->lower,
&max_power[i]))
continue;
total_req_power += req_power[i];
max_allocatable_power += max_power[i];
total_weighted_req_power += weighted_req_power[i];
i++;
}
// 3. power budget after PID control
power_range = pid_controller(tz, control_temp, max_allocatable_power);
// 4. divide the budget among the actors according to their requests
divvy_up_power(weighted_req_power, max_power, num_actors,
total_weighted_req_power, power_range, granted_power,
extra_actor_power);
total_granted_power = 0;
i = 0;
list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
if (instance->trip != trip_max_desired_temperature)
continue;
if (!cdev_is_power_actor(instance->cdev))
continue;
/* 5. hand the computed power to each actor; the actor itself converts
* power into a cooling state
*/
power_actor_set_power(instance->cdev, instance,
granted_power[i]);
total_granted_power += granted_power[i];
i++;
}
trace_thermal_power_allocator(tz, req_power, total_req_power,
granted_power, total_granted_power,
num_actors, power_range,
max_allocatable_power, tz->temperature,
control_temp - tz->temperature);
/* frees all five arrays, which share this single allocation */
kfree(req_power);
return ret;
}
/* Governor descriptor of the power allocator (IPA); throttle is its entry point. */
static struct thermal_governor thermal_gov_power_allocator = {
.name = "power_allocator",
.bind_to_tz = power_allocator_bind,
.unbind_from_tz = power_allocator_unbind,
.throttle = power_allocator_throttle,
};
THERMAL_GOVERNOR_DECLARE(thermal_gov_power_allocator);
pid_controller:计算出PID参数、sustainable_power、power_range ,返回PID后的power,即power_range
/**
* pid_controller() - PID controller
* @tz: thermal zone we are operating in
* @control_temp: the target temperature in millicelsius
* @max_allocatable_power: maximum allocatable power for this thermal zone
*
* This PID controller increases the available power budget so that the
* temperature of the thermal zone gets as close as possible to
* @control_temp and limits the power if it exceeds it. k_po is the
* proportional term when we are overshooting, k_pu is the
* proportional term when we are undershooting. integral_cutoff is a
* threshold below which we stop accumulating the error. The
* accumulated error is only valid if the requested power will make
* the system warmer. If the system is mostly idle, there's no point
* in accumulating positive error.
*
* Return: The power budget for the next period.
*/
static u32 pid_controller(struct thermal_zone_device *tz,
int control_temp,
u32 max_allocatable_power)
{
s64 p, i, d, power_range;
s32 err, max_power_frac;
u32 sustainable_power;
struct power_allocator_params *params = tz->governor_data;
max_power_frac = int_to_frac(max_allocatable_power);
// sustainable power for the given control temperature
sustainable_power = get_sustainable_power(tz, params, control_temp);
// error between the control temperature and the measured temperature (fixed point)
err = control_temp - tz->temperature;
err = int_to_frac(err);
/* Calculate the proportional term */
// k_po is used when the temperature is above the target (err < 0), k_pu otherwise
p = mul_frac(err < 0 ? tz->tzp->k_po : tz->tzp->k_pu, err);
/*
* Calculate the integral term
*
* if the error is less than cut off allow integration (but
* the integral is limited to max power)
*/
i = mul_frac(tz->tzp->k_i, params->err_integral);
if (err < int_to_frac(tz->tzp->integral_cutoff)) {
s64 i_next = i + mul_frac(tz->tzp->k_i, err);
if (abs(i_next) < max_power_frac) {
i = i_next;
params->err_integral += err;
}
}
/*
* Calculate the derivative term
*
* We do err - prev_err, so with a positive k_d, a decreasing
* error (i.e. driving closer to the line) results in less
* power being applied, slowing down the controller)
*/
d = mul_frac(tz->tzp->k_d, err - params->prev_err);
d = div_frac(d, jiffies_to_msecs(tz->passive_delay_jiffies));
params->prev_err = err;
// sum of the proportional, integral and derivative terms
power_range = p + i + d;
/* feed-forward the known sustainable dissipatable power */
power_range = sustainable_power + frac_to_int(power_range);
// clamp the budget into [0, max_allocatable_power]
power_range = clamp(power_range, (s64)0, (s64)max_allocatable_power);
trace_thermal_power_allocator_pid(tz, frac_to_int(err),
frac_to_int(params->err_integral),
frac_to_int(p), frac_to_int(i),
frac_to_int(d), power_range);
return power_range;
}
divvy_up_power:按需分配power
/**
 * divvy_up_power() - share the power budget between the actors
 * @req_power: power requested by each actor
 * @max_power: upper power limit of each actor
 * @num_actors: number of entries in each of the arrays
 * @total_req_power: sum of all entries of @req_power
 * @power_range: total budget to hand out
 * @granted_power: output array, receives the grant of each actor
 * @extra_actor_power: caller-provided scratch array; on return it holds,
 *			per actor, the headroom left after the first pass
 *
 * First pass: every actor receives a slice of @power_range proportional
 * to its share of @total_req_power, capped at its own maximum. Whatever
 * was cut off by the caps is collected as surplus.
 *
 * Second pass (only if there is surplus): the surplus, limited to the
 * total remaining headroom, is spread over the actors in proportion to
 * the headroom each of them still has.
 *
 * The arrays are expected to be distinct and allocated by the caller.
 */
static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors,
			   u32 total_req_power, u32 power_range,
			   u32 *granted_power, u32 *extra_actor_power)
{
	u32 surplus = 0;
	u32 headroom_total = 0;
	int n;

	/* Nobody requested anything: avoid dividing by zero below. */
	if (total_req_power == 0)
		total_req_power = 1;

	for (n = 0; n < num_actors; n++) {
		u64 scaled = (u64)req_power[n] * power_range;
		u32 cap = max_power[n];
		u32 grant = DIV_ROUND_CLOSEST_ULL(scaled, total_req_power);

		/* cut the grant at the actor's maximum, remember the excess */
		if (grant > cap) {
			surplus += grant - cap;
			grant = cap;
		}

		granted_power[n] = grant;
		extra_actor_power[n] = cap - grant;
		headroom_total += extra_actor_power[n];
	}

	if (surplus == 0)
		return;

	/* never hand out more than the actors can still take */
	if (surplus > headroom_total)
		surplus = headroom_total;

	if (headroom_total == 0)
		return;

	/* spread the surplus in proportion to each actor's headroom */
	for (n = 0; n < num_actors; n++) {
		u64 scaled = (u64)extra_actor_power[n] * surplus;

		granted_power[n] += DIV_ROUND_CLOSEST_ULL(scaled,
							  headroom_total);
	}
}
power_actor_set_power:把已经计算好的power分配到各个actor上,由各个actor来完成功耗到状态的转换工作
/**
 * power_actor_set_power() - limit the maximum power a cooling device consumes
 * @cdev: pointer to &thermal_cooling_device
 * @instance: thermal instance to update
 * @power: the power in milliwatts
 *
 * Converts @power into a cooling state through the device's power2state
 * callback, clamps that state to the instance's [lower, upper] range,
 * stores it as the instance target and then updates the cooling device
 * under its lock.
 *
 * Return: 0 on success, or the error reported by the power2state callback.
 */
static int power_actor_set_power(struct thermal_cooling_device *cdev,
				 struct thermal_instance *instance, u32 power)
{
	unsigned long wanted;
	int err = cdev->ops->power2state(cdev, power, &wanted);

	if (err)
		return err;

	/* keep the requested state inside the limits of this binding */
	instance->target = clamp_val(wanted, instance->lower, instance->upper);

	mutex_lock(&cdev->lock);
	__thermal_cdev_update(cdev);
	mutex_unlock(&cdev->lock);

	return 0;
}