当前位置：首页 > news >正文

linux thermal framework(4)_thermal governor

news 2025/8/19 9:31:22

原文：蜗窝科技linux thermal framework(4)_thermal governor

1. 介绍

thermal governor是通过一定算法控制cooling device状态来控温的。在这篇文章中，我们使用一个简单的step_wise governor来说明整个过程。

2. thermal governor相关的API以及功能分析

2.1 struct thermal_governor

/** * struct thermal_governor - structure that holds thermal governor information * @name:   name of the governor * @bind_to_tz: callback called when binding to a thermal zone.  If it *      returns 0, the governor is bound to the thermal zone, *      otherwise it fails. * @unbind_from_tz: callback called when a governor is unbound from a *          thermal zone. * @throttle:   callback called for every trip point even if temperature is *      below the trip point temperature * @governor_list:  node in thermal_governor_list (in thermal_core.c) */struct thermal_governor {    //1）name, 每个governor会有一个名字，在温控的governors中，常见的有power_allocator/userspace/step_wise    char name[THERMAL_NAME_LENGTH];    //2）bind_to_tz, governor绑定thermal_zone的回调函数    int (*bind_to_tz)(struct thermal_zone_device *tz);    //3）unbind_from_tz，governor和thermal_zone解绑的回调函数    void (*unbind_from_tz)(struct thermal_zone_device *tz);    int (*throttle)(struct thermal_zone_device *tz, int trip);    //4)governor_list，thermal core通过一个链表来管理所有的governors，这个是该governor在链表中的节点    struct list_head    governor_list;    ANDROID_KABI_RESERVE(1);};

2.2 thermal_governor的注册

linux在lds中静态定义了一个governor_thermal_table,每个thermal governor会在这个table中添加一个entry，以step_wise为例：

kernel-6.6/drivers/thermal/gov_step_wise.c

/*  * 如果内核配置启用了 THERMAL（温度控制模块），则定义 THERMAL_TABLE 宏， * 否则定义为空（避免未启用时产生无效代码） */#ifdef CONFIG_THERMAL    /*      * THERMAL_TABLE 宏定义：     * 用于在内存中按 8 字节对齐方式放置温度控制相关的数据表，     * 并通过 BOUNDED_SECTION_POST_LABEL 标记其起始和结束地址     */    #define THERMAL_TABLE(name)                     \        . = ALIGN(8);                           /* 按 8 字节对齐当前地址 */ \        BOUNDED_SECTION_POST_LABEL(__##name##_thermal_table,  /* 定义符号表起始 */ \                           __##name##_thermal_table,, _end)   /* _end 表示结束位置 */#else    /* 如果未启用 THERMAL，则 THERMAL_TABLE 宏不执行任何操作 */    #define THERMAL_TABLE(name)#endif/*  * 定义 step_wise 温控策略的结构体： * 该策略通过逐步调整冷却设备（如风扇/CPU调频）来控制温度 */static struct thermal_governor thermal_gov_step_wise = {    .name   = "step_wise",      /* 策略名称（用于匹配和调试） */    .manage = step_wise_manage, /* 核心管理函数指针，实现温度调节逻辑 */};/*  * 向内核注册此温控策略： * THERMAL_GOVERNOR_DECLARE 宏会将结构体放入特定内存段（如 __thermal_governor）， * 供内核初始化时调用 */THERMAL_GOVERNOR_DECLARE(thermal_gov_step_wise);

thermal core在初始化的过程中，会遍历governor_thermal_table中所有的entry（即所有的governor），将其加进governor_list中，并比较这个governor和DEFAULT_THERMAL_GOVERNOR的名字是否相同：相同的话，就将系统默认的governor设置为这个governor。随后遍历所有的thermal_zone，如果这个governor的名字和thermal zone本身配置的governor名字相同，则会将其设置为该thermal zone的governor。

对governor来说，主要的回调函数就是manage：thermal zone的monitor每次check thermal zone的温度时，都会调用该thermal zone对应的governor->manage函数。（注意：2.1节列出的结构体中对应的回调名为throttle，与这里的manage不一致，应是取自不同版本的内核。）我们可以看下step_wise的manage函数是怎么调温的。

2.3 step_wise温控算法

首先我们还是用这张图结合stepwise的算法来描述以下过程：

/** * step_wise_manage - 温控策略核心管理函数（逐步调节） * @tz: 指向 thermal_zone_device 的指针，代表一个温度监控区域 * * 功能：根据温度变化趋势逐步调整冷却设备状态（如风扇转速/CPU频率）。 *      若温度上升，则逐级增强冷却；若温度下降，则逐级恢复性能。 */static void step_wise_manage(struct thermal_zone_device *tz){    const struct thermal_trip_desc *td;  // 温度触发点描述符    struct thermal_instance *instance;   // 冷却设备实例    /* 锁断言：确保调用时已持有温度区域的锁 */    lockdep_assert_held(&tz->lock);    /*     * 核心调节逻辑：     * 1. 遍历所有温度触发点（trip point），跳过无效/危险温度点     * 2. 对每个有效触发点，更新其关联冷却设备的阈值状态     * 3. 最终统一更新所有冷却设备状态     */    for_each_trip_desc(tz, td) {        const struct thermal_trip *trip = &td->trip;        /* 跳过以下触发点：         * - 未配置有效温度值（THERMAL_TEMP_INVALID）         * - 关键温度（CRITICAL，可能触发紧急关机）         * - 过热温度（HOT，可能触发强制降频）         */        if (trip->temperature == THERMAL_TEMP_INVALID ||            trip->type == THERMAL_TRIP_CRITICAL ||            trip->type == THERMAL_TRIP_HOT)            continue;        /* 更新当前触发点的温度状态（根据趋势调整阈值） */        thermal_zone_trip_update(tz, td, td->threshold);    }    /* 遍历所有触发点及其关联的冷却设备实例 */    for_each_trip_desc(tz, td) {        list_for_each_entry(instance, &td->thermal_instances, trip_node)            /* 通知冷却设备更新状态（如调整风扇档位） */            thermal_cdev_update(instance->cdev);    }}

1）首先遍历这个thermal zone所有的trip:for_each_trip_desc,过滤掉其中无效或者档位过高的trip，然后调用thermal_zone_trip_update这个函数来更新trip信息，为选择最佳cooling device state做准备

2）在做好更新后，重新遍历trip，对每个trip下的所有thermal instance调用thermal_cdev_update，更新其对应cooling device的状态

继续看下thermal_zone_trip_update是如何更新trip信息的（注意：下面这份代码的函数签名为(tz, trip_id)两个参数，与上文step_wise_manage中三个参数的调用形式不同，应是取自不同版本的内核，但调节逻辑可以对照理解）：

kernel-6.6/drivers/thermal/gov_step_wise.c

/** * thermal_zone_trip_update - 更新温控触发点状态并调整冷却设备 * @tz: 指向thermal_zone_device的指针，代表一个温度监控区域 * @trip_id: 要处理的温度触发点ID * * 功能：根据温度趋势和当前温度与触发点的关系，动态调整冷却设备状态 *       支持被动冷却（如CPU降频）和主动冷却（如风扇调速）两种模式 */static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_id){    // 获取指定触发点的配置信息    const struct thermal_trip *trip = &tz->trips[trip_id];    enum thermal_trend trend;       // 温度变化趋势（上升/下降/稳定）    struct thermal_instance *instance; // 冷却设备实例    bool throttle = false;           // 是否需要抑制性能的标志    int old_target;                  // 保存设备之前的控制状态    // 获取温度变化趋势（通过历史温度数据分析）    trend = get_tz_trend(tz, trip_id);    /* 基础节流判断：当前温度超过触发温度时激活节流 */    if (tz->temperature >= trip->temperature) {        throttle = true;        // 记录跟踪点（用于ftrace等性能分析工具）        trace_thermal_zone_trip(tz, trip_id, trip->type);    }    /* 调试日志：打印触发点关键参数 */    dev_dbg(&tz->device, "Trip%d[type=%d,temp=%d]:trend=%d,throttle=%d\n",        trip_id, trip->type, trip->temperature, trend, throttle);    /* 遍历该温控区域下的所有冷却设备实例 */    list_for_each_entry(instance, &tz->thermal_instances, tz_node) {        // 跳过不属于当前触发点的实例        if (instance->trip != trip)            continue;        // 保存当前状态用于后续比较        old_target = instance->target;        // 计算新目标状态（考虑温度趋势和节流需求）        instance->target = get_target_state(instance, trend, throttle);        /* 调试日志：显示状态变化 */        dev_dbg(&instance->cdev->device, "old_target=%d, target=%d\n",                old_target, (int)instance->target);        /* 跳过无变化且已初始化的实例（优化性能） */        if (instance->initialized && old_target == instance->target)            continue;        /*          * 被动冷却处理（如CPU调频）：         * 当设备从未激活状态变为激活时，增加被动冷却计数         */        if (old_target == THERMAL_NO_TARGET &&            instance->target != THERMAL_NO_TARGET)            update_passive_instance(tz, trip->type, 1); // 增加计数        /*          * 当设备从激活状态变为未激活时，减少被动冷却计数          */        else if (old_target != THERMAL_NO_TARGET &&               instance->target == THERMAL_NO_TARGET)       
     update_passive_instance(tz, trip->type, -1); // 减少计数        // 标记实例已完成初始化        instance->initialized = true;        /* 标记冷却设备需要更新状态 */        mutex_lock(&instance->cdev->lock);        instance->cdev->updated = false; // 触发后续thermal_cdev_update()        mutex_unlock(&instance->cdev->lock);    }}

1）trend = get_tz_trend，首先得到温度趋势，上升：THERMAL_TREND_RAISING or 下降：THERMAL_TREND_DROPPING

2）throttle：是否达到trip限制的温度

3）遍历这个trip所有的thermal instances，调用get_target_state获得目标状态；如果目标状态和原有状态不同，就将cooling device的updated标志设置为false，在后面的thermal_cdev_update中再更新为target state

get_target_state是获取目标状态的主函数：

/** * get_target_state - 根据温度趋势和节流需求计算冷却设备的目标状态 * @instance: 温控实例（关联特定触发点和冷却设备） * @trend: 当前温度变化趋势（上升/下降/稳定） * @throttle: 是否需要抑制性能（温度超过触发点时设为true） * * 返回值: 冷却设备的目标状态值（或THERMAL_NO_TARGET表示不激活） * * 核心逻辑： * 1. 温度高于触发点时： *    a. 趋势上升 → 提高冷却强度 *    b. 趋势下降 → 保持当前状态（避免频繁调整） * 2. 温度低于触发点时： *    a. 趋势上升 → 保持当前状态 *    b. 趋势下降 → 降低冷却强度（若已达下限则关闭冷却） */static unsigned long get_target_state(struct thermal_instance *instance,                enum thermal_trend trend, bool throttle){    struct thermal_cooling_device *cdev = instance->cdev;    unsigned long cur_state;      // 当前冷却状态    unsigned long next_target;    // 待计算的目标状态    /* 获取冷却设备当前状态（如风扇转速档位） */    cdev->ops->get_cur_state(cdev, &cur_state);    next_target = instance->target; // 默认保持原状态    dev_dbg(&cdev->device, "cur_state=%ld\n", cur_state);    /* 处理未初始化的实例 */    if (!instance->initialized) {        if (throttle) {            // 温度超标时：当前档位+1，并限制在有效范围内[lower,upper]            next_target = clamp((cur_state + 1), instance->lower, instance->upper);        } else {            // 温度安全时：标记为不激活            next_target = THERMAL_NO_TARGET;        }        return next_target;    }    /* 温度超过触发点时的处理 */    if (throttle) {        // 仅当温度持续上升时才提高冷却强度（避免震荡）        if (trend == THERMAL_TREND_RAISING) {            next_target = clamp((cur_state + 1), instance->lower, instance->upper);        }    }     /* 温度低于触发点时的处理 */    else {        // 仅当温度持续下降时才降低冷却强度        if (trend == THERMAL_TREND_DROPPING) {            if (cur_state <= instance->lower) {                // 已达最低档位则关闭冷却                next_target = THERMAL_NO_TARGET;            } else {                // 否则降低一档                next_target = clamp((cur_state - 1), instance->lower, instance->upper);            }        }    }    return next_target;}

target state的逻辑总结：如果当前温度高于trip温度，趋势上升时选择更高的cooling状态，趋势下降时do nothing；如果当前温度低于trip温度，趋势上升时do nothing，趋势下降时使用更低的cooling状态，如果已经是最低的状态了，那么就deactivate这个thermal instance。

查看全文

http://www.dtcms.com/a/251552.html