linux-next/drivers/thermal/thermal_helpers.c

261 lines
6.4 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* thermal_helpers.c - helper functions to handle thermal devices
*
* Copyright (C) 2016 Eduardo Valentin <edubezval@gmail.com>
*
* Highly based on original thermal_core.c
* Copyright (C) 2008 Intel Corp
* Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com>
* Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/device.h>
#include <linux/err.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include "thermal_core.h"
#include "thermal_trace.h"
int get_tz_trend(struct thermal_zone_device *tz, const struct thermal_trip *trip)
{
enum thermal_trend trend;
if (tz->emul_temperature || !tz->ops.get_trend ||
tz->ops.get_trend(tz, trip, &trend)) {
if (tz->temperature > tz->last_temperature)
trend = THERMAL_TREND_RAISING;
else if (tz->temperature < tz->last_temperature)
trend = THERMAL_TREND_DROPPING;
else
trend = THERMAL_TREND_STABLE;
}
return trend;
}
static struct thermal_instance *get_instance(struct thermal_zone_device *tz,
struct thermal_cooling_device *cdev,
const struct thermal_trip *trip)
{
struct thermal_instance *ti;
list_for_each_entry(ti, &tz->thermal_instances, tz_node) {
if (ti->trip == trip && ti->cdev == cdev)
return ti;
}
return NULL;
}
bool thermal_trip_is_bound_to_cdev(struct thermal_zone_device *tz,
const struct thermal_trip *trip,
struct thermal_cooling_device *cdev)
{
bool ret;
mutex_lock(&tz->lock);
mutex_lock(&cdev->lock);
ret = !!get_instance(tz, cdev, trip);
mutex_unlock(&cdev->lock);
mutex_unlock(&tz->lock);
return ret;
}
EXPORT_SYMBOL_GPL(thermal_trip_is_bound_to_cdev);
struct thermal_instance *
get_thermal_instance(struct thermal_zone_device *tz,
struct thermal_cooling_device *cdev, int trip_index)
{
struct thermal_instance *ti;
mutex_lock(&tz->lock);
mutex_lock(&cdev->lock);
ti = get_instance(tz, cdev, &tz->trips[trip_index].trip);
mutex_unlock(&cdev->lock);
mutex_unlock(&tz->lock);
return ti;
}
EXPORT_SYMBOL(get_thermal_instance);
/**
* __thermal_zone_get_temp() - returns the temperature of a thermal zone
* @tz: a valid pointer to a struct thermal_zone_device
* @temp: a valid pointer to where to store the resulting temperature.
*
* When a valid thermal zone reference is passed, it will fetch its
* temperature and fill @temp.
*
* Both tz and tz->ops must be valid pointers when calling this function,
* and the tz->ops.get_temp callback must be provided.
* The function must be called under tz->lock.
*
* Return: On success returns 0, an error code otherwise
*/
int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
{
const struct thermal_trip_desc *td;
int crit_temp = INT_MAX;
int ret = -EINVAL;
lockdep_assert_held(&tz->lock);
ret = tz->ops.get_temp(tz, temp);
if (IS_ENABLED(CONFIG_THERMAL_EMULATION) && tz->emul_temperature) {
for_each_trip_desc(tz, td) {
const struct thermal_trip *trip = &td->trip;
if (trip->type == THERMAL_TRIP_CRITICAL) {
crit_temp = trip->temperature;
break;
}
}
/*
* Only allow emulating a temperature when the real temperature
* is below the critical temperature so that the emulation code
* cannot hide critical conditions.
*/
if (!ret && *temp < crit_temp)
*temp = tz->emul_temperature;
}
if (ret)
dev_dbg(&tz->device, "Failed to get temperature: %d\n", ret);
return ret;
}
/**
* thermal_zone_get_temp() - returns the temperature of a thermal zone
* @tz: a valid pointer to a struct thermal_zone_device
* @temp: a valid pointer to where to store the resulting temperature.
*
* When a valid thermal zone reference is passed, it will fetch its
* temperature and fill @temp.
*
* Return: On success returns 0, an error code otherwise
*/
int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
{
int ret;
if (IS_ERR_OR_NULL(tz))
return -EINVAL;
mutex_lock(&tz->lock);
thermal/core: Ensure that thermal device is registered in thermal_zone_get_temp Calls to thermal_zone_get_temp() are not protected against thermal zone device removal. As result, it is possible that the thermal zone operations callbacks are no longer valid when thermal_zone_get_temp() is called. This may result in crashes such as BUG: unable to handle page fault for address: ffffffffc04ef420 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 5d60e067 P4D 5d60e067 PUD 5d610067 PMD 110197067 PTE 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 3209 Comm: cat Tainted: G W 5.10.136-19389-g615abc6eb807 #1 02df41ac0b12f3a64f4b34245188d8875bb3bce1 Hardware name: Google Coral/Coral, BIOS Google_Coral.10068.92.0 11/27/2018 RIP: 0010:thermal_zone_get_temp+0x26/0x73 Code: 89 c3 eb d3 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 53 48 85 ff 74 50 48 89 fb 48 81 ff 00 f0 ff ff 77 44 48 8b 83 98 03 00 00 <48> 83 78 10 00 74 36 49 89 f6 4c 8d bb d8 03 00 00 4c 89 ff e8 9f RSP: 0018:ffffb3758138fd38 EFLAGS: 00010287 RAX: ffffffffc04ef410 RBX: ffff98f14d7fb000 RCX: 0000000000000000 RDX: ffff98f17cf90000 RSI: ffffb3758138fd64 RDI: ffff98f14d7fb000 RBP: ffffb3758138fd50 R08: 0000000000001000 R09: ffff98f17cf90000 R10: 0000000000000000 R11: ffffffff8dacad28 R12: 0000000000001000 R13: ffff98f1793a7d80 R14: ffff98f143231708 R15: ffff98f14d7fb018 FS: 00007ec166097800(0000) GS:ffff98f1bbd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffc04ef420 CR3: 000000010ee9a000 CR4: 00000000003506e0 Call Trace: temp_show+0x31/0x68 dev_attr_show+0x1d/0x4f sysfs_kf_seq_show+0x92/0x107 seq_read_iter+0xf5/0x3f2 vfs_read+0x205/0x379 __x64_sys_read+0x7c/0xe2 do_syscall_64+0x43/0x55 entry_SYSCALL_64_after_hwframe+0x61/0xc6 if a thermal device is removed while accesses to its device attributes are ongoing. The problem is exposed by code in iwl_op_mode_mvm_start(), which registers a thermal zone device only to unregister it shortly afterwards if an unrelated failure is encountered while accessing the hardware. Check if the thermal zone device is registered after acquiring the thermal zone device mutex to ensure this does not happen. The code was tested by triggering the failure in iwl_op_mode_mvm_start() on purpose. Without this patch, the kernel crashes reliably. The crash is no longer observed after applying this and the preceding patches. Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2022-11-10 15:24:54 +00:00
if (!tz->ops.get_temp) {
ret = -EINVAL;
goto unlock;
}
ret = __thermal_zone_get_temp(tz, temp);
thermal: core: Allow thermal zones to tell the core to ignore them The iwlwifi wireless driver registers a thermal zone that is only needed when the network interface handled by it is up and it wants that thermal zone to be effectively ignored by the core otherwise. Before commit a8a261774466 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid") that could be achieved by returning an error code from the thermal zone's .get_temp() callback because the core did not really handle errors returned by it almost at all. However, commit a8a261774466 made the core attempt to recover from the situation in which the temperature of a thermal zone cannot be determined due to errors returned by its .get_temp() and is always invalid from the core's perspective. That was done because there are thermal zones in which .get_temp() returns errors to start with due to some difficulties related to the initialization ordering, but then it will start to produce valid temperature values at one point. Unfortunately, the simple approach taken by commit a8a261774466, which is to poll the thermal zone periodically until its .get_temp() callback starts to return valid temperature values, is at odds with the special thermal zone in iwlwifi in which .get_temp() may always return an error because its network interface may always be down. If that happens, every attempt to invoke the thermal zone's .get_temp() callback resulting in an error causes the thermal core to print a dev_warn() message to the kernel log which is super-noisy. To address this problem, make the core handle the case in which .get_temp() returns 0, but the temperature value returned by it is not actually valid, in a special way. Namely, make the core completely ignore the invalid temperature value coming from .get_temp() in that case, which requires folding in update_temperature() into its caller and a few related changes. On the iwlwifi side, modify iwl_mvm_tzone_get_temp() to return 0 and put THERMAL_TEMP_INVALID into the temperature return memory location instead of returning an error when the firmware is not running or it is not of the right type. Also, to clearly separate the handling of invalid temperature values from the thermal zone initialization, introduce a special THERMAL_TEMP_INIT value specifically for the latter purpose. Fixes: a8a261774466 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid") Closes: https://lore.kernel.org/linux-pm/20240715044527.GA1544@sol.localdomain/ Reported-by: Eric Biggers <ebiggers@kernel.org> Reported-by: Stefan Lippers-Hollmann <s.l-h@gmx.de> Link: https://bugzilla.kernel.org/show_bug.cgi?id=201761 Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name> Tested-by: Stefan Lippers-Hollmann <s.l-h@gmx.de> Cc: 6.10+ <stable@vger.kernel.org> # 6.10+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Link: https://patch.msgid.link/4950004.31r3eYUQgx@rjwysocki.net [ rjw: Rebased on top of the current mainline ] Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2024-07-17 19:45:02 +00:00
if (!ret && *temp <= THERMAL_TEMP_INVALID)
ret = -ENODATA;
thermal/core: Ensure that thermal device is registered in thermal_zone_get_temp Calls to thermal_zone_get_temp() are not protected against thermal zone device removal. As result, it is possible that the thermal zone operations callbacks are no longer valid when thermal_zone_get_temp() is called. This may result in crashes such as BUG: unable to handle page fault for address: ffffffffc04ef420 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 5d60e067 P4D 5d60e067 PUD 5d610067 PMD 110197067 PTE 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 3209 Comm: cat Tainted: G W 5.10.136-19389-g615abc6eb807 #1 02df41ac0b12f3a64f4b34245188d8875bb3bce1 Hardware name: Google Coral/Coral, BIOS Google_Coral.10068.92.0 11/27/2018 RIP: 0010:thermal_zone_get_temp+0x26/0x73 Code: 89 c3 eb d3 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 53 48 85 ff 74 50 48 89 fb 48 81 ff 00 f0 ff ff 77 44 48 8b 83 98 03 00 00 <48> 83 78 10 00 74 36 49 89 f6 4c 8d bb d8 03 00 00 4c 89 ff e8 9f RSP: 0018:ffffb3758138fd38 EFLAGS: 00010287 RAX: ffffffffc04ef410 RBX: ffff98f14d7fb000 RCX: 0000000000000000 RDX: ffff98f17cf90000 RSI: ffffb3758138fd64 RDI: ffff98f14d7fb000 RBP: ffffb3758138fd50 R08: 0000000000001000 R09: ffff98f17cf90000 R10: 0000000000000000 R11: ffffffff8dacad28 R12: 0000000000001000 R13: ffff98f1793a7d80 R14: ffff98f143231708 R15: ffff98f14d7fb018 FS: 00007ec166097800(0000) GS:ffff98f1bbd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffc04ef420 CR3: 000000010ee9a000 CR4: 00000000003506e0 Call Trace: temp_show+0x31/0x68 dev_attr_show+0x1d/0x4f sysfs_kf_seq_show+0x92/0x107 seq_read_iter+0xf5/0x3f2 vfs_read+0x205/0x379 __x64_sys_read+0x7c/0xe2 do_syscall_64+0x43/0x55 entry_SYSCALL_64_after_hwframe+0x61/0xc6 if a thermal device is removed while accesses to its device attributes are ongoing. The problem is exposed by code in iwl_op_mode_mvm_start(), which registers a thermal zone device only to unregister it shortly afterwards if an unrelated failure is encountered while accessing the hardware. Check if the thermal zone device is registered after acquiring the thermal zone device mutex to ensure this does not happen. The code was tested by triggering the failure in iwl_op_mode_mvm_start() on purpose. Without this patch, the kernel crashes reliably. The crash is no longer observed after applying this and the preceding patches. Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2022-11-10 15:24:54 +00:00
unlock:
mutex_unlock(&tz->lock);
return ret;
}
EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
thermal/debugfs: Add thermal cooling device debugfs information The thermal framework does not have any debug information except a sysfs stat which is a bit controversial. This one allocates big chunks of memory for every cooling devices with a high number of states and could represent on some systems in production several megabytes of memory for just a portion of it. As the sysfs is limited to a page size, the output is not exploitable with large data array and gets truncated. The patch provides the same information than sysfs except the transitions are dynamically allocated, thus they won't show more events than the ones which actually occurred. There is no longer a size limitation and it opens the field for more debugging information where the debugfs is designed for, not sysfs. The thermal debugfs directory structure tries to stay consistent with the sysfs one but in a very simplified way: thermal/ -- cooling_devices |-- 0 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table |-- 1 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table |-- 2 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table |-- 3 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table `-- 4 |-- clear |-- time_in_state_ms |-- total_trans `-- trans_table The content of the files in the cooling devices directory is the same as the sysfs one except for the trans_table which has the following format: Transition Hits 1->0 246 0->1 246 2->1 632 1->2 632 3->2 98 2->3 98 Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> [ rjw: White space fixups, rebase ] Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2024-01-09 09:41:11 +00:00
static int thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev, int state)
{
thermal/debugfs: Add thermal cooling device debugfs information The thermal framework does not have any debug information except a sysfs stat which is a bit controversial. This one allocates big chunks of memory for every cooling devices with a high number of states and could represent on some systems in production several megabytes of memory for just a portion of it. As the sysfs is limited to a page size, the output is not exploitable with large data array and gets truncated. The patch provides the same information than sysfs except the transitions are dynamically allocated, thus they won't show more events than the ones which actually occurred. There is no longer a size limitation and it opens the field for more debugging information where the debugfs is designed for, not sysfs. The thermal debugfs directory structure tries to stay consistent with the sysfs one but in a very simplified way: thermal/ -- cooling_devices |-- 0 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table |-- 1 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table |-- 2 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table |-- 3 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table `-- 4 |-- clear |-- time_in_state_ms |-- total_trans `-- trans_table The content of the files in the cooling devices directory is the same as the sysfs one except for the trans_table which has the following format: Transition Hits 1->0 246 0->1 246 2->1 632 1->2 632 3->2 98 2->3 98 Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> [ rjw: White space fixups, rebase ] Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2024-01-09 09:41:11 +00:00
int ret;
/*
* No check is needed for the ops->set_cur_state as the
* registering function checked the ops are correctly set
*/
ret = cdev->ops->set_cur_state(cdev, state);
if (ret)
return ret;
thermal_notify_cdev_state_update(cdev, state);
thermal_cooling_device_stats_update(cdev, state);
thermal_debug_cdev_state_update(cdev, state);
return 0;
}
void __thermal_cdev_update(struct thermal_cooling_device *cdev)
{
struct thermal_instance *instance;
unsigned long target = 0;
/* Make sure cdev enters the deepest cooling state */
list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) {
dev_dbg(&cdev->device, "zone%d->target=%lu\n",
instance->tz->id, instance->target);
if (instance->target == THERMAL_NO_TARGET)
continue;
if (instance->target > target)
target = instance->target;
}
thermal: Add cooling device's statistics in sysfs This extends the sysfs interface for thermal cooling devices and exposes some pretty useful statistics. These statistics have proven to be quite useful specially while doing benchmarks related to the task scheduler, where we want to make sure that nothing has disrupted the test, specially the cooling device which may have put constraints on the CPUs. The information exposed here tells us to what extent the CPUs were constrained by the thermal framework. The write-only "reset" file is used to reset the statistics. The read-only "time_in_state_ms" file shows the time (in msec) spent by the device in the respective cooling states, and it prints one line per cooling state. The read-only "total_trans" file shows single positive integer value showing the total number of cooling state transitions the device has gone through since the time the cooling device is registered or the time when statistics were reset last. The read-only "trans_table" file shows a two dimensional matrix, where an entry <i,j> (row i, column j) represents the number of transitions from State_i to State_j. This is how the directory structure looks like for a single cooling device: $ ls -R /sys/class/thermal/cooling_device0/ /sys/class/thermal/cooling_device0/: cur_state max_state power stats subsystem type uevent /sys/class/thermal/cooling_device0/power: autosuspend_delay_ms runtime_active_time runtime_suspended_time control runtime_status /sys/class/thermal/cooling_device0/stats: reset time_in_state_ms total_trans trans_table This is tested on ARM 64-bit Hisilicon hikey620 board running Ubuntu and ARM 64-bit Hisilicon hikey960 board running Android. Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Zhang Rui <rui.zhang@intel.com>
2018-04-02 10:56:25 +00:00
thermal_cdev_set_cur_state(cdev, target);
thermal: Add cooling device's statistics in sysfs This extends the sysfs interface for thermal cooling devices and exposes some pretty useful statistics. These statistics have proven to be quite useful specially while doing benchmarks related to the task scheduler, where we want to make sure that nothing has disrupted the test, specially the cooling device which may have put constraints on the CPUs. The information exposed here tells us to what extent the CPUs were constrained by the thermal framework. The write-only "reset" file is used to reset the statistics. The read-only "time_in_state_ms" file shows the time (in msec) spent by the device in the respective cooling states, and it prints one line per cooling state. The read-only "total_trans" file shows single positive integer value showing the total number of cooling state transitions the device has gone through since the time the cooling device is registered or the time when statistics were reset last. The read-only "trans_table" file shows a two dimensional matrix, where an entry <i,j> (row i, column j) represents the number of transitions from State_i to State_j. This is how the directory structure looks like for a single cooling device: $ ls -R /sys/class/thermal/cooling_device0/ /sys/class/thermal/cooling_device0/: cur_state max_state power stats subsystem type uevent /sys/class/thermal/cooling_device0/power: autosuspend_delay_ms runtime_active_time runtime_suspended_time control runtime_status /sys/class/thermal/cooling_device0/stats: reset time_in_state_ms total_trans trans_table This is tested on ARM 64-bit Hisilicon hikey620 board running Ubuntu and ARM 64-bit Hisilicon hikey960 board running Android. Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Zhang Rui <rui.zhang@intel.com>
2018-04-02 10:56:25 +00:00
trace_cdev_update(cdev, target);
dev_dbg(&cdev->device, "set to state %lu\n", target);
}
/**
* thermal_cdev_update - update cooling device state if needed
* @cdev: pointer to struct thermal_cooling_device
*
* Update the cooling device state if there is a need.
*/
void thermal_cdev_update(struct thermal_cooling_device *cdev)
{
mutex_lock(&cdev->lock);
if (!cdev->updated) {
__thermal_cdev_update(cdev);
cdev->updated = true;
}
mutex_unlock(&cdev->lock);
}
/**
* thermal_zone_get_slope - return the slope attribute of the thermal zone
* @tz: thermal zone device with the slope attribute
*
* Return: If the thermal zone device has a slope attribute, return it, else
* return 1.
*/
int thermal_zone_get_slope(struct thermal_zone_device *tz)
{
if (tz && tz->tzp)
return tz->tzp->slope;
return 1;
}
EXPORT_SYMBOL_GPL(thermal_zone_get_slope);
/**
* thermal_zone_get_offset - return the offset attribute of the thermal zone
* @tz: thermal zone device with the offset attribute
*
* Return: If the thermal zone device has a offset attribute, return it, else
* return 0.
*/
int thermal_zone_get_offset(struct thermal_zone_device *tz)
{
if (tz && tz->tzp)
return tz->tzp->offset;
return 0;
}
EXPORT_SYMBOL_GPL(thermal_zone_get_offset);