From 6e47c6e737082110386aaad6f7a6984193b29966 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 4 Dec 2024 14:02:04 +0100 Subject: [PATCH 01/70] PM: sleep: Update stale comment in device_resume() There is no function called __device_suspend() any more and it is still mentioned in a comment in device_resume(), so update that comment. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2787627.mvXUDI8C0e@rjwysocki.net --- drivers/base/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 4a67e83300e1..495ca6d7193a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -914,7 +914,7 @@ static void device_resume(struct device *dev, pm_message_t state, bool async) goto Complete; if (dev->power.direct_complete) { - /* Match the pm_runtime_disable() in __device_suspend(). */ + /* Match the pm_runtime_disable() in device_suspend(). */ pm_runtime_enable(dev); goto Complete; } From d6482311efa063da6d01c3c0cd57df81e743a480 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 18 Nov 2024 08:29:14 +0100 Subject: [PATCH 02/70] PM: sleep: autosleep: don't include 'pm_wakeup.h' directly The header clearly states that it does not want to be included directly, only via 'device.h'. 'platform_device.h' works equally well. Remove the direct inclusion. Signed-off-by: Wolfram Sang Link: https://patch.msgid.link/20241118072917.3853-16-wsa+renesas@sang-engineering.com [ rjw: Subject edit ] Signed-off-by: Rafael J. Wysocki --- kernel/power/autosleep.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index b29c8aca7486..865df641b97c 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -9,7 +9,6 @@ #include #include -#include #include "power.h" From cb7595225ab725db4a0cf24981d1d12882fd65e7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 18 Nov 2024 08:29:01 +0100 Subject: [PATCH 03/70] PM: sleep: sysfs: don't include 'pm_wakeup.h' directly The header clearly states that it does not want to be included directly, only via 'device.h'. 'platform_device.h' works equally well. Remove the direct inclusion. Signed-off-by: Wolfram Sang Link: https://patch.msgid.link/20241118072917.3853-3-wsa+renesas@sang-engineering.com [ rjw: Subject edit ] Signed-off-by: Rafael J. Wysocki --- drivers/base/power/sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index f8163b559bf9..f84018125b46 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include "power.h" From ed33fbb5d5536db77bd168e8fefde8dbc077cfd7 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 3 Oct 2024 11:01:30 +0200 Subject: [PATCH 04/70] PM / devfreq: event: Call of_node_put() only once in devfreq_event_get_edev_by_phandle() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An of_node_put(node) call was immediately used after a null pointer check for the local variable “edev” at the end of this function implementation. Thus call such a function only once instead directly before the check. This issue was transformed by using the Coccinelle software. Link: https://lore.kernel.org/lkml/0f103384-376c-41f0-a35c-8ad98327d6cb@web.de/ Signed-off-by: Markus Elfring Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq-event.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/devfreq/devfreq-event.c b/drivers/devfreq/devfreq-event.c index 3ebac2496679..70219099c604 100644 --- a/drivers/devfreq/devfreq-event.c +++ b/drivers/devfreq/devfreq-event.c @@ -244,13 +244,9 @@ struct devfreq_event_dev *devfreq_event_get_edev_by_phandle(struct device *dev, edev = NULL; out: mutex_unlock(&devfreq_event_list_lock); - - if (!edev) { - of_node_put(node); - return ERR_PTR(-ENODEV); - } - of_node_put(node); + if (!edev) + return ERR_PTR(-ENODEV); return edev; } From 694389cd2bdfc6bc646bbb0fd2a5684c5e8d5fbf Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 22 Nov 2024 15:47:57 +0800 Subject: [PATCH 05/70] selftests/cpufreq: gitignore output files and clean them in make clean After `make run_tests`, the git status complains: Untracked files: (use "git add ..." to include in what will be committed) cpufreq/cpufreq_selftest.dmesg_cpufreq.txt cpufreq/cpufreq_selftest.dmesg_full.txt cpufreq/cpufreq_selftest.txt Link: https://lore.kernel.org/all/20241122074757.1583002-1-lizhijian@fujitsu.com/ Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Cc: Shuah Khan Signed-off-by: Li Zhijian Acked-by: Viresh Kumar Signed-off-by: Shuah Khan --- tools/testing/selftests/cpufreq/.gitignore | 2 ++ tools/testing/selftests/cpufreq/Makefile | 1 + 2 files changed, 3 insertions(+) create mode 100644 tools/testing/selftests/cpufreq/.gitignore diff --git a/tools/testing/selftests/cpufreq/.gitignore b/tools/testing/selftests/cpufreq/.gitignore new file mode 100644 index 000000000000..67604e91e068 --- /dev/null +++ b/tools/testing/selftests/cpufreq/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +cpufreq_selftest.* diff --git a/tools/testing/selftests/cpufreq/Makefile b/tools/testing/selftests/cpufreq/Makefile index c86ca8342222..9b2ccb10b0cf 100644 --- a/tools/testing/selftests/cpufreq/Makefile +++ b/tools/testing/selftests/cpufreq/Makefile @@ -3,6 +3,7 @@ all: TEST_PROGS := main.sh TEST_FILES := cpu.sh cpufreq.sh governor.sh module.sh special-tests.sh +EXTRA_CLEAN := cpufreq_selftest.dmesg_cpufreq.txt cpufreq_selftest.dmesg_full.txt cpufreq_selftest.txt include ../lib.mk From 3075476a7af666de3ec10b4f35d8e62db8fd5b6d Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 29 Nov 2024 09:20:05 +0800 Subject: [PATCH 06/70] pm: cpupower: Makefile: Fix cross compilation After commit f79473ed9220 ("pm: cpupower: Makefile: Allow overriding cross-compiling env params") we would fail to cross compile cpupower in buildroot which uses the recipe at [1] where only the CROSS variable is being set. The issue here is the use of the lazy evaluation for all variables: CC, LD, AR, STRIP, RANLIB, rather than just CROSS. [1]: https://git.buildroot.net/buildroot/tree/package/linux-tools/linux-tool-cpupower.mk.in Fixes: f79473ed9220 ("pm: cpupower: Makefile: Allow overriding cross-compiling env params") Reported-by: Florian Fainelli Closes: https://lore.kernel.org/all/2bbabd2c-24ef-493c-a199-594e5dada3da@broadcom.com/ Signed-off-by: Peng Fan Tested-by: Florian Fainelli Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 175004ce44b2..51a95239fe06 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -87,11 +87,19 @@ INSTALL_SCRIPT = ${INSTALL} -m 644 # to something more interesting, like "arm-linux-". If you want # to compile vs uClibc, that can be done here as well. CROSS ?= #/usr/i386-linux-uclibc/usr/bin/i386-uclibc- +ifneq ($(CROSS), ) +CC = $(CROSS)gcc +LD = $(CROSS)gcc +AR = $(CROSS)ar +STRIP = $(CROSS)strip +RANLIB = $(CROSS)ranlib +else CC ?= $(CROSS)gcc LD ?= $(CROSS)gcc AR ?= $(CROSS)ar STRIP ?= $(CROSS)strip RANLIB ?= $(CROSS)ranlib +endif HOSTCC = gcc MKDIR = mkdir From 46fd8c707b552c0a846917192f66e623bb03f976 Mon Sep 17 00:00:00 2001 From: wangfushuai Date: Wed, 4 Dec 2024 15:02:47 +0800 Subject: [PATCH 07/70] cpupower: revise is_valid flag handling for idle_monitor The is_valid flag should reflect the validity state of both the XXX_start and XXX_stop functions. But the use of '=' in XXX_stop overwrites the validity state set by XXX_start. This commit changes '=' to '|=' in XXX_stop to preserve and combine the validity state of XXX_start and XXX_stop. Signed-off-by: wangfushuai Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c | 4 ++-- tools/power/cpupower/utils/idle_monitor/mperf_monitor.c | 2 +- tools/power/cpupower/utils/idle_monitor/nhm_idle.c | 2 +- tools/power/cpupower/utils/idle_monitor/snb_idle.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c b/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c index 55e55b6b42f9..f5a2a326b1b7 100644 --- a/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c +++ b/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c @@ -117,7 +117,7 @@ static int hsw_ext_start(void) for (num = 0; num < HSW_EXT_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - hsw_ext_get_count(num, &val, cpu); + is_valid[cpu] = !hsw_ext_get_count(num, &val, cpu); previous_count[num][cpu] = val; } } @@ -134,7 +134,7 @@ static int hsw_ext_stop(void) for (num = 0; num < HSW_EXT_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - is_valid[cpu] = !hsw_ext_get_count(num, &val, cpu); + is_valid[cpu] |= !hsw_ext_get_count(num, &val, cpu); current_count[num][cpu] = val; } } diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c index ae6af354a81d..0a03573ebcc2 100644 --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c @@ -148,7 +148,7 @@ static int mperf_measure_stats(unsigned int cpu) ret = get_aperf_mperf(cpu, &aval, &mval); aperf_current_count[cpu] = aval; mperf_current_count[cpu] = mval; - is_valid[cpu] = !ret; + is_valid[cpu] |= !ret; return 0; } diff --git a/tools/power/cpupower/utils/idle_monitor/nhm_idle.c b/tools/power/cpupower/utils/idle_monitor/nhm_idle.c index 16eaf006f61f..6b1733782ffa 100644 --- a/tools/power/cpupower/utils/idle_monitor/nhm_idle.c +++ b/tools/power/cpupower/utils/idle_monitor/nhm_idle.c @@ -151,7 +151,7 @@ static int nhm_stop(void) for (num = 0; num < NHM_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - is_valid[cpu] = !nhm_get_count(num, &val, cpu); + is_valid[cpu] |= !nhm_get_count(num, &val, cpu); current_count[num][cpu] = val; } } diff --git a/tools/power/cpupower/utils/idle_monitor/snb_idle.c b/tools/power/cpupower/utils/idle_monitor/snb_idle.c index 811d63ab17a7..5969b88a85b4 100644 --- a/tools/power/cpupower/utils/idle_monitor/snb_idle.c +++ b/tools/power/cpupower/utils/idle_monitor/snb_idle.c @@ -115,7 +115,7 @@ static int snb_start(void) for (num = 0; num < SNB_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - snb_get_count(num, &val, cpu); + is_valid[cpu] = !snb_get_count(num, &val, cpu); previous_count[num][cpu] = val; } } @@ -132,7 +132,7 @@ static int snb_stop(void) for (num = 0; num < SNB_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - is_valid[cpu] = !snb_get_count(num, &val, cpu); + is_valid[cpu] |= !snb_get_count(num, &val, cpu); current_count[num][cpu] = val; } } From e687b81f4c0e1e1d2cb84716523d7791fa34c8ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 2 Dec 2024 20:04:09 +0100 Subject: [PATCH 08/70] ACPI: BGRT: Mark bin_attribute as __ro_after_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The attribute is only modified during __init phase. Protect it against accidental or intentional modifications afterwards. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20241202-sysfs-const-bin_attr-acpi-v1-1-78f3b38d350d@weissschuh.net Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bgrt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c index d1d9c9289087..88837c291a76 100644 --- a/drivers/acpi/bgrt.c +++ b/drivers/acpi/bgrt.c @@ -29,7 +29,7 @@ BGRT_SHOW(type, image_type); BGRT_SHOW(xoffset, image_offset_x); BGRT_SHOW(yoffset, image_offset_y); -static BIN_ATTR_SIMPLE_RO(image); +static __ro_after_init BIN_ATTR_SIMPLE_RO(image); static struct attribute *bgrt_attributes[] = { &bgrt_attr_version.attr, From d16d7e91ed31f08903c698db42bbabe3dd454e07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 2 Dec 2024 20:04:10 +0100 Subject: [PATCH 09/70] ACPI: BGRT: Constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20241202-sysfs-const-bin_attr-acpi-v1-2-78f3b38d350d@weissschuh.net Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bgrt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c index 88837c291a76..35ece8e9f15d 100644 --- a/drivers/acpi/bgrt.c +++ b/drivers/acpi/bgrt.c @@ -40,14 +40,14 @@ static struct attribute *bgrt_attributes[] = { NULL, }; -static struct bin_attribute *bgrt_bin_attributes[] = { +static const struct bin_attribute *const bgrt_bin_attributes[] = { &bin_attr_image, NULL, }; static const struct attribute_group bgrt_attribute_group = { .attrs = bgrt_attributes, - .bin_attrs = bgrt_bin_attributes, + .bin_attrs_new = bgrt_bin_attributes, }; int __init acpi_parse_bgrt(struct acpi_table_header *table) From 7349678b84552d15866c1a118535663e9dbb6768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 2 Dec 2024 20:04:11 +0100 Subject: [PATCH 10/70] ACPI: sysfs: Constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20241202-sysfs-const-bin_attr-acpi-v1-3-78f3b38d350d@weissschuh.net Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sysfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 687524b50085..a48ebbf768f9 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -319,7 +319,7 @@ struct acpi_data_attr { }; static ssize_t acpi_table_show(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, + const struct bin_attribute *bin_attr, char *buf, loff_t offset, size_t count) { struct acpi_table_attr *table_attr = @@ -372,7 +372,7 @@ static int acpi_table_attr_init(struct kobject *tables_obj, } table_attr->attr.size = table_header->length; - table_attr->attr.read = acpi_table_show; + table_attr->attr.read_new = acpi_table_show; table_attr->attr.attr.name = table_attr->filename; table_attr->attr.attr.mode = 0400; @@ -412,7 +412,7 @@ acpi_status acpi_sysfs_table_handler(u32 event, void *table, void *context) } static ssize_t acpi_data_show(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, + const struct bin_attribute *bin_attr, char *buf, loff_t offset, size_t count) { struct acpi_data_attr *data_attr; @@ -495,7 +495,7 @@ static int acpi_table_data_init(struct acpi_table_header *th) if (!data_attr) return -ENOMEM; sysfs_attr_init(&data_attr->attr.attr); - data_attr->attr.read = acpi_data_show; + data_attr->attr.read_new = acpi_data_show; data_attr->attr.attr.mode = 0400; return acpi_data_objs[i].fn(th, data_attr); } From bede543d2f8a045030ea6ed49307719673342613 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Dec 2024 12:24:35 +0100 Subject: [PATCH 11/70] ACPI: OSL: Use usleep_range() in acpi_os_sleep() As stated by Len in [1], the extra delay added by msleep() to the sleep time value passed to it can be significant, roughly between 1.5 ns on systems with HZ = 1000 and as much as 15 ms on systems with HZ = 100, which is hardly acceptable, at least for small sleep time values. msleep(5) on the default HZ = 250 in Ubuntu on a modern PC takes about 12 ms. This results in over 800 ms of spurious system resume delay on systems such as the Dell XPS-13-9300, which use ASL Sleep(5ms) in a tight loop. Address this by using usleep_range() in acpi_os_sleep() instead of msleep(). For short sleep times this is a no brainer, but even for long sleeps usleep_range() should be preferred because timer wheel timers are optimized for cancelation before they expire and this particular timer is not going to be canceled. Add at least 50 us on top of the requested sleep time in case the timer can be subject to coalescing, which is consistent with what's done in user space in this context [2], but for sleeps longer than 5 ms use 1% of the requested sleep time for this purpose. The rationale here is that longer sleeps don't need that much of a timer precision as a rule and making the timer a more likely candidate for coalescing in these cases is generally desirable. It starts at 5 ms so that the delta between the requested sleep time and the effective deadline is a contiuous function of the former. Link: https://lore.kernel.org/linux-pm/c7db7e804c453629c116d508558eaf46477a2d73.1731708405.git.len.brown@intel.com/ [1] Link: https://lore.kernel.org/linux-pm/CAJvTdK=Q1kwWA6Wxn8Zcf0OicDEk6cHYFAvQVizgA47mXu63+g@mail.gmail.com/ [2] Closes: https://bugzilla.kernel.org/show_bug.cgi?id=216263 Reported-by: Len Brown Reviewed-by: Hans de Goede Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/5857066.DvuYhMxLoT@rjwysocki.net --- drivers/acpi/osl.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index fed446aace42..5ff343096ece 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -607,7 +607,27 @@ acpi_status acpi_os_remove_interrupt_handler(u32 gsi, acpi_osd_handler handler) void acpi_os_sleep(u64 ms) { - msleep(ms); + u64 usec = ms * USEC_PER_MSEC, delta_us = 50; + + /* + * Use a hrtimer because the timer wheel timers are optimized for + * cancelation before they expire and this timer is not going to be + * canceled. + * + * Set the delta between the requested sleep time and the effective + * deadline to at least 50 us in case there is an opportunity for timer + * coalescing. + * + * Moreover, longer sleeps can be assumed to need somewhat less timer + * precision, so sacrifice some of it for making the timer a more likely + * candidate for coalescing by setting the delta to 1% of the sleep time + * if it is above 5 ms (this value is chosen so that the delta is a + * continuous function of the sleep time). + */ + if (ms > 5) + delta_us = (USEC_PER_MSEC / 100) * ms; + + usleep_range(usec, usec + delta_us); } void acpi_os_stall(u32 us) From 02915b4813a126ebf2e050fc4951e037d11f18b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 5 Dec 2024 21:46:35 +0100 Subject: [PATCH 12/70] ACPI: battery: Rename extensions to hook in messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This functionality is called "hook" everywhere in the code. For consistency call it the same in the log messages. The power supply subsystem is about to get its own extension functionality. While the two are closely related and will be used together, the current wording leaves room for misinterpretation. Signed-off-by: Thomas Weißschuh Reviewed-by: Sebastian Reichel Link: https://patch.msgid.link/20241205-power-supply-extensions-v5-1-f0f996db4347@weissschuh.net Signed-off-by: Rafael J. Wysocki --- drivers/acpi/battery.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 3d5342f8d7b3..6760330a8af5 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -717,7 +717,7 @@ static void battery_hook_unregister_unlocked(struct acpi_battery_hook *hook) } list_del_init(&hook->list); - pr_info("extension unregistered: %s\n", hook->name); + pr_info("hook unregistered: %s\n", hook->name); } void battery_hook_unregister(struct acpi_battery_hook *hook) @@ -751,18 +751,18 @@ void battery_hook_register(struct acpi_battery_hook *hook) if (hook->add_battery(battery->bat, hook)) { /* * If a add-battery returns non-zero, - * the registration of the extension has failed, + * the registration of the hook has failed, * and we will not add it to the list of loaded * hooks. */ - pr_err("extension failed to load: %s", hook->name); + pr_err("hook failed to load: %s", hook->name); battery_hook_unregister_unlocked(hook); goto end; } power_supply_changed(battery->bat); } - pr_info("new extension: %s\n", hook->name); + pr_info("new hook: %s\n", hook->name); end: mutex_unlock(&hook_mutex); } @@ -805,10 +805,10 @@ static void battery_hook_add_battery(struct acpi_battery *battery) list_for_each_entry_safe(hook_node, tmp, &battery_hook_list, list) { if (hook_node->add_battery(battery->bat, hook_node)) { /* - * The notification of the extensions has failed, to - * prevent further errors we will unload the extension. + * The notification of the hook has failed, to + * prevent further errors we will unload the hook. */ - pr_err("error in extension, unloading: %s", + pr_err("error in hook, unloading: %s", hook_node->name); battery_hook_unregister_unlocked(hook_node); } From 85a810f91d9bb9c4440cbe745643cec87642ae5c Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:58:00 +0800 Subject: [PATCH 13/70] powercap: intel_rapl: Add support for Panther Lake platform Add support for PantherLake platform to the RAPL common driver. Signed-off-by: Zhang Rui Link: https://patch.msgid.link/20241203075802.584741-2-rui.zhang@intel.com [ rjw: Subject edit ] Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 5e793b80fd6b..77d75e1f14a9 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1265,6 +1265,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), From 3cc83aeea0cd061e424f634da62dfcabe7fdd5c5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:58:01 +0800 Subject: [PATCH 14/70] thermal: intel: int340x: processor: Enable MMIO RAPL for Panther Lake Enable MMIO RAPL support for PantherLake platform. Signed-off-by: Zhang Rui Link: https://patch.msgid.link/20241203075802.584741-3-rui.zhang@intel.com [ rjw: Subject edit ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/int340x_thermal/processor_thermal_device.h | 1 + .../thermal/intel/int340x_thermal/processor_thermal_device_pci.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h index d5eca6db2c00..ba2d89d3024c 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h @@ -30,6 +30,7 @@ #define PCI_DEVICE_ID_INTEL_RPL_THERMAL 0xA71D #define PCI_DEVICE_ID_INTEL_SKL_THERMAL 0x1903 #define PCI_DEVICE_ID_INTEL_TGL_THERMAL 0x9A03 +#define PCI_DEVICE_ID_INTEL_PTL_THERMAL 0xB01D struct power_config { u32 index; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index 145d471546d5..707bd1b23823 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -495,6 +495,7 @@ static const struct pci_device_id proc_thermal_pci_ids[] = { PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_DLVR | PROC_THERMAL_FEATURE_WT_HINT) }, { PCI_DEVICE_DATA(INTEL, RPL_THERMAL, PROC_THERMAL_FEATURE_RAPL | PROC_THERMAL_FEATURE_FIVR | PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_WT_REQ) }, + { PCI_DEVICE_DATA(INTEL, PTL_THERMAL, PROC_THERMAL_FEATURE_RAPL) }, { }, }; From 3fd3697ebfb49ee218416340b084208377ea081d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:58:02 +0800 Subject: [PATCH 15/70] ACPI: DPTF: Support Panther Lake Add Panther Lake ACPI IDs for DPTF. Signed-off-by: Zhang Rui Link: https://patch.msgid.link/20241203075802.584741-4-rui.zhang@intel.com [ rjw: Changelog edit ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/dptf/dptf_pch_fivr.c | 1 + drivers/acpi/dptf/dptf_power.c | 2 ++ drivers/acpi/dptf/int340x_thermal.c | 6 ++++++ drivers/acpi/fan.h | 1 + drivers/thermal/intel/int340x_thermal/int3400_thermal.c | 1 + drivers/thermal/intel/int340x_thermal/int3403_thermal.c | 1 + 6 files changed, 12 insertions(+) diff --git a/drivers/acpi/dptf/dptf_pch_fivr.c b/drivers/acpi/dptf/dptf_pch_fivr.c index 624fce67ce43..952216c67d58 100644 --- a/drivers/acpi/dptf/dptf_pch_fivr.c +++ b/drivers/acpi/dptf/dptf_pch_fivr.c @@ -152,6 +152,7 @@ static const struct acpi_device_id pch_fivr_device_ids[] = { {"INTC1064", 0}, {"INTC106B", 0}, {"INTC10A3", 0}, + {"INTC10D7", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, pch_fivr_device_ids); diff --git a/drivers/acpi/dptf/dptf_power.c b/drivers/acpi/dptf/dptf_power.c index 3d3edd81b172..e8caf4106ff9 100644 --- a/drivers/acpi/dptf/dptf_power.c +++ b/drivers/acpi/dptf/dptf_power.c @@ -236,6 +236,8 @@ static const struct acpi_device_id int3407_device_ids[] = { {"INTC106D", 0}, {"INTC10A4", 0}, {"INTC10A5", 0}, + {"INTC10D8", 0}, + {"INTC10D9", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, int3407_device_ids); diff --git a/drivers/acpi/dptf/int340x_thermal.c b/drivers/acpi/dptf/int340x_thermal.c index 014ada759954..aef7aca2161d 100644 --- a/drivers/acpi/dptf/int340x_thermal.c +++ b/drivers/acpi/dptf/int340x_thermal.c @@ -55,6 +55,12 @@ static const struct acpi_device_id int340x_thermal_device_ids[] = { {"INTC10A3"}, {"INTC10A4"}, {"INTC10A5"}, + {"INTC10D4"}, + {"INTC10D5"}, + {"INTC10D6"}, + {"INTC10D7"}, + {"INTC10D8"}, + {"INTC10D9"}, {""}, }; diff --git a/drivers/acpi/fan.h b/drivers/acpi/fan.h index db25a3898af7..488b51e2cb31 100644 --- a/drivers/acpi/fan.h +++ b/drivers/acpi/fan.h @@ -19,6 +19,7 @@ {"INTC1063", }, /* Fan for Meteor Lake generation */ \ {"INTC106A", }, /* Fan for Lunar Lake generation */ \ {"INTC10A2", }, /* Fan for Raptor Lake generation */ \ + {"INTC10D6", }, /* Fan for Panther Lake generation */ \ {"PNP0C0B", } /* Generic ACPI fan */ #define ACPI_FPS_NAME_LEN 20 diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index 8660ef2175be..5805e08d71be 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -690,6 +690,7 @@ static const struct acpi_device_id int3400_thermal_match[] = { {"INTC1042", 0}, {"INTC1068", 0}, {"INTC10A0", 0}, + {"INTC10D4", 0}, {} }; diff --git a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c index 04aa0afb3b1d..5a925a8df7b3 100644 --- a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c @@ -275,6 +275,7 @@ static const struct acpi_device_id int3403_device_ids[] = { {"INTC1062", 0}, {"INTC1069", 0}, {"INTC10A1", 0}, + {"INTC10D5", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, int3403_device_ids); From eeed4bfbe9b96214162a09a7fbb7570fa9522ca4 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 3 Dec 2024 15:03:06 +0200 Subject: [PATCH 16/70] intel_idle: add Clearwater Forest SoC support Clearwater Forest (CWF) SoC has the same C-states as Sierra Forest (SRF) SoC. Add CWF support by re-using the SRF C-states table. Note: it is expected that CWF C-states will have same or very similar characteristics as SRF C-states (latency and target residency). However, there is a possibility that the characteristics will end up being different enough when the CWF platform development is finished. In that case, a separate CWF C-states table will be created and populated with the CWF-specific characteristics (latency and target residency). Signed-off-by: Artem Bityutskiy Link: https://patch.msgid.link/20241203130306.1559024-1-artem.bityutskiy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index ac4d8faa3886..23d0cd27a581 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1651,6 +1651,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &idle_cpu_snr), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &idle_cpu_grr), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &idle_cpu_srf), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &idle_cpu_srf), {} }; From 9b18d536b124357fee56d82b1462c02f78d219e5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Dec 2024 12:39:05 +0100 Subject: [PATCH 17/70] cpufreq: intel_pstate: Use CPPC to get scaling factors The perf-to-frequency scaling factors are used by intel_pstate on hybrid platforms to cast performance levels to frequency on different types of CPUs which is needed because the generic cpufreq sysfs interface works in the frequency domain. For some hybrid platforms already in the field, the scaling factors are known, but for others (including some upcoming ones) they most likely will be different and the only way to get them that scales is to use information provided by the platform firmware. In this particular case, the requisite information can be obtained via CPPC. If the P-core hybrid scaling factor for the given processor model is not known, use CPPC to compute hybrid scaling factors for all CPUs. Since the current default hybrid scaling factor is only suitable for a few early hybrid platforms, add intel_hybrid_scaling_factor[] entries for them and initialize the scaling factor to zero ("unknown") by default. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/8476313.T7Z3S40VBb@rjwysocki.net --- drivers/cpufreq/intel_pstate.c | 57 ++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index b8e2396a708a..e16b27c35cfb 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -302,11 +303,11 @@ static bool hwp_is_hybrid; static struct cpufreq_driver *intel_pstate_driver __read_mostly; -#define HYBRID_SCALING_FACTOR 78741 +#define HYBRID_SCALING_FACTOR_ADL 78741 #define HYBRID_SCALING_FACTOR_MTL 80000 #define HYBRID_SCALING_FACTOR_LNL 86957 -static int hybrid_scaling_factor = HYBRID_SCALING_FACTOR; +static int hybrid_scaling_factor; static inline int core_get_scaling(void) { @@ -414,18 +415,15 @@ static int intel_pstate_get_cppc_guaranteed(int cpu) static int intel_pstate_cppc_get_scaling(int cpu) { struct cppc_perf_caps cppc_perf; - int ret; - - ret = cppc_get_perf_caps(cpu, &cppc_perf); /* - * If the nominal frequency and the nominal performance are not - * zero and the ratio between them is not 100, return the hybrid - * scaling factor. + * Compute the perf-to-frequency scaling factor for the given CPU if + * possible, unless it would be 0. */ - if (!ret && cppc_perf.nominal_perf && cppc_perf.nominal_freq && - cppc_perf.nominal_perf * 100 != cppc_perf.nominal_freq) - return hybrid_scaling_factor; + if (!cppc_get_perf_caps(cpu, &cppc_perf) && + cppc_perf.nominal_perf && cppc_perf.nominal_freq) + return div_u64(cppc_perf.nominal_freq * KHZ_PER_MHZ, + cppc_perf.nominal_perf); return core_get_scaling(); } @@ -2211,24 +2209,30 @@ static void hybrid_get_type(void *data) static int hwp_get_cpu_scaling(int cpu) { - u8 cpu_type = 0; + if (hybrid_scaling_factor) { + u8 cpu_type = 0; - smp_call_function_single(cpu, hybrid_get_type, &cpu_type, 1); - /* P-cores have a smaller perf level-to-freqency scaling factor. */ - if (cpu_type == 0x40) - return hybrid_scaling_factor; + smp_call_function_single(cpu, hybrid_get_type, &cpu_type, 1); - /* Use default core scaling for E-cores */ - if (cpu_type == 0x20) + /* + * Return the hybrid scaling factor for P-cores and use the + * default core scaling for E-cores. + */ + if (cpu_type == 0x40) + return hybrid_scaling_factor; + + if (cpu_type == 0x20) + return core_get_scaling(); + } + + /* Use core scaling on non-hybrid systems. */ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) return core_get_scaling(); /* - * If reached here, this system is either non-hybrid (like Tiger - * Lake) or hybrid-capable (like Alder Lake or Raptor Lake) with - * no E cores (in which case CPUID for hybrid support is 0). - * - * The CPPC nominal_frequency field is 0 for non-hybrid systems, - * so the default core scaling will be used for them. + * The system is hybrid, but the hybrid scaling factor is not known or + * the CPU type is not one of the above, so use CPPC to compute the + * scaling factor for this CPU. */ return intel_pstate_cppc_get_scaling(cpu); } @@ -3665,6 +3669,11 @@ static const struct x86_cpu_id intel_epp_default[] = { }; static const struct x86_cpu_id intel_hybrid_scaling_factor[] = { + X86_MATCH_VFM(INTEL_ALDERLAKE, HYBRID_SCALING_FACTOR_ADL), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, HYBRID_SCALING_FACTOR_ADL), + X86_MATCH_VFM(INTEL_RAPTORLAKE, HYBRID_SCALING_FACTOR_ADL), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, HYBRID_SCALING_FACTOR_ADL), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, HYBRID_SCALING_FACTOR_ADL), X86_MATCH_VFM(INTEL_METEORLAKE_L, HYBRID_SCALING_FACTOR_MTL), X86_MATCH_VFM(INTEL_ARROWLAKE, HYBRID_SCALING_FACTOR_MTL), X86_MATCH_VFM(INTEL_LUNARLAKE_M, HYBRID_SCALING_FACTOR_LNL), From 20e20f83dd88a25c1e4ab0a3838e9a4ce583c30d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Dec 2024 12:40:19 +0100 Subject: [PATCH 18/70] cpufreq: intel_pstate: Drop Arrow Lake from "scaling factor" list Since HYBRID_SCALING_FACTOR_MTL is not going to be suitable for Arrow Lake in general, drop it from the "known hybrid scaling factors" list of platforms, so the scaling factor for it will be determined with the help of information provided by the platform firmware via CPPC. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2307515.iZASKD2KPV@rjwysocki.net --- drivers/cpufreq/intel_pstate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index e16b27c35cfb..9e14374498d6 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3675,7 +3675,6 @@ static const struct x86_cpu_id intel_hybrid_scaling_factor[] = { X86_MATCH_VFM(INTEL_RAPTORLAKE_P, HYBRID_SCALING_FACTOR_ADL), X86_MATCH_VFM(INTEL_RAPTORLAKE_S, HYBRID_SCALING_FACTOR_ADL), X86_MATCH_VFM(INTEL_METEORLAKE_L, HYBRID_SCALING_FACTOR_MTL), - X86_MATCH_VFM(INTEL_ARROWLAKE, HYBRID_SCALING_FACTOR_MTL), X86_MATCH_VFM(INTEL_LUNARLAKE_M, HYBRID_SCALING_FACTOR_LNL), {} }; From 16c977f8177f9c2ecb88319c944722107c952731 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 4 Dec 2024 14:48:38 +0000 Subject: [PATCH 19/70] cpufreq/amd-pstate: Convert the amd_pstate_get/set_epp() to static calls MSR and shared memory based systems have different mechanisms to get and set the epp value. Split those mechanisms into different functions and assign them appropriately to the static calls at boot time. This eliminates the need for the "if(cpu_feature_enabled(X86_FEATURE_CPPC))" checks at runtime. Also, propagate the error code from rdmsrl_on_cpu() and cppc_get_epp_perf() to *_get_epp()'s caller, instead of returning -EIO unconditionally. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241204144842.164178-2-Dhananjay.Ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 94 +++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 33 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 66e5dfc711c0..bc42d96984f4 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -180,26 +180,40 @@ static inline int get_mode_idx_from_str(const char *str, size_t size) static DEFINE_MUTEX(amd_pstate_limits_lock); static DEFINE_MUTEX(amd_pstate_driver_lock); -static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) +static s16 msr_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) { u64 epp; int ret; - if (cpu_feature_enabled(X86_FEATURE_CPPC)) { - if (!cppc_req_cached) { - epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, - &cppc_req_cached); - if (epp) - return epp; - } - epp = (cppc_req_cached >> 24) & 0xFF; - } else { - ret = cppc_get_epp_perf(cpudata->cpu, &epp); + if (!cppc_req_cached) { + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &cppc_req_cached); if (ret < 0) { pr_debug("Could not retrieve energy perf value (%d)\n", ret); - return -EIO; + return ret; } } + epp = (cppc_req_cached >> 24) & 0xFF; + + return (s16)epp; +} + +DEFINE_STATIC_CALL(amd_pstate_get_epp, msr_get_epp); + +static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) +{ + return static_call(amd_pstate_get_epp)(cpudata, cppc_req_cached); +} + +static s16 shmem_get_epp(struct amd_cpudata *cpudata, u64 dummy) +{ + u64 epp; + int ret; + + ret = cppc_get_epp_perf(cpudata->cpu, &epp); + if (ret < 0) { + pr_debug("Could not retrieve energy perf value (%d)\n", ret); + return ret; + } return (s16)(epp & 0xff); } @@ -253,33 +267,45 @@ static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, max_perf, fast_switch); } -static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) +static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) +{ + int ret; + + u64 value = READ_ONCE(cpudata->cppc_req_cached); + + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + WRITE_ONCE(cpudata->cppc_req_cached, value); + + ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + if (!ret) + cpudata->epp_cached = epp; + + return ret; +} + +DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp); + +static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) +{ + return static_call(amd_pstate_set_epp)(cpudata, epp); +} + +static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp) { int ret; struct cppc_perf_ctrls perf_ctrls; - if (cpu_feature_enabled(X86_FEATURE_CPPC)) { - u64 value = READ_ONCE(cpudata->cppc_req_cached); + amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, + cpudata->max_limit_perf, false); - value &= ~GENMASK_ULL(31, 24); - value |= (u64)epp << 24; - WRITE_ONCE(cpudata->cppc_req_cached, value); - - ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); - if (!ret) - cpudata->epp_cached = epp; - } else { - amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, - cpudata->max_limit_perf, false); - - perf_ctrls.energy_perf = epp; - ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); - if (ret) { - pr_debug("failed to set energy perf value (%d)\n", ret); - return ret; - } - cpudata->epp_cached = epp; + perf_ctrls.energy_perf = epp; + ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + if (ret) { + pr_debug("failed to set energy perf value (%d)\n", ret); + return ret; } + cpudata->epp_cached = epp; return ret; } @@ -1869,6 +1895,8 @@ static int __init amd_pstate_init(void) static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable); static_call_update(amd_pstate_init_perf, shmem_init_perf); static_call_update(amd_pstate_update_perf, shmem_update_perf); + static_call_update(amd_pstate_get_epp, shmem_get_epp); + static_call_update(amd_pstate_set_epp, shmem_set_epp); } if (amd_pstate_prefcore) { From 57a2b25e45cd40eaa2e505452384fa1b7248895a Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 4 Dec 2024 14:48:39 +0000 Subject: [PATCH 20/70] cpufreq/amd-pstate: Move the invocation of amd_pstate_update_perf() amd_pstate_update_perf() should not be a part of shmem_set_epp() function, so move it to the amd_pstate_epp_update_limit() function, where it is needed. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241204144842.164178-3-Dhananjay.Ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index bc42d96984f4..bd3e0f113a88 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -296,9 +296,6 @@ static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp) int ret; struct cppc_perf_ctrls perf_ctrls; - amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, - cpudata->max_limit_perf, false); - perf_ctrls.energy_perf = epp; ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); if (ret) { @@ -1600,6 +1597,10 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) epp = 0; WRITE_ONCE(cpudata->cppc_req_cached, value); + + amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, + cpudata->max_limit_perf, false); + return amd_pstate_set_epp(cpudata, epp); } From b1089e0c8817fda93d474eaa82ad86386887aefe Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 4 Dec 2024 14:48:40 +0000 Subject: [PATCH 21/70] cpufreq/amd-pstate: Refactor amd_pstate_epp_reenable() and amd_pstate_epp_offline() Replace similar code chunks with amd_pstate_update_perf() and amd_pstate_set_epp() function calls. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241204144842.164178-4-Dhananjay.Ugwekar@amd.com [ML: Fix LKP reported error about unused variable] Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index bd3e0f113a88..23c5840dc406 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1632,25 +1632,17 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) { - struct cppc_perf_ctrls perf_ctrls; - u64 value, max_perf; + u64 max_perf; int ret; ret = amd_pstate_cppc_enable(true); if (ret) pr_err("failed to enable amd pstate during resume, return %d\n", ret); - value = READ_ONCE(cpudata->cppc_req_cached); max_perf = READ_ONCE(cpudata->highest_perf); - if (cpu_feature_enabled(X86_FEATURE_CPPC)) { - wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); - } else { - perf_ctrls.max_perf = max_perf; - cppc_set_perf(cpudata->cpu, &perf_ctrls); - perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); - cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); - } + amd_pstate_update_perf(cpudata, 0, 0, max_perf, false); + amd_pstate_set_epp(cpudata, cpudata->epp_cached); } static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) @@ -1670,31 +1662,15 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) static void amd_pstate_epp_offline(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - struct cppc_perf_ctrls perf_ctrls; int min_perf; - u64 value; min_perf = READ_ONCE(cpudata->lowest_perf); - value = READ_ONCE(cpudata->cppc_req_cached); mutex_lock(&amd_pstate_limits_lock); - if (cpu_feature_enabled(X86_FEATURE_CPPC)) { - cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; - /* Set max perf same as min perf */ - value &= ~AMD_CPPC_MAX_PERF(~0L); - value |= AMD_CPPC_MAX_PERF(min_perf); - value &= ~AMD_CPPC_MIN_PERF(~0L); - value |= AMD_CPPC_MIN_PERF(min_perf); - wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); - } else { - perf_ctrls.desired_perf = 0; - perf_ctrls.min_perf = min_perf; - perf_ctrls.max_perf = min_perf; - cppc_set_perf(cpudata->cpu, &perf_ctrls); - perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); - cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); - } + amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false); + amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); + mutex_unlock(&amd_pstate_limits_lock); } From b78f8c87ec3e7499bb049986838636d3afbc7ece Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 4 Dec 2024 14:48:41 +0000 Subject: [PATCH 22/70] cpufreq/amd-pstate: Remove the cppc_state check in offline/online functions Only amd_pstate_epp driver (i.e. cppc_state = ACTIVE) enters the amd_pstate_epp_offline() and amd_pstate_epp_cpu_online() functions, so remove the unnecessary if condition checking if cppc_state is equal to AMD_PSTATE_ACTIVE. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241204144842.164178-5-Dhananjay.Ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 23c5840dc406..9b8d7f299fca 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1651,10 +1651,8 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); - if (cppc_state == AMD_PSTATE_ACTIVE) { - amd_pstate_epp_reenable(cpudata); - cpudata->suspended = false; - } + amd_pstate_epp_reenable(cpudata); + cpudata->suspended = false; return 0; } @@ -1683,8 +1681,7 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) if (cpudata->suspended) return 0; - if (cppc_state == AMD_PSTATE_ACTIVE) - amd_pstate_epp_offline(policy); + amd_pstate_epp_offline(policy); return 0; } From 53ec2101dfede8fecdd240662281a12e537c3411 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 4 Dec 2024 14:48:42 +0000 Subject: [PATCH 23/70] cpufreq/amd-pstate: Merge amd_pstate_epp_cpu_offline() and amd_pstate_epp_offline() amd_pstate_epp_offline() is only called from within amd_pstate_epp_cpu_offline() and doesn't make much sense to have it at all. Hence, remove it. Also remove the unncessary debug print in the offline path while at it. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Gautham R. Shenoy Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20241204144842.164178-6-Dhananjay.Ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 9b8d7f299fca..8ce754ead328 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1657,11 +1657,14 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) return 0; } -static void amd_pstate_epp_offline(struct cpufreq_policy *policy) +static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; int min_perf; + if (cpudata->suspended) + return 0; + min_perf = READ_ONCE(cpudata->lowest_perf); mutex_lock(&amd_pstate_limits_lock); @@ -1670,18 +1673,6 @@ static void amd_pstate_epp_offline(struct cpufreq_policy *policy) amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); mutex_unlock(&amd_pstate_limits_lock); -} - -static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) -{ - struct amd_cpudata *cpudata = policy->driver_data; - - pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu); - - if (cpudata->suspended) - return 0; - - amd_pstate_epp_offline(policy); return 0; } From 4dcd130151a654108a414b298df9e21a0d3575c9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:36 -0600 Subject: [PATCH 24/70] cpufreq/amd-pstate: Add trace event for EPP perf updates In "active" mode the most important thing for debugging whether an issue is hardware or software based is to look at what was the last thing written to the CPPC request MSR or shared memory region. The 'amd_pstate_epp_perf' trace event shows the values being written for all CPUs. Reviewed-by: Perry Yuan Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-4-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-trace.h | 45 ++++++++++++++++++++++++++++++ drivers/cpufreq/amd-pstate.c | 28 +++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h index 35f38ae67fb1..e2221a4b6901 100644 --- a/drivers/cpufreq/amd-pstate-trace.h +++ b/drivers/cpufreq/amd-pstate-trace.h @@ -88,6 +88,51 @@ TRACE_EVENT(amd_pstate_perf, ) ); +TRACE_EVENT(amd_pstate_epp_perf, + + TP_PROTO(unsigned int cpu_id, + unsigned int highest_perf, + unsigned int epp, + unsigned int min_perf, + unsigned int max_perf, + bool boost + ), + + TP_ARGS(cpu_id, + highest_perf, + epp, + min_perf, + max_perf, + boost), + + TP_STRUCT__entry( + __field(unsigned int, cpu_id) + __field(unsigned int, highest_perf) + __field(unsigned int, epp) + __field(unsigned int, min_perf) + __field(unsigned int, max_perf) + __field(bool, boost) + ), + + TP_fast_assign( + __entry->cpu_id = cpu_id; + __entry->highest_perf = highest_perf; + __entry->epp = epp; + __entry->min_perf = min_perf; + __entry->max_perf = max_perf; + __entry->boost = boost; + ), + + TP_printk("cpu%u: [%u<->%u]/%u, epp=%u, boost=%u", + (unsigned int)__entry->cpu_id, + (unsigned int)__entry->min_perf, + (unsigned int)__entry->max_perf, + (unsigned int)__entry->highest_perf, + (unsigned int)__entry->epp, + (bool)__entry->boost + ) +); + #endif /* _AMD_PSTATE_TRACE_H */ /* This part must be outside protection */ diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 8ce754ead328..20d52bce1882 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -324,6 +324,14 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, return -EBUSY; } + if (trace_amd_pstate_epp_perf_enabled()) { + trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, + epp, + AMD_CPPC_MIN_PERF(cpudata->cppc_req_cached), + AMD_CPPC_MAX_PERF(cpudata->cppc_req_cached), + cpudata->boost_state); + } + ret = amd_pstate_set_epp(cpudata, epp); return ret; @@ -1598,6 +1606,13 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) WRITE_ONCE(cpudata->cppc_req_cached, value); + if (trace_amd_pstate_epp_perf_enabled()) { + trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp, + cpudata->min_limit_perf, + cpudata->max_limit_perf, + policy->boost_enabled); + } + amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, cpudata->max_limit_perf, false); @@ -1641,6 +1656,13 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) max_perf = READ_ONCE(cpudata->highest_perf); + if (trace_amd_pstate_epp_perf_enabled()) { + trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, + cpudata->epp_cached, + AMD_CPPC_MIN_PERF(cpudata->cppc_req_cached), + max_perf, cpudata->boost_state); + } + amd_pstate_update_perf(cpudata, 0, 0, max_perf, false); amd_pstate_set_epp(cpudata, cpudata->epp_cached); } @@ -1669,6 +1691,12 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) mutex_lock(&amd_pstate_limits_lock); + if (trace_amd_pstate_epp_perf_enabled()) { + trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, + AMD_CPPC_EPP_BALANCE_POWERSAVE, + min_perf, min_perf, policy->boost_enabled); + } + amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false); amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); From 6c093d5a5b73ec1caf1e706510ae6031af2f9d43 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:37 -0600 Subject: [PATCH 25/70] cpufreq/amd-pstate: convert mutex use to guard() Using scoped guard declaration will unlock mutexes automatically. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-5-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 20d52bce1882..bcb9367aa9ca 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -758,12 +758,12 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) pr_err("Boost mode is not supported by this processor or SBIOS\n"); return -EOPNOTSUPP; } - mutex_lock(&amd_pstate_driver_lock); + guard(mutex)(&amd_pstate_driver_lock); + ret = amd_pstate_cpu_boost_update(policy, state); WRITE_ONCE(cpudata->boost_state, !ret ? state : false); policy->boost_enabled = !ret ? state : false; refresh_frequency_limits(policy); - mutex_unlock(&amd_pstate_driver_lock); return ret; } @@ -854,7 +854,8 @@ static void amd_pstate_update_limits(unsigned int cpu) if (!amd_pstate_prefcore) return; - mutex_lock(&amd_pstate_driver_lock); + guard(mutex)(&amd_pstate_driver_lock); + ret = amd_get_highest_perf(cpu, &cur_high); if (ret) goto free_cpufreq_put; @@ -874,7 +875,6 @@ free_cpufreq_put: if (!highest_perf_changed) cpufreq_update_policy(cpu); - mutex_unlock(&amd_pstate_driver_lock); } /* @@ -1203,11 +1203,11 @@ static ssize_t store_energy_performance_preference( if (ret < 0) return -EINVAL; - mutex_lock(&amd_pstate_limits_lock); - ret = amd_pstate_set_energy_pref_index(cpudata, ret); - mutex_unlock(&amd_pstate_limits_lock); + guard(mutex)(&amd_pstate_limits_lock); - return ret ?: count; + ret = amd_pstate_set_energy_pref_index(cpudata, ret); + + return ret ? ret : count; } static ssize_t show_energy_performance_preference( @@ -1371,13 +1371,10 @@ EXPORT_SYMBOL_GPL(amd_pstate_update_status); static ssize_t status_show(struct device *dev, struct device_attribute *attr, char *buf) { - ssize_t ret; - mutex_lock(&amd_pstate_driver_lock); - ret = amd_pstate_show_status(buf); - mutex_unlock(&amd_pstate_driver_lock); + guard(mutex)(&amd_pstate_driver_lock); - return ret; + return amd_pstate_show_status(buf); } static ssize_t status_store(struct device *a, struct device_attribute *b, @@ -1386,9 +1383,8 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, char *p = memchr(buf, '\n', count); int ret; - mutex_lock(&amd_pstate_driver_lock); + guard(mutex)(&amd_pstate_driver_lock); ret = amd_pstate_update_status(buf, p ? p - buf : count); - mutex_unlock(&amd_pstate_driver_lock); return ret < 0 ? ret : count; } @@ -1689,7 +1685,7 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) min_perf = READ_ONCE(cpudata->lowest_perf); - mutex_lock(&amd_pstate_limits_lock); + guard(mutex)(&amd_pstate_limits_lock); if (trace_amd_pstate_epp_perf_enabled()) { trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, @@ -1700,8 +1696,6 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false); amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); - mutex_unlock(&amd_pstate_limits_lock); - return 0; } @@ -1730,13 +1724,11 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) struct amd_cpudata *cpudata = policy->driver_data; if (cpudata->suspended) { - mutex_lock(&amd_pstate_limits_lock); + guard(mutex)(&amd_pstate_limits_lock); /* enable amd pstate from suspend state*/ amd_pstate_epp_reenable(cpudata); - mutex_unlock(&amd_pstate_limits_lock); - cpudata->suspended = false; } From 3b43739824a6b617d8213dd2bce6fb1b2747c377 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:38 -0600 Subject: [PATCH 26/70] cpufreq/amd-pstate: Drop cached epp_policy variable epp_policy is not used by any of the current code and there is no need to cache it. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-6-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 3 --- drivers/cpufreq/amd-pstate.h | 2 -- 2 files changed, 5 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index bcb9367aa9ca..0e77584dfade 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1478,7 +1478,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) return -ENOMEM; cpudata->cpu = policy->cpu; - cpudata->epp_policy = 0; ret = amd_pstate_init_perf(cpudata); if (ret) @@ -1585,8 +1584,6 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) value &= ~AMD_CPPC_DES_PERF(~0L); value |= AMD_CPPC_DES_PERF(0); - cpudata->epp_policy = cpudata->policy; - /* Get BIOS pre-defined epp value */ epp = amd_pstate_get_epp(cpudata, value); if (epp < 0) { diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index cd573bc6b6db..7765c82f975c 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -57,7 +57,6 @@ struct amd_aperf_mperf { * @hw_prefcore: check whether HW supports preferred core featue. * Only when hw_prefcore and early prefcore param are true, * AMD P-State driver supports preferred core featue. - * @epp_policy: Last saved policy used to set energy-performance preference * @epp_cached: Cached CPPC energy-performance preference value * @policy: Cpufreq policy value * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value @@ -94,7 +93,6 @@ struct amd_cpudata { bool hw_prefcore; /* EPP feature related attributes*/ - s16 epp_policy; s16 epp_cached; u32 policy; u64 cppc_cap1_cached; From 88a95ba066a962d4d39c6a36b18bf665f51d3767 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:39 -0600 Subject: [PATCH 27/70] cpufreq/amd-pstate: Use FIELD_PREP and FIELD_GET macros The FIELD_PREP and FIELD_GET macros improve readability and help to avoid shifting bugs. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-7-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 51 ++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 0e77584dfade..fbd1b36846c5 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -22,6 +22,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -88,6 +89,11 @@ static bool cppc_enabled; static bool amd_pstate_prefcore = true; static struct quirk_entry *quirks; +#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0) +#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8) +#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16) +#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24) + /* * AMD Energy Preference Performance (EPP) * The EPP is used in the CCLK DPM controller to drive @@ -182,7 +188,6 @@ static DEFINE_MUTEX(amd_pstate_driver_lock); static s16 msr_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) { - u64 epp; int ret; if (!cppc_req_cached) { @@ -192,9 +197,8 @@ static s16 msr_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) return ret; } } - epp = (cppc_req_cached >> 24) & 0xFF; - return (s16)epp; + return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cppc_req_cached); } DEFINE_STATIC_CALL(amd_pstate_get_epp, msr_get_epp); @@ -269,12 +273,11 @@ static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) { + u64 value = READ_ONCE(cpudata->cppc_req_cached); int ret; - u64 value = READ_ONCE(cpudata->cppc_req_cached); - - value &= ~GENMASK_ULL(31, 24); - value |= (u64)epp << 24; + value &= ~AMD_CPPC_EPP_PERF_MASK; + value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); WRITE_ONCE(cpudata->cppc_req_cached, value); ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); @@ -327,8 +330,8 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, if (trace_amd_pstate_epp_perf_enabled()) { trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp, - AMD_CPPC_MIN_PERF(cpudata->cppc_req_cached), - AMD_CPPC_MAX_PERF(cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), cpudata->boost_state); } @@ -542,18 +545,15 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, des_perf = 0; } - value &= ~AMD_CPPC_MIN_PERF(~0L); - value |= AMD_CPPC_MIN_PERF(min_perf); - - value &= ~AMD_CPPC_DES_PERF(~0L); - value |= AMD_CPPC_DES_PERF(des_perf); - /* limit the max perf when core performance boost feature is disabled */ if (!cpudata->boost_supported) max_perf = min_t(unsigned long, nominal_perf, max_perf); - value &= ~AMD_CPPC_MAX_PERF(~0L); - value |= AMD_CPPC_MAX_PERF(max_perf); + value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | + AMD_CPPC_DES_PERF_MASK); + value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf); + value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, des_perf); + value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, @@ -1573,16 +1573,11 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) min_perf = min(cpudata->nominal_perf, max_perf); - /* Initial min/max values for CPPC Performance Controls Register */ - value &= ~AMD_CPPC_MIN_PERF(~0L); - value |= AMD_CPPC_MIN_PERF(min_perf); - - value &= ~AMD_CPPC_MAX_PERF(~0L); - value |= AMD_CPPC_MAX_PERF(max_perf); - - /* CPPC EPP feature require to set zero to the desire perf bit */ - value &= ~AMD_CPPC_DES_PERF(~0L); - value |= AMD_CPPC_DES_PERF(0); + value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | + AMD_CPPC_DES_PERF_MASK); + value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf); + value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, 0); + value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); /* Get BIOS pre-defined epp value */ epp = amd_pstate_get_epp(cpudata, value); @@ -1652,7 +1647,7 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) if (trace_amd_pstate_epp_perf_enabled()) { trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, cpudata->epp_cached, - AMD_CPPC_MIN_PERF(cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), max_perf, cpudata->boost_state); } From 474e7218e81e7932ed18f91969b72169005ff038 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:40 -0600 Subject: [PATCH 28/70] cpufreq/amd-pstate: Only update the cached value in msr_set_epp() on success If writing the MSR MSR_AMD_CPPC_REQ fails then the cached value in the amd_cpudata structure should not be updated. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-8-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index fbd1b36846c5..ebfc9e20b6cb 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -278,11 +278,15 @@ static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) value &= ~AMD_CPPC_EPP_PERF_MASK; value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); - WRITE_ONCE(cpudata->cppc_req_cached, value); ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); - if (!ret) - cpudata->epp_cached = epp; + if (ret) { + pr_err("failed to set energy perf value (%d)\n", ret); + return ret; + } + + cpudata->epp_cached = epp; + WRITE_ONCE(cpudata->cppc_req_cached, value); return ret; } From 68cb0e77b6439fea64c6907c563b7bd27f2ee57f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:41 -0600 Subject: [PATCH 29/70] cpufreq/amd-pstate: store all values in cpudata struct in khz Storing values in the cpudata structure in different units leads to confusion and hardcoded conversions elsewhere. After ratios are calculated store everything in khz for any future use. Adjust all relevant consumers for this change as well. Suggested-by: Dhananjay Ugwekar Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-9-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 12 +++++------- drivers/cpufreq/amd-pstate.c | 28 ++++++++++++++-------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index a261d7300951..3a0a380c3590 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -207,7 +207,6 @@ static void amd_pstate_ut_check_freq(u32 index) int cpu = 0; struct cpufreq_policy *policy = NULL; struct amd_cpudata *cpudata = NULL; - u32 nominal_freq_khz; for_each_possible_cpu(cpu) { policy = cpufreq_cpu_get(cpu); @@ -215,14 +214,13 @@ static void amd_pstate_ut_check_freq(u32 index) break; cpudata = policy->driver_data; - nominal_freq_khz = cpudata->nominal_freq*1000; - if (!((cpudata->max_freq >= nominal_freq_khz) && - (nominal_freq_khz > cpudata->lowest_nonlinear_freq) && + if (!((cpudata->max_freq >= cpudata->nominal_freq) && + (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) && (cpudata->lowest_nonlinear_freq > cpudata->min_freq) && (cpudata->min_freq > 0))) { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", - __func__, cpu, cpudata->max_freq, nominal_freq_khz, + __func__, cpu, cpudata->max_freq, cpudata->nominal_freq, cpudata->lowest_nonlinear_freq, cpudata->min_freq); goto skip_test; } @@ -236,13 +234,13 @@ static void amd_pstate_ut_check_freq(u32 index) if (cpudata->boost_supported) { if ((policy->max == cpudata->max_freq) || - (policy->max == nominal_freq_khz)) + (policy->max == cpudata->nominal_freq)) amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; else { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n", __func__, cpu, policy->max, cpudata->max_freq, - nominal_freq_khz); + cpudata->nominal_freq); goto skip_test; } } else { diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ebfc9e20b6cb..bf2512e1f117 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -739,8 +739,8 @@ static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) if (on) policy->cpuinfo.max_freq = max_freq; - else if (policy->cpuinfo.max_freq > nominal_freq * 1000) - policy->cpuinfo.max_freq = nominal_freq * 1000; + else if (policy->cpuinfo.max_freq > nominal_freq) + policy->cpuinfo.max_freq = nominal_freq; policy->max = policy->cpuinfo.max_freq; @@ -940,29 +940,29 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) return ret; if (quirks && quirks->lowest_freq) - min_freq = quirks->lowest_freq * 1000; + min_freq = quirks->lowest_freq; else - min_freq = cppc_perf.lowest_freq * 1000; + min_freq = cppc_perf.lowest_freq; if (quirks && quirks->nominal_freq) - nominal_freq = quirks->nominal_freq ; + nominal_freq = quirks->nominal_freq; else nominal_freq = cppc_perf.nominal_freq; nominal_perf = READ_ONCE(cpudata->nominal_perf); boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); - max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT); lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, nominal_perf); - lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT); - WRITE_ONCE(cpudata->min_freq, min_freq); - WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); - WRITE_ONCE(cpudata->nominal_freq, nominal_freq); - WRITE_ONCE(cpudata->max_freq, max_freq); + WRITE_ONCE(cpudata->min_freq, min_freq * 1000); + WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq * 1000); + WRITE_ONCE(cpudata->nominal_freq, nominal_freq * 1000); + WRITE_ONCE(cpudata->max_freq, max_freq * 1000); /** * Below values need to be initialized correctly, otherwise driver will fail to load @@ -972,13 +972,13 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) */ if (min_freq <= 0 || max_freq <= 0 || nominal_freq <= 0 || min_freq > max_freq) { pr_err("min_freq(%d) or max_freq(%d) or nominal_freq(%d) value is incorrect\n", - min_freq, max_freq, nominal_freq * 1000); + min_freq, max_freq, nominal_freq); return -EINVAL; } - if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > nominal_freq * 1000) { + if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > nominal_freq) { pr_err("lowest_nonlinear_freq(%d) value is out of range [min_freq(%d), nominal_freq(%d)]\n", - lowest_nonlinear_freq, min_freq, nominal_freq * 1000); + lowest_nonlinear_freq, min_freq, nominal_freq); return -EINVAL; } From 942718f2a236cb3b27d2dbb5942538681b6e0e88 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:42 -0600 Subject: [PATCH 30/70] cpufreq/amd-pstate: Change amd_pstate_update_perf() to return an int As msr_update_perf() calls an MSR it's possible that it fails. Pass this return code up to the caller. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-10-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index bf2512e1f117..d279ace500d7 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -251,24 +251,26 @@ static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) return index; } -static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, +static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, u32 des_perf, u32 max_perf, bool fast_switch) { - if (fast_switch) + if (fast_switch) { wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached)); - else - wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, - READ_ONCE(cpudata->cppc_req_cached)); + return 0; + } + + return wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, + READ_ONCE(cpudata->cppc_req_cached)); } DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); -static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, +static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, u32 des_perf, u32 max_perf, bool fast_switch) { - static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, - max_perf, fast_switch); + return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, + max_perf, fast_switch); } static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) @@ -480,7 +482,7 @@ static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) return static_call(amd_pstate_init_perf)(cpudata); } -static void shmem_update_perf(struct amd_cpudata *cpudata, +static int shmem_update_perf(struct amd_cpudata *cpudata, u32 min_perf, u32 des_perf, u32 max_perf, bool fast_switch) { @@ -490,7 +492,7 @@ static void shmem_update_perf(struct amd_cpudata *cpudata, perf_ctrls.min_perf = min_perf; perf_ctrls.desired_perf = des_perf; - cppc_set_perf(cpudata->cpu, &perf_ctrls); + return cppc_set_perf(cpudata->cpu, &perf_ctrls); } static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) From 3f7b835fa4d0d06f82249a3aca989fdf9bdf4656 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:43 -0600 Subject: [PATCH 31/70] cpufreq/amd-pstate: Move limit updating code The limit updating code in amd_pstate_epp_update_limit() should not only apply to EPP updates. Move it to amd_pstate_update_min_max_limit() so other callers can benefit as well. With this move it's not necessary to have clamp_t calls anymore because the verify callback is called when setting limits. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-11-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d279ace500d7..55529e32d325 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -537,10 +537,6 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, u32 nominal_perf = READ_ONCE(cpudata->nominal_perf); u64 value = prev; - min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf, - cpudata->max_limit_perf); - max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, - cpudata->max_limit_perf); des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); max_freq = READ_ONCE(cpudata->max_limit_freq); @@ -607,7 +603,7 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) { - u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq; + u32 max_limit_perf, min_limit_perf, max_perf, max_freq; struct amd_cpudata *cpudata = policy->driver_data; max_perf = READ_ONCE(cpudata->highest_perf); @@ -615,12 +611,8 @@ static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) max_limit_perf = div_u64(policy->max * max_perf, max_freq); min_limit_perf = div_u64(policy->min * max_perf, max_freq); - lowest_perf = READ_ONCE(cpudata->lowest_perf); - if (min_limit_perf < lowest_perf) - min_limit_perf = lowest_perf; - - if (max_limit_perf < min_limit_perf) - max_limit_perf = min_limit_perf; + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + min_limit_perf = min(cpudata->nominal_perf, max_limit_perf); WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); @@ -1562,28 +1554,18 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - u32 max_perf, min_perf; u64 value; s16 epp; - max_perf = READ_ONCE(cpudata->highest_perf); - min_perf = READ_ONCE(cpudata->lowest_perf); amd_pstate_update_min_max_limit(policy); - max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, - cpudata->max_limit_perf); - min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf, - cpudata->max_limit_perf); value = READ_ONCE(cpudata->cppc_req_cached); - if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) - min_perf = min(cpudata->nominal_perf, max_perf); - value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | AMD_CPPC_DES_PERF_MASK); - value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf); + value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, cpudata->max_limit_perf); value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, 0); - value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); + value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, cpudata->min_limit_perf); /* Get BIOS pre-defined epp value */ epp = amd_pstate_get_epp(cpudata, value); From b3781f30bfcfd7db12de2595adb01779e565e1c6 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:44 -0600 Subject: [PATCH 32/70] cpufreq/amd-pstate: Cache EPP value and use that everywhere Cache the value in cpudata->epp_cached, and use that for all callers. As all callers use cached value merge amd_pstate_get_energy_pref_index() into show_energy_performance_preference(). Check if the EPP value is changed before writing it to MSR or shared memory region. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-12-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 105 ++++++++++++++--------------------- 1 file changed, 43 insertions(+), 62 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 55529e32d325..d1f82e4ca9b1 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -186,29 +186,28 @@ static inline int get_mode_idx_from_str(const char *str, size_t size) static DEFINE_MUTEX(amd_pstate_limits_lock); static DEFINE_MUTEX(amd_pstate_driver_lock); -static s16 msr_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) +static s16 msr_get_epp(struct amd_cpudata *cpudata) { + u64 value; int ret; - if (!cppc_req_cached) { - ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &cppc_req_cached); - if (ret < 0) { - pr_debug("Could not retrieve energy perf value (%d)\n", ret); - return ret; - } + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); + if (ret < 0) { + pr_debug("Could not retrieve energy perf value (%d)\n", ret); + return ret; } - return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cppc_req_cached); + return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, value); } DEFINE_STATIC_CALL(amd_pstate_get_epp, msr_get_epp); -static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) +static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata) { - return static_call(amd_pstate_get_epp)(cpudata, cppc_req_cached); + return static_call(amd_pstate_get_epp)(cpudata); } -static s16 shmem_get_epp(struct amd_cpudata *cpudata, u64 dummy) +static s16 shmem_get_epp(struct amd_cpudata *cpudata) { u64 epp; int ret; @@ -222,35 +221,6 @@ static s16 shmem_get_epp(struct amd_cpudata *cpudata, u64 dummy) return (s16)(epp & 0xff); } -static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) -{ - s16 epp; - int index = -EINVAL; - - epp = amd_pstate_get_epp(cpudata, 0); - if (epp < 0) - return epp; - - switch (epp) { - case AMD_CPPC_EPP_PERFORMANCE: - index = EPP_INDEX_PERFORMANCE; - break; - case AMD_CPPC_EPP_BALANCE_PERFORMANCE: - index = EPP_INDEX_BALANCE_PERFORMANCE; - break; - case AMD_CPPC_EPP_BALANCE_POWERSAVE: - index = EPP_INDEX_BALANCE_POWERSAVE; - break; - case AMD_CPPC_EPP_POWERSAVE: - index = EPP_INDEX_POWERSAVE; - break; - default: - break; - } - - return index; -} - static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, u32 des_perf, u32 max_perf, bool fast_switch) { @@ -275,19 +245,23 @@ static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata, static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) { - u64 value = READ_ONCE(cpudata->cppc_req_cached); + u64 value, prev; int ret; + value = prev = READ_ONCE(cpudata->cppc_req_cached); value &= ~AMD_CPPC_EPP_PERF_MASK; value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + if (value == prev) + return 0; + ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); if (ret) { pr_err("failed to set energy perf value (%d)\n", ret); return ret; } - cpudata->epp_cached = epp; + WRITE_ONCE(cpudata->epp_cached, epp); WRITE_ONCE(cpudata->cppc_req_cached, value); return ret; @@ -305,13 +279,16 @@ static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp) int ret; struct cppc_perf_ctrls perf_ctrls; + if (epp == cpudata->epp_cached) + return 0; + perf_ctrls.energy_perf = epp; ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); if (ret) { pr_debug("failed to set energy perf value (%d)\n", ret); return ret; } - cpudata->epp_cached = epp; + WRITE_ONCE(cpudata->epp_cached, epp); return ret; } @@ -1214,9 +1191,22 @@ static ssize_t show_energy_performance_preference( struct amd_cpudata *cpudata = policy->driver_data; int preference; - preference = amd_pstate_get_energy_pref_index(cpudata); - if (preference < 0) - return preference; + switch (cpudata->epp_cached) { + case AMD_CPPC_EPP_PERFORMANCE: + preference = EPP_INDEX_PERFORMANCE; + break; + case AMD_CPPC_EPP_BALANCE_PERFORMANCE: + preference = EPP_INDEX_BALANCE_PERFORMANCE; + break; + case AMD_CPPC_EPP_BALANCE_POWERSAVE: + preference = EPP_INDEX_BALANCE_POWERSAVE; + break; + case AMD_CPPC_EPP_POWERSAVE: + preference = EPP_INDEX_POWERSAVE; + break; + default: + return -EINVAL; + } return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); } @@ -1501,7 +1491,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata; - cpudata->epp_cached = cpudata->epp_default = amd_pstate_get_epp(cpudata, 0); + cpudata->epp_cached = cpudata->epp_default = amd_pstate_get_epp(cpudata); policy->min = policy->cpuinfo.min_freq; policy->max = policy->cpuinfo.max_freq; @@ -1555,35 +1545,26 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; u64 value; - s16 epp; amd_pstate_update_min_max_limit(policy); value = READ_ONCE(cpudata->cppc_req_cached); value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | - AMD_CPPC_DES_PERF_MASK); + AMD_CPPC_DES_PERF_MASK | AMD_CPPC_EPP_PERF_MASK); value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, cpudata->max_limit_perf); value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, 0); value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, cpudata->min_limit_perf); - /* Get BIOS pre-defined epp value */ - epp = amd_pstate_get_epp(cpudata, value); - if (epp < 0) { - /** - * This return value can only be negative for shared_memory - * systems where EPP register read/write not supported. - */ - return epp; - } - if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) - epp = 0; + WRITE_ONCE(cpudata->epp_cached, 0); + value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, cpudata->epp_cached); WRITE_ONCE(cpudata->cppc_req_cached, value); if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp, + trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, + cpudata->epp_cached, cpudata->min_limit_perf, cpudata->max_limit_perf, policy->boost_enabled); @@ -1592,7 +1573,7 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, cpudata->max_limit_perf, false); - return amd_pstate_set_epp(cpudata, epp); + return amd_pstate_set_epp(cpudata, READ_ONCE(cpudata->epp_cached)); } static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) From fff395796917ac3fe3b4c4607cb74a8dbdc17593 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:45 -0600 Subject: [PATCH 33/70] cpufreq/amd-pstate: Always write EPP value when updating perf For MSR systems the EPP value is in the same register as perf targets and so divding them into two separate MSR writes is wasteful. In msr_update_perf(), update both EPP and perf values in one write to MSR_AMD_CPPC_REQ, and cache them if successful. To accomplish this plumb the EPP value into the update_perf call and modify all its callers to check the return value. As this unifies calls, ensure that the MSR write is necessary before flushing a write out. Also drop the comparison from the passive flow tracing. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-13-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-trace.h | 7 +- drivers/cpufreq/amd-pstate.c | 110 +++++++++++++++-------------- 2 files changed, 57 insertions(+), 60 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h index e2221a4b6901..8d692415d905 100644 --- a/drivers/cpufreq/amd-pstate-trace.h +++ b/drivers/cpufreq/amd-pstate-trace.h @@ -32,7 +32,6 @@ TRACE_EVENT(amd_pstate_perf, u64 aperf, u64 tsc, unsigned int cpu_id, - bool changed, bool fast_switch ), @@ -44,7 +43,6 @@ TRACE_EVENT(amd_pstate_perf, aperf, tsc, cpu_id, - changed, fast_switch ), @@ -57,7 +55,6 @@ TRACE_EVENT(amd_pstate_perf, __field(unsigned long long, aperf) __field(unsigned long long, tsc) __field(unsigned int, cpu_id) - __field(bool, changed) __field(bool, fast_switch) ), @@ -70,11 +67,10 @@ TRACE_EVENT(amd_pstate_perf, __entry->aperf = aperf; __entry->tsc = tsc; __entry->cpu_id = cpu_id; - __entry->changed = changed; __entry->fast_switch = fast_switch; ), - TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u changed=%s fast_switch=%s", + TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u fast_switch=%s", (unsigned long)__entry->min_perf, (unsigned long)__entry->target_perf, (unsigned long)__entry->capacity, @@ -83,7 +79,6 @@ TRACE_EVENT(amd_pstate_perf, (unsigned long long)__entry->aperf, (unsigned long long)__entry->tsc, (unsigned int)__entry->cpu_id, - (__entry->changed) ? "true" : "false", (__entry->fast_switch) ? "true" : "false" ) ); diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d1f82e4ca9b1..419790e52d91 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -222,25 +222,47 @@ static s16 shmem_get_epp(struct amd_cpudata *cpudata) } static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, - u32 des_perf, u32 max_perf, bool fast_switch) + u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) { - if (fast_switch) { - wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached)); + u64 value, prev; + + value = prev = READ_ONCE(cpudata->cppc_req_cached); + + value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | + AMD_CPPC_DES_PERF_MASK | AMD_CPPC_EPP_PERF_MASK); + value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf); + value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, des_perf); + value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); + value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + + if (value == prev) return 0; + + if (fast_switch) { + wrmsrl(MSR_AMD_CPPC_REQ, value); + return 0; + } else { + int ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + + if (ret) + return ret; } - return wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, - READ_ONCE(cpudata->cppc_req_cached)); + WRITE_ONCE(cpudata->cppc_req_cached, value); + WRITE_ONCE(cpudata->epp_cached, epp); + + return 0; } DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, u32 des_perf, - u32 max_perf, bool fast_switch) + u32 max_perf, u32 epp, + bool fast_switch) { return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, - max_perf, fast_switch); + max_perf, epp, fast_switch); } static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) @@ -261,6 +283,7 @@ static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) return ret; } + /* update both so that msr_update_perf() can effectively check */ WRITE_ONCE(cpudata->epp_cached, epp); WRITE_ONCE(cpudata->cppc_req_cached, value); @@ -459,12 +482,18 @@ static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) return static_call(amd_pstate_init_perf)(cpudata); } -static int shmem_update_perf(struct amd_cpudata *cpudata, - u32 min_perf, u32 des_perf, - u32 max_perf, bool fast_switch) +static int shmem_update_perf(struct amd_cpudata *cpudata, u32 min_perf, + u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) { struct cppc_perf_ctrls perf_ctrls; + if (cppc_state == AMD_PSTATE_ACTIVE) { + int ret = shmem_set_epp(cpudata, epp); + + if (ret) + return ret; + } + perf_ctrls.max_perf = max_perf; perf_ctrls.min_perf = min_perf; perf_ctrls.desired_perf = des_perf; @@ -510,9 +539,7 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, { unsigned long max_freq; struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); - u64 prev = READ_ONCE(cpudata->cppc_req_cached); u32 nominal_perf = READ_ONCE(cpudata->nominal_perf); - u64 value = prev; des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); @@ -528,27 +555,14 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, if (!cpudata->boost_supported) max_perf = min_t(unsigned long, nominal_perf, max_perf); - value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | - AMD_CPPC_DES_PERF_MASK); - value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf); - value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, des_perf); - value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); - if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc, - cpudata->cpu, (value != prev), fast_switch); + cpudata->cpu, fast_switch); } - if (value == prev) - goto cpufreq_policy_put; + amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch); - WRITE_ONCE(cpudata->cppc_req_cached, value); - - amd_pstate_update_perf(cpudata, min_perf, des_perf, - max_perf, fast_switch); - -cpufreq_policy_put: cpufreq_cpu_put(policy); } @@ -1544,36 +1558,24 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - u64 value; + u32 epp; amd_pstate_update_min_max_limit(policy); - value = READ_ONCE(cpudata->cppc_req_cached); - - value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | - AMD_CPPC_DES_PERF_MASK | AMD_CPPC_EPP_PERF_MASK); - value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, cpudata->max_limit_perf); - value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, 0); - value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, cpudata->min_limit_perf); - if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) - WRITE_ONCE(cpudata->epp_cached, 0); - value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, cpudata->epp_cached); - - WRITE_ONCE(cpudata->cppc_req_cached, value); + epp = 0; + else + epp = READ_ONCE(cpudata->epp_cached); if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, - cpudata->epp_cached, + trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp, cpudata->min_limit_perf, cpudata->max_limit_perf, policy->boost_enabled); } - amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, - cpudata->max_limit_perf, false); - - return amd_pstate_set_epp(cpudata, READ_ONCE(cpudata->epp_cached)); + return amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, + cpudata->max_limit_perf, epp, false); } static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) @@ -1602,7 +1604,7 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) return 0; } -static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) +static int amd_pstate_epp_reenable(struct amd_cpudata *cpudata) { u64 max_perf; int ret; @@ -1620,17 +1622,19 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) max_perf, cpudata->boost_state); } - amd_pstate_update_perf(cpudata, 0, 0, max_perf, false); - amd_pstate_set_epp(cpudata, cpudata->epp_cached); + return amd_pstate_update_perf(cpudata, 0, 0, max_perf, cpudata->epp_cached, false); } static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; + int ret; pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); - amd_pstate_epp_reenable(cpudata); + ret = amd_pstate_epp_reenable(cpudata); + if (ret) + return ret; cpudata->suspended = false; return 0; @@ -1654,10 +1658,8 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) min_perf, min_perf, policy->boost_enabled); } - amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false); - amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); - - return 0; + return amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, + AMD_CPPC_EPP_BALANCE_POWERSAVE, false); } static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) From f8fde687c911a366a6132aed85f4ee6b647b9160 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:46 -0600 Subject: [PATCH 34/70] cpufreq/amd-pstate: Drop ret variable from amd_pstate_set_energy_pref_index() The ret variable is not necessary. Reviewed-and-tested-by: Dhananjay Ugwekar Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-14-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 419790e52d91..4d665d9c76d3 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -319,13 +319,11 @@ static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp) static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, int pref_index) { - int epp = -EINVAL; - int ret; + int epp; if (!pref_index) epp = cpudata->epp_default; - - if (epp == -EINVAL) + else epp = epp_values[pref_index]; if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { @@ -341,9 +339,7 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, cpudata->boost_state); } - ret = amd_pstate_set_epp(cpudata, epp); - - return ret; + return amd_pstate_set_epp(cpudata, epp); } static inline int msr_cppc_enable(bool enable) From f9a378ff6443cdcd4387e5dbb76fa5fa549a83ec Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:47 -0600 Subject: [PATCH 35/70] cpufreq/amd-pstate: Set different default EPP policy for Epyc and Ryzen For Ryzen systems the EPP policy set by the BIOS is generally configured to performance as this is the default register value for the CPPC request MSR. If a user doesn't use additional software to configure EPP then the system will default biased towards performance and consume extra battery. Instead configure the default to "balanced_performance" for this case. Suggested-by: Artem S. Tashkinov Reviewed-by: Dhananjay Ugwekar Tested-by: Dhananjay Ugwekar Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219526 Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-15-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 4d665d9c76d3..97aee213821d 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1501,8 +1501,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata; - cpudata->epp_cached = cpudata->epp_default = amd_pstate_get_epp(cpudata); - policy->min = policy->cpuinfo.min_freq; policy->max = policy->cpuinfo.max_freq; @@ -1513,10 +1511,13 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) * the default cpufreq governor is neither powersave nor performance. */ if (amd_pstate_acpi_pm_profile_server() || - amd_pstate_acpi_pm_profile_undefined()) + amd_pstate_acpi_pm_profile_undefined()) { policy->policy = CPUFREQ_POLICY_PERFORMANCE; - else + cpudata->epp_default = amd_pstate_get_epp(cpudata); + } else { policy->policy = CPUFREQ_POLICY_POWERSAVE; + cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE; + } if (cpu_feature_enabled(X86_FEATURE_CPPC)) { ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); @@ -1529,6 +1530,9 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) return ret; WRITE_ONCE(cpudata->cppc_cap1_cached, value); } + ret = amd_pstate_set_epp(cpudata, cpudata->epp_default); + if (ret) + return ret; current_pstate_driver->adjust_perf = NULL; From 95fad7fb58cfaa2a295aa54a1f001a16b9324963 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:48 -0600 Subject: [PATCH 36/70] cpufreq/amd-pstate: Drop boost_state variable Currently boost_state is cached for every processor in cpudata structure and driver boost state is set for every processor. Both of these aren't necessary as the driver only needs to set once and the policy stores whether boost is enabled. Move the driver boost setting to registration and adjust all references to cached value to pull from the policy instead. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20241209185248.16301-16-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 26 +++++++++++++------------- drivers/cpufreq/amd-pstate.h | 1 - 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 97aee213821d..d7b1de97727a 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -316,9 +316,10 @@ static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp) return ret; } -static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, - int pref_index) +static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy, + int pref_index) { + struct amd_cpudata *cpudata = policy->driver_data; int epp; if (!pref_index) @@ -336,7 +337,7 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, epp, FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), - cpudata->boost_state); + policy->boost_enabled); } return amd_pstate_set_epp(cpudata, epp); @@ -746,7 +747,6 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) guard(mutex)(&amd_pstate_driver_lock); ret = amd_pstate_cpu_boost_update(policy, state); - WRITE_ONCE(cpudata->boost_state, !ret ? state : false); policy->boost_enabled = !ret ? state : false; refresh_frequency_limits(policy); @@ -768,9 +768,6 @@ static int amd_pstate_init_boost_support(struct amd_cpudata *cpudata) goto exit_err; } - /* at least one CPU supports CPB, even if others fail later on to set up */ - current_pstate_driver->boost_enabled = true; - ret = rdmsrl_on_cpu(cpudata->cpu, MSR_K7_HWCR, &boost_val); if (ret) { pr_err_once("failed to read initial CPU boost state!\n"); @@ -1176,7 +1173,6 @@ static ssize_t show_energy_performance_available_preferences( static ssize_t store_energy_performance_preference( struct cpufreq_policy *policy, const char *buf, size_t count) { - struct amd_cpudata *cpudata = policy->driver_data; char str_preference[21]; ssize_t ret; @@ -1190,7 +1186,7 @@ static ssize_t store_energy_performance_preference( guard(mutex)(&amd_pstate_limits_lock); - ret = amd_pstate_set_energy_pref_index(cpudata, ret); + ret = amd_pstate_set_energy_pref_index(policy, ret); return ret ? ret : count; } @@ -1265,6 +1261,9 @@ static int amd_pstate_register_driver(int mode) return ret; } + /* at least one CPU supports CPB */ + current_pstate_driver->boost_enabled = cpu_feature_enabled(X86_FEATURE_CPB); + ret = cpufreq_register_driver(current_pstate_driver); if (ret) { amd_pstate_driver_cleanup(); @@ -1604,8 +1603,9 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) return 0; } -static int amd_pstate_epp_reenable(struct amd_cpudata *cpudata) +static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) { + struct amd_cpudata *cpudata = policy->driver_data; u64 max_perf; int ret; @@ -1619,7 +1619,7 @@ static int amd_pstate_epp_reenable(struct amd_cpudata *cpudata) trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, cpudata->epp_cached, FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), - max_perf, cpudata->boost_state); + max_perf, policy->boost_enabled); } return amd_pstate_update_perf(cpudata, 0, 0, max_perf, cpudata->epp_cached, false); @@ -1632,7 +1632,7 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); - ret = amd_pstate_epp_reenable(cpudata); + ret = amd_pstate_epp_reenable(policy); if (ret) return ret; cpudata->suspended = false; @@ -1690,7 +1690,7 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) guard(mutex)(&amd_pstate_limits_lock); /* enable amd pstate from suspend state*/ - amd_pstate_epp_reenable(cpudata); + amd_pstate_epp_reenable(policy); cpudata->suspended = false; } diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 7765c82f975c..9747e3be6cee 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -98,7 +98,6 @@ struct amd_cpudata { u64 cppc_cap1_cached; bool suspended; s16 epp_default; - bool boost_state; }; /* From c759bc8e9046f9812238f506d70f07d3ea4206d4 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Wed, 11 Dec 2024 12:28:12 +0900 Subject: [PATCH 37/70] ACPI: fan: cleanup resources in the error path of .probe() Call thermal_cooling_device_unregister() and sysfs_remove_link() in the error path of acpi_fan_probe() to fix possible memory leak. This bug was found by an experimental static analysis tool that I am developing. Fixes: 05a83d972293 ("ACPI: register ACPI Fan as generic thermal cooling device") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241211032812.210164-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Rafael J. Wysocki --- drivers/acpi/fan_core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/fan_core.c b/drivers/acpi/fan_core.c index 3ea9cfcff46e..10016f52f4f4 100644 --- a/drivers/acpi/fan_core.c +++ b/drivers/acpi/fan_core.c @@ -371,19 +371,25 @@ static int acpi_fan_probe(struct platform_device *pdev) result = sysfs_create_link(&pdev->dev.kobj, &cdev->device.kobj, "thermal_cooling"); - if (result) + if (result) { dev_err(&pdev->dev, "Failed to create sysfs link 'thermal_cooling'\n"); + goto err_unregister; + } result = sysfs_create_link(&cdev->device.kobj, &pdev->dev.kobj, "device"); if (result) { dev_err(&pdev->dev, "Failed to create sysfs link 'device'\n"); - goto err_end; + goto err_remove_link; } return 0; +err_remove_link: + sysfs_remove_link(&pdev->dev.kobj, "thermal_cooling"); +err_unregister: + thermal_cooling_device_unregister(cdev); err_end: if (fan->acpi4) acpi_fan_delete_attributes(device); From 9d6c0e58514f8b57cd9c2c755e41623d6a966025 Mon Sep 17 00:00:00 2001 From: He Rongguang Date: Thu, 12 Dec 2024 10:14:59 +0800 Subject: [PATCH 38/70] cpupower: fix TSC MHz calculation Commit 'cpupower: Make TSC read per CPU for Mperf monitor' (c2adb1877b7) changes TSC counter reads per cpu, but left time diff global (from start of all cpus to end of all cpus), thus diff(time) is too large for a cpu's tsc counting, resulting in far less than acutal TSC_Mhz and thus `cpupower monitor` showing far less than actual cpu realtime frequency. /proc/cpuinfo shows frequency: cat /proc/cpuinfo | egrep -e 'processor' -e 'MHz' ... processor : 171 cpu MHz : 4108.498 ... before fix (System 100% busy): | Mperf || Idle_Stats CPU| C0 | Cx | Freq || POLL | C1 | C2 171| 0.77| 99.23| 2279|| 0.00| 0.00| 0.00 after fix (System 100% busy): | Mperf || Idle_Stats CPU| C0 | Cx | Freq || POLL | C1 | C2 171| 0.46| 99.54| 4095|| 0.00| 0.00| 0.00 Fixes: c2adb1877b76 ("cpupower: Make TSC read per CPU for Mperf monitor") Signed-off-by: He Rongguang Signed-off-by: Shuah Khan --- .../cpupower/utils/idle_monitor/mperf_monitor.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c index 0a03573ebcc2..73b6b10cbdd2 100644 --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c @@ -33,7 +33,7 @@ static int mperf_get_count_percent(unsigned int self_id, double *percent, unsigned int cpu); static int mperf_get_count_freq(unsigned int id, unsigned long long *count, unsigned int cpu); -static struct timespec time_start, time_end; +static struct timespec *time_start, *time_end; static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = { { @@ -174,7 +174,7 @@ static int mperf_get_count_percent(unsigned int id, double *percent, dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n", mperf_cstates[id].name, mperf_diff, tsc_diff); } else if (max_freq_mode == MAX_FREQ_SYSFS) { - timediff = max_frequency * timespec_diff_us(time_start, time_end); + timediff = max_frequency * timespec_diff_us(time_start[cpu], time_end[cpu]); *percent = 100.0 * mperf_diff / timediff; dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n", mperf_cstates[id].name, mperf_diff, timediff); @@ -207,7 +207,7 @@ static int mperf_get_count_freq(unsigned int id, unsigned long long *count, if (max_freq_mode == MAX_FREQ_TSC_REF) { /* Calculate max_freq from TSC count */ tsc_diff = tsc_at_measure_end[cpu] - tsc_at_measure_start[cpu]; - time_diff = timespec_diff_us(time_start, time_end); + time_diff = timespec_diff_us(time_start[cpu], time_end[cpu]); max_frequency = tsc_diff / time_diff; } @@ -226,9 +226,8 @@ static int mperf_start(void) { int cpu; - clock_gettime(CLOCK_REALTIME, &time_start); - for (cpu = 0; cpu < cpu_count; cpu++) { + clock_gettime(CLOCK_REALTIME, &time_start[cpu]); mperf_get_tsc(&tsc_at_measure_start[cpu]); mperf_init_stats(cpu); } @@ -243,9 +242,9 @@ static int mperf_stop(void) for (cpu = 0; cpu < cpu_count; cpu++) { mperf_measure_stats(cpu); mperf_get_tsc(&tsc_at_measure_end[cpu]); + clock_gettime(CLOCK_REALTIME, &time_end[cpu]); } - clock_gettime(CLOCK_REALTIME, &time_end); return 0; } @@ -349,6 +348,8 @@ struct cpuidle_monitor *mperf_register(void) aperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); tsc_at_measure_start = calloc(cpu_count, sizeof(unsigned long long)); tsc_at_measure_end = calloc(cpu_count, sizeof(unsigned long long)); + time_start = calloc(cpu_count, sizeof(struct timespec)); + time_end = calloc(cpu_count, sizeof(struct timespec)); mperf_monitor.name_len = strlen(mperf_monitor.name); return &mperf_monitor; } @@ -361,6 +362,8 @@ void mperf_unregister(void) free(aperf_current_count); free(tsc_at_measure_start); free(tsc_at_measure_end); + free(time_start); + free(time_end); free(is_valid); } From de51589f9bd98efddf4ab776d3a490e81905ef7c Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Wed, 11 Dec 2024 12:26:05 +0000 Subject: [PATCH 39/70] cpufreq: intel_pstate: Use CPUFREQ_POLICY_UNKNOWN epp_policy uses the same values as cpufreq_policy.policy and resets to CPUFREQ_POLICY_UNKNOWN during offlining. Be consistent about it and initialize to CPUFREQ_POLICY_UNKNOWN instead of 0, too. No functional change intended. Signed-off-by: Christian Loehle Link: https://patch.msgid.link/20241211122605.3048503-3-christian.loehle@arm.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9e14374498d6..9c4cc01fd51a 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2713,7 +2713,7 @@ static int intel_pstate_init_cpu(unsigned int cpunum) } cpu->epp_powersave = -EINVAL; - cpu->epp_policy = 0; + cpu->epp_policy = CPUFREQ_POLICY_UNKNOWN; intel_pstate_get_cpu_pstates(cpu); From 8e461a1cb43d69d2fc8a97e61916dce571e6bb31 Mon Sep 17 00:00:00 2001 From: "Sultan Alsawaf (unemployed)" Date: Wed, 11 Dec 2024 17:57:32 -0800 Subject: [PATCH 40/70] cpufreq: schedutil: Fix superfluous updates caused by need_freq_update A redundant frequency update is only truly needed when there is a policy limits change with a driver that specifies CPUFREQ_NEED_UPDATE_LIMITS. In spite of that, drivers specifying CPUFREQ_NEED_UPDATE_LIMITS receive a frequency update _all the time_, not just for a policy limits change, because need_freq_update is never cleared. Furthermore, ignore_dl_rate_limit()'s usage of need_freq_update also leads to a redundant frequency update, regardless of whether or not the driver specifies CPUFREQ_NEED_UPDATE_LIMITS, when the next chosen frequency is the same as the current one. Fix the superfluous updates by only honoring CPUFREQ_NEED_UPDATE_LIMITS when there's a policy limits change, and clearing need_freq_update when a requisite redundant update occurs. This is neatly achieved by moving up the CPUFREQ_NEED_UPDATE_LIMITS test and instead setting need_freq_update to false in sugov_update_next_freq(). Fixes: 600f5badb78c ("cpufreq: schedutil: Don't skip freq update when limits change") Signed-off-by: Sultan Alsawaf (unemployed) Reviewed-by: Christian Loehle Link: https://patch.msgid.link/20241212015734.41241-2-sultan@kerneltoast.com Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 28c77904ea74..e51d5ce730be 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -83,7 +83,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (unlikely(sg_policy->limits_changed)) { sg_policy->limits_changed = false; - sg_policy->need_freq_update = true; + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); return true; } @@ -96,7 +96,7 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { if (sg_policy->need_freq_update) - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + sg_policy->need_freq_update = false; else if (sg_policy->next_freq == next_freq) return false; From ebeeee390b6a341770789a50d81e677da9a103d9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Dec 2024 13:01:02 +0100 Subject: [PATCH 41/70] PM: EM: Move sched domains rebuild function from schedutil to EM Function sugov_eas_rebuild_sd() defined in the schedutil cpufreq governor implements generic functionality that may be useful in other places. In particular, there is a plan to use it in the intel_pstate driver in the future. For this reason, move it from schedutil to the energy model code and rename it to em_rebuild_sched_domains(). This also helps to get rid of some #ifdeffery in schedutil which is a plus. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle --- drivers/cpufreq/cpufreq.c | 2 +- include/linux/energy_model.h | 2 ++ kernel/power/energy_model.c | 17 ++++++++++++++++ kernel/sched/cpufreq_schedutil.c | 33 ++++++-------------------------- 4 files changed, 26 insertions(+), 28 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 1a4cae54a01b..418236fef172 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1538,7 +1538,7 @@ static int cpufreq_online(unsigned int cpu) /* * Register with the energy model before - * sugov_eas_rebuild_sd() is called, which will result + * em_rebuild_sched_domains() is called, which will result * in rebuilding of the sched domains, which should only be done * once the energy model is properly initialized for the policy * first. diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 752e0b297582..78318d49276d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -179,6 +179,7 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int em_dev_update_chip_binning(struct device *dev); int em_update_performance_limits(struct em_perf_domain *pd, unsigned long freq_min_khz, unsigned long freq_max_khz); +void em_rebuild_sched_domains(void); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -404,6 +405,7 @@ int em_update_performance_limits(struct em_perf_domain *pd, { return -EINVAL; } +static inline void em_rebuild_sched_domains(void) {} #endif #endif diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d07faf42eace..3874f0e97651 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -908,3 +908,20 @@ int em_update_performance_limits(struct em_perf_domain *pd, return 0; } EXPORT_SYMBOL_GPL(em_update_performance_limits); + +static void rebuild_sd_workfn(struct work_struct *work) +{ + rebuild_sched_domains_energy(); +} + +void em_rebuild_sched_domains(void) +{ + static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + + /* + * When called from the cpufreq_register_driver() path, the + * cpu_hotplug_lock is already held, so use a work item to + * avoid nested locking in rebuild_sched_domains(). + */ + schedule_work(&rebuild_sd_work); +} diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 28c77904ea74..1c07a2f8c8b5 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -604,31 +604,6 @@ static const struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -#ifdef CONFIG_ENERGY_MODEL -static void rebuild_sd_workfn(struct work_struct *work) -{ - rebuild_sched_domains_energy(); -} - -static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); - -/* - * EAS shouldn't be attempted without sugov, so rebuild the sched_domains - * on governor changes to make sure the scheduler knows about it. - */ -static void sugov_eas_rebuild_sd(void) -{ - /* - * When called from the cpufreq_register_driver() path, the - * cpu_hotplug_lock is already held, so use a work item to - * avoid nested locking in rebuild_sched_domains(). - */ - schedule_work(&rebuild_sd_work); -} -#else -static inline void sugov_eas_rebuild_sd(void) { }; -#endif - struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) @@ -784,7 +759,11 @@ static int sugov_init(struct cpufreq_policy *policy) goto fail; out: - sugov_eas_rebuild_sd(); + /* + * Schedutil is the preferred governor for EAS, so rebuild sched domains + * on governor changes to make sure the scheduler knows about them. + */ + em_rebuild_sched_domains(); mutex_unlock(&global_tunables_lock); return 0; @@ -826,7 +805,7 @@ static void sugov_exit(struct cpufreq_policy *policy) sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); - sugov_eas_rebuild_sd(); + em_rebuild_sched_domains(); } static int sugov_start(struct cpufreq_policy *policy) From 4596cbea0ed2ef4f563a92775c9f612700ece145 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 16 Dec 2024 11:08:21 -0800 Subject: [PATCH 42/70] thermal: intel: Remove explicit user_space governor selection Currently some user space programs like Linux thermald needs to register to get notifications from both thermal user space governor and also Thermal netlink. This is required as some messages like HFI (Hardware Feedback Notifications) requires Thermal netlink. This results in additional processing in kernel and user space to process both notifications. The cost of using user space governor using kobject_uevent is much higher as this is also used by other user space daemons like udev daemon. Do not select user_space thermal governor by default. If it is present user space programs can always use this governor by writing to "policy" attribute. Instead from the kernel select THERMAL_NETLINK. Trip temperature violation can be received by user space programs via thermal netlink events: THERMAL_GENL_EVENT_TZ_TRIP_UP THERMAL_GENL_EVENT_TZ_TRIP_DOWN Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20241216190821.1137162-1-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/Kconfig | 2 +- drivers/thermal/intel/int340x_thermal/Kconfig | 2 +- drivers/thermal/intel/int340x_thermal/int3400_thermal.c | 1 - drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c | 1 - .../intel/int340x_thermal/processor_thermal_device_pci.c | 1 - 5 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index a31f2f32996a..9c0f66f9defc 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -22,7 +22,7 @@ config INTEL_TCC config X86_PKG_TEMP_THERMAL tristate "X86 package temperature thermal driver" depends on X86_THERMAL_VECTOR - select THERMAL_GOV_USER_SPACE + select THERMAL_NETLINK select INTEL_TCC default m help diff --git a/drivers/thermal/intel/int340x_thermal/Kconfig b/drivers/thermal/intel/int340x_thermal/Kconfig index e76b13e44d03..d9a74424c29d 100644 --- a/drivers/thermal/intel/int340x_thermal/Kconfig +++ b/drivers/thermal/intel/int340x_thermal/Kconfig @@ -6,7 +6,7 @@ config INT340X_THERMAL tristate "ACPI INT340X thermal drivers" depends on X86_64 && ACPI && PCI - select THERMAL_GOV_USER_SPACE + select THERMAL_NETLINK select ACPI_THERMAL_REL select ACPI_FAN select ACPI_THERMAL_LIB diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index 5805e08d71be..0e07693ecf59 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -521,7 +521,6 @@ static struct thermal_zone_device_ops int3400_thermal_ops = { }; static struct thermal_zone_params int3400_thermal_params = { - .governor_name = "user_space", .no_hwmon = true, }; diff --git a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c index 31ed338eb83c..8dca6a6aceca 100644 --- a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c +++ b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c @@ -105,7 +105,6 @@ static int int340x_thermal_read_trips(struct acpi_device *zone_adev, } static struct thermal_zone_params int340x_thermal_params = { - .governor_name = "user_space", .no_hwmon = true, }; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index 707bd1b23823..7e37fd23b23c 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -272,7 +272,6 @@ static const struct thermal_zone_device_ops tzone_ops = { }; static struct thermal_zone_params tzone_params = { - .governor_name = "user_space", .no_hwmon = true, }; From e50eeababa946d92f1669fd4d66a5f652b143b05 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 16 Dec 2024 13:18:09 -0800 Subject: [PATCH 43/70] thermal: intel: int340x: Panther Lake DLVR support Panther Lake follows same register set as Lunar Lake for DLVR. Enable feature flag to support DLVR. Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/thermal: intel: int340x: Panther Lake DLVR support Signed-off-by: Rafael J. Wysocki --- .../intel/int340x_thermal/processor_thermal_device_pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index 7e37fd23b23c..4fdbf4155459 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -494,7 +494,8 @@ static const struct pci_device_id proc_thermal_pci_ids[] = { PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_DLVR | PROC_THERMAL_FEATURE_WT_HINT) }, { PCI_DEVICE_DATA(INTEL, RPL_THERMAL, PROC_THERMAL_FEATURE_RAPL | PROC_THERMAL_FEATURE_FIVR | PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_WT_REQ) }, - { PCI_DEVICE_DATA(INTEL, PTL_THERMAL, PROC_THERMAL_FEATURE_RAPL) }, + { PCI_DEVICE_DATA(INTEL, PTL_THERMAL, PROC_THERMAL_FEATURE_RAPL | + PROC_THERMAL_FEATURE_DLVR) }, { }, }; From b59bd75a4b0983d6bcd10987b32ea9b9042db88c Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 16 Dec 2024 13:18:10 -0800 Subject: [PATCH 44/70] thermal: intel: int340x: Panther Lake power floor and workload hint support Panther Lake follows same register set as Lunar Lake. Enable feature flags to support workload hints and power floor status. Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20241216211810.1207028-2-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- .../intel/int340x_thermal/processor_thermal_device_pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index 4fdbf4155459..a55aaa8cef42 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -495,7 +495,8 @@ static const struct pci_device_id proc_thermal_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, RPL_THERMAL, PROC_THERMAL_FEATURE_RAPL | PROC_THERMAL_FEATURE_FIVR | PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_WT_REQ) }, { PCI_DEVICE_DATA(INTEL, PTL_THERMAL, PROC_THERMAL_FEATURE_RAPL | - PROC_THERMAL_FEATURE_DLVR) }, + PROC_THERMAL_FEATURE_DLVR | PROC_THERMAL_FEATURE_MSI_SUPPORT | + PROC_THERMAL_FEATURE_WT_HINT | PROC_THERMAL_FEATURE_POWER_FLOOR) }, { }, }; From b317268368546d6401af788648668f82e3ba1bd3 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Wed, 18 Dec 2024 13:09:35 +0900 Subject: [PATCH 45/70] PM: wakeup: implement devm_device_init_wakeup() helper Some drivers that enable device wakeup fail to properly disable it during their cleanup, which results in a memory leak. To address this, introduce devm_device_init_wakeup(), a managed variant of device_init_wakeup(dev, true). With this managed helper, wakeup functionality will be automatically disabled when the device is released, ensuring a more reliable cleanup process. This need for this addition arose during a previous discussion [1]. Link: https://lore.kernel.org/linux-rtc/20241212100403.3799667-1-joe@pf.is.s.u-tokyo.ac.jp/ [1] Suggested-by: Alexandre Belloni Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241218040935.1921416-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Rafael J. Wysocki --- include/linux/pm_wakeup.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 222f7530806c..d501c09c60cd 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -240,4 +240,21 @@ static inline int device_init_wakeup(struct device *dev, bool enable) return 0; } +static void device_disable_wakeup(void *dev) +{ + device_init_wakeup(dev, false); +} + +/** + * devm_device_init_wakeup - Resource managed device wakeup initialization. + * @dev: Device to handle. + * + * This function is the devm managed version of device_init_wakeup(dev, true). + */ +static inline int devm_device_init_wakeup(struct device *dev) +{ + device_init_wakeup(dev, true); + return devm_add_action_or_reset(dev, device_disable_wakeup, dev); +} + #endif /* _LINUX_PM_WAKEUP_H */ From 207a792d01603faae08d9bd82846bd99c42c30cc Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 18 Dec 2024 13:44:44 -0800 Subject: [PATCH 46/70] thermal: intel: Fix compile issue when CONFIG_NET is not defined If CONFIG_NET is not defined then THERMAL_NETLINK can't be selected. Hence add dependency on CONFIG_NET. Othewise it will generate compile errors while compiling thermal_netlink.c. Fixes: 4596cbea0ed2 ("thermal: intel: Remove explicit user_space governor selection") Reported-by: kernel test robot Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20241218214444.1904650-1-srinivas.pandruvada@linux.intel.com [ rjw: Merge the "depends on" lines ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/Kconfig | 2 +- drivers/thermal/intel/int340x_thermal/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index 9c0f66f9defc..e0268fac7093 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -21,7 +21,7 @@ config INTEL_TCC config X86_PKG_TEMP_THERMAL tristate "X86 package temperature thermal driver" - depends on X86_THERMAL_VECTOR + depends on X86_THERMAL_VECTOR && NET select THERMAL_NETLINK select INTEL_TCC default m diff --git a/drivers/thermal/intel/int340x_thermal/Kconfig b/drivers/thermal/intel/int340x_thermal/Kconfig index d9a74424c29d..4c699f0896b5 100644 --- a/drivers/thermal/intel/int340x_thermal/Kconfig +++ b/drivers/thermal/intel/int340x_thermal/Kconfig @@ -5,7 +5,7 @@ config INT340X_THERMAL tristate "ACPI INT340X thermal drivers" - depends on X86_64 && ACPI && PCI + depends on X86_64 && ACPI && PCI && NET select THERMAL_NETLINK select ACPI_THERMAL_REL select ACPI_FAN From 79f237bae910e1019f1f7617d7b0b900f717d209 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:15 +0000 Subject: [PATCH 47/70] ACPI: bus: change the prototype for acpi_get_physical_device_location It generally is not OK to use acpi_status and/or AE_ error codes without CONFIG_ACPI and they really only should be used in drivers/acpi/ (and not everywhere in there for that matter). So acpi_get_physical_device_location() needs to be redefined to return something different from acpi_status (preferably bool) in order to be used in !CONFIG_ACPI code. Suggested-by: Rafael J. Wysocki Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-1-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- drivers/acpi/mipi-disco-img.c | 3 +-- drivers/acpi/scan.c | 4 +--- drivers/acpi/utils.c | 7 +++---- drivers/base/physical_location.c | 4 +--- drivers/media/pci/intel/ipu-bridge.c | 4 ++-- drivers/usb/core/usb-acpi.c | 3 +-- include/acpi/acpi_bus.h | 2 +- 7 files changed, 10 insertions(+), 17 deletions(-) diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index 92b658f92dc0..5b85989f96be 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -624,8 +624,7 @@ static void init_crs_csi2_swnodes(struct crs_csi2 *csi2) if (!fwnode_property_present(adev_fwnode, "rotation")) { struct acpi_pld_info *pld; - status = acpi_get_physical_device_location(handle, &pld); - if (ACPI_SUCCESS(status)) { + if (acpi_get_physical_device_location(handle, &pld)) { swnodes->dev_props[NEXT_PROPERTY(prop_index, DEV_ROTATION)] = PROPERTY_ENTRY_U32("rotation", pld->rotation * 45U); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 74dcccdc6482..93d340027b7f 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -723,10 +723,8 @@ int acpi_tie_acpi_dev(struct acpi_device *adev) static void acpi_store_pld_crc(struct acpi_device *adev) { struct acpi_pld_info *pld; - acpi_status status; - status = acpi_get_physical_device_location(adev->handle, &pld); - if (ACPI_FAILURE(status)) + if (!acpi_get_physical_device_location(adev->handle, &pld)) return; adev->pld_crc = crc32(~0, pld, sizeof(*pld)); diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 6de542d99518..526563a0d188 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -494,7 +494,7 @@ bool acpi_device_dep(acpi_handle target, acpi_handle match) } EXPORT_SYMBOL_GPL(acpi_device_dep); -acpi_status +bool acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld) { acpi_status status; @@ -502,9 +502,8 @@ acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld union acpi_object *output; status = acpi_evaluate_object(handle, "_PLD", NULL, &buffer); - if (ACPI_FAILURE(status)) - return status; + return false; output = buffer.pointer; @@ -523,7 +522,7 @@ acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld out: kfree(buffer.pointer); - return status; + return ACPI_SUCCESS(status); } EXPORT_SYMBOL(acpi_get_physical_device_location); diff --git a/drivers/base/physical_location.c b/drivers/base/physical_location.c index 951819e71b4a..5db06e825c94 100644 --- a/drivers/base/physical_location.c +++ b/drivers/base/physical_location.c @@ -13,13 +13,11 @@ bool dev_add_physical_location(struct device *dev) { struct acpi_pld_info *pld; - acpi_status status; if (!has_acpi_companion(dev)) return false; - status = acpi_get_physical_device_location(ACPI_HANDLE(dev), &pld); - if (ACPI_FAILURE(status)) + if (!acpi_get_physical_device_location(ACPI_HANDLE(dev), &pld)) return false; dev->physical_location = diff --git a/drivers/media/pci/intel/ipu-bridge.c b/drivers/media/pci/intel/ipu-bridge.c index 1bf249f446a9..49aa7c51791e 100644 --- a/drivers/media/pci/intel/ipu-bridge.c +++ b/drivers/media/pci/intel/ipu-bridge.c @@ -259,12 +259,12 @@ static enum v4l2_fwnode_orientation ipu_bridge_parse_orientation(struct acpi_dev { enum v4l2_fwnode_orientation orientation; struct acpi_pld_info *pld = NULL; - acpi_status status = AE_ERROR; + bool status = false; #if IS_ENABLED(CONFIG_ACPI) status = acpi_get_physical_device_location(adev->handle, &pld); #endif - if (ACPI_FAILURE(status)) { + if (!status) { dev_warn(ADEV_DEV(adev), "_PLD call failed, using default orientation\n"); return V4L2_FWNODE_ORIENTATION_EXTERNAL; } diff --git a/drivers/usb/core/usb-acpi.c b/drivers/usb/core/usb-acpi.c index 03c22114214b..935c0efea0b6 100644 --- a/drivers/usb/core/usb-acpi.c +++ b/drivers/usb/core/usb-acpi.c @@ -213,8 +213,7 @@ usb_acpi_get_connect_type(struct usb_port *port_dev, acpi_handle *handle) * no connectable, the port would be not used. */ - status = acpi_get_physical_device_location(handle, &pld); - if (ACPI_SUCCESS(status) && pld) + if (acpi_get_physical_device_location(handle, &pld) && pld) port_dev->location = USB_ACPI_LOCATION_VALID | pld->group_token << 8 | pld->group_position; diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index b2e377b7f337..19f92852e127 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -43,7 +43,7 @@ acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); -acpi_status +bool acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); bool acpi_has_method(acpi_handle handle, char *name); From e04efd34d9fe36e934e86e61f568c427a999aa5c Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:16 +0000 Subject: [PATCH 48/70] ACPI: bus: implement for_each_acpi_dev_match when !ACPI Provide an implementation of for_each_acpi_dev_match that can be used when CONFIG_ACPI is not set. The condition `false && hid && uid && hrv` is used to avoid "variable not used" warnings. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-2-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 19f92852e127..a9b5a5204781 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -1003,6 +1003,9 @@ static inline int unregister_acpi_bus_type(void *bus) { return 0; } static inline int acpi_wait_for_acpi_ipmi(void) { return 0; } +#define for_each_acpi_dev_match(adev, hid, uid, hrv) \ + for (adev = NULL; false && (hid) && (uid) && (hrv); ) + #endif /* CONFIG_ACPI */ #endif /*__ACPI_BUS_H__*/ From 34df77363dff6df1e95ee4c1e035de464da4ee4b Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:17 +0000 Subject: [PATCH 49/70] ACPI: bus: implement acpi_get_physical_device_location when !ACPI Provide an implementation of acpi_get_physical_device_location that can be used when CONFIG_ACPI is not set. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-3-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index a9b5a5204781..f38e8b1c8e1f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -43,9 +43,6 @@ acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); -bool -acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); - bool acpi_has_method(acpi_handle handle, char *name); acpi_status acpi_execute_simple_method(acpi_handle handle, char *method, u64 arg); @@ -60,6 +57,9 @@ bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs); union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4); #ifdef CONFIG_ACPI +bool +acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); + static inline union acpi_object * acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, @@ -1003,6 +1003,12 @@ static inline int unregister_acpi_bus_type(void *bus) { return 0; } static inline int acpi_wait_for_acpi_ipmi(void) { return 0; } +static inline bool +acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld) +{ + return false; +} + #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = NULL; false && (hid) && (uid) && (hrv); ) From 7d3707bbbbb1dc8b1802886e9ff0506b7d8b323b Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:18 +0000 Subject: [PATCH 50/70] ACPI: header: implement acpi_device_handle when !ACPI Provide an implementation of acpi_device_handle that can be used when CONFIG_ACPI is not set. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-4-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6adcd1b92b20..4e495b29c640 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -854,6 +854,11 @@ static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) return NULL; } +static inline acpi_handle acpi_device_handle(struct acpi_device *adev) +{ + return NULL; +} + static inline bool has_acpi_companion(struct device *dev) { return false; From 46d10e5f9c346e6f3aa828895ed8b49cc6d9c0f6 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:19 +0000 Subject: [PATCH 51/70] ACPI: bus: implement for_each_acpi_consumer_dev when !ACPI Provide an implementation of for_each_acpi_consumer_dev that can be use used when CONFIG_ACPI is not set. The expression `false && supplier` is used to avoid "variable not used" warnings. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-5-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index f38e8b1c8e1f..68c0e2fb029f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -1009,6 +1009,9 @@ acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld return false; } +#define for_each_acpi_consumer_dev(supplier, consumer) \ + for (consumer = NULL; false && (supplier);) + #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = NULL; false && (hid) && (uid) && (hrv); ) From 78c3227c5e268d40d0a08fea8b244afeb182610c Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:20 +0000 Subject: [PATCH 52/70] ACPI: bus: implement acpi_device_hid when !ACPI Provide an implementation of acpi_device_hid that can be used when CONFIG_ACPI is not set. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-6-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 68c0e2fb029f..aad1a95e6863 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -1003,6 +1003,11 @@ static inline int unregister_acpi_bus_type(void *bus) { return 0; } static inline int acpi_wait_for_acpi_ipmi(void) { return 0; } +static inline const char *acpi_device_hid(struct acpi_device *device) +{ + return ""; +} + static inline bool acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld) { From 84524b1dadd8c4dd772d22b7effe45e7e65c27f5 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:21 +0000 Subject: [PATCH 53/70] media: ipu-bridge: Remove unneeded conditional compilations The ACPI headers have introduced implementations for some of their functions when the kernel is not configured with ACPI. Let's use them instead of our conditional compilation. It is easier to maintain and less prone to errors. Reviewed-by: Mauro Carvalho Chehab Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-7-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- drivers/media/pci/intel/ipu-bridge.c | 29 ++++------------------------ 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/drivers/media/pci/intel/ipu-bridge.c b/drivers/media/pci/intel/ipu-bridge.c index 49aa7c51791e..1cb745855600 100644 --- a/drivers/media/pci/intel/ipu-bridge.c +++ b/drivers/media/pci/intel/ipu-bridge.c @@ -2,6 +2,7 @@ /* Author: Dan Scally */ #include +#include #include #include #include @@ -107,7 +108,6 @@ static const char * const ipu_vcm_types[] = { "lc898212axb", }; -#if IS_ENABLED(CONFIG_ACPI) /* * Used to figure out IVSC acpi device by ipu_bridge_get_ivsc_acpi_dev() * instead of device and driver match to probe IVSC device. @@ -127,11 +127,11 @@ static struct acpi_device *ipu_bridge_get_ivsc_acpi_dev(struct acpi_device *adev const struct acpi_device_id *acpi_id = &ivsc_acpi_ids[i]; struct acpi_device *consumer, *ivsc_adev; - acpi_handle handle = acpi_device_handle(adev); + acpi_handle handle = acpi_device_handle(ACPI_PTR(adev)); for_each_acpi_dev_match(ivsc_adev, acpi_id->id, NULL, -1) /* camera sensor depends on IVSC in DSDT if exist */ for_each_acpi_consumer_dev(ivsc_adev, consumer) - if (consumer->handle == handle) { + if (ACPI_PTR(consumer->handle) == handle) { acpi_dev_put(consumer); return ivsc_adev; } @@ -139,12 +139,6 @@ static struct acpi_device *ipu_bridge_get_ivsc_acpi_dev(struct acpi_device *adev return NULL; } -#else -static struct acpi_device *ipu_bridge_get_ivsc_acpi_dev(struct acpi_device *adev) -{ - return NULL; -} -#endif static int ipu_bridge_match_ivsc_dev(struct device *dev, const void *adev) { @@ -259,12 +253,8 @@ static enum v4l2_fwnode_orientation ipu_bridge_parse_orientation(struct acpi_dev { enum v4l2_fwnode_orientation orientation; struct acpi_pld_info *pld = NULL; - bool status = false; -#if IS_ENABLED(CONFIG_ACPI) - status = acpi_get_physical_device_location(adev->handle, &pld); -#endif - if (!status) { + if (!acpi_get_physical_device_location(ACPI_PTR(adev->handle), &pld)) { dev_warn(ADEV_DEV(adev), "_PLD call failed, using default orientation\n"); return V4L2_FWNODE_ORIENTATION_EXTERNAL; } @@ -498,9 +488,7 @@ static void ipu_bridge_create_connection_swnodes(struct ipu_bridge *bridge, if (sensor->csi_dev) { const char *device_hid = ""; -#if IS_ENABLED(CONFIG_ACPI) device_hid = acpi_device_hid(sensor->ivsc_adev); -#endif snprintf(sensor->ivsc_name, sizeof(sensor->ivsc_name), "%s-%u", device_hid, sensor->link); @@ -671,11 +659,7 @@ static int ipu_bridge_connect_sensor(const struct ipu_sensor_config *cfg, struct acpi_device *adev = NULL; int ret; -#if IS_ENABLED(CONFIG_ACPI) for_each_acpi_dev_match(adev, cfg->hid, NULL, -1) { -#else - while (true) { -#endif if (!ACPI_PTR(adev->status.enabled)) continue; @@ -768,15 +752,10 @@ static int ipu_bridge_ivsc_is_ready(void) unsigned int i; for (i = 0; i < ARRAY_SIZE(ipu_supported_sensors); i++) { -#if IS_ENABLED(CONFIG_ACPI) const struct ipu_sensor_config *cfg = &ipu_supported_sensors[i]; for_each_acpi_dev_match(sensor_adev, cfg->hid, NULL, -1) { -#else - while (true) { - sensor_adev = NULL; -#endif if (!ACPI_PTR(sensor_adev->status.enabled)) continue; From 17159601d2ec40b849d3f12713eb6f92093b6748 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 19 Dec 2024 17:21:48 +0200 Subject: [PATCH 54/70] ACPI: property: Consider data nodes as being available New functions making use of the data node availability information, like fwnode_for_each_available_child_node(), have been added years after fwnode_device_is_available() was introduced. To enumerate the data nodes in various ways specific to those functions, the node availability test needs to pass. On ACPI, there is no explicit data node availbility information in the first place and the original fwnode_device_is_available() implementation simply returns false. This causes new functions that only enumerate available nodes to never return any nodes on ACPI for leaf devices that have child data nodes. However, on the DT side, fwnode_device_is_available() returns true for all nodes without the "status" property which are analogous to the ACPI data nodes, so there is a difference in behavior between DT and ACPI in that respect. Thus from now on, return true from fwnode_device_is_available() on all ACPI data nodes. Signed-off-by: Sakari Ailus Link: https://patch.msgid.link/20241219152148.975622-1-sakari.ailus@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 80a52a4e66dd..1ee81e771ae6 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1492,7 +1492,7 @@ acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode) static bool acpi_fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (!is_acpi_device_node(fwnode)) - return false; + return true; return acpi_device_is_present(to_acpi_device_node(fwnode)); } From dec2f97a1571ed28ddbadf4431afc5e5872a10df Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:50 -0600 Subject: [PATCH 55/70] cpupower: Remove spurious return statement print_duration() has a return; statement at the end of the function that is not necessary as it's a void function. Link: https://lore.kernel.org/r/20241218191144.3440854-2-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index c96b77365c63..5f092f3c729e 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -120,7 +120,6 @@ static void print_duration(unsigned long duration) } else printf("%lu ns", duration); } - return; } static int get_boost_mode_x86(unsigned int cpu) From 3f2eb7606eee37aea630c4b7aa42497bc36ca157 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:51 -0600 Subject: [PATCH 56/70] cpupower: Add support for parsing 'enabled' or 'disabled' strings from table When cpufreq_get_sysfs_value_from_table() is passed a table with kernel strings that report 'enabled' or 'disabled' it always returns 0 because these can't cleanly convert to integers. Explicitly look for enabled or disabled strings from the kernel to handle this. Link: https://lore.kernel.org/r/20241218191144.3440854-3-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpufreq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/power/cpupower/lib/cpufreq.c b/tools/power/cpupower/lib/cpufreq.c index 1516d23c17c9..f27ee6d4b000 100644 --- a/tools/power/cpupower/lib/cpufreq.c +++ b/tools/power/cpupower/lib/cpufreq.c @@ -102,6 +102,10 @@ unsigned long cpufreq_get_sysfs_value_from_table(unsigned int cpu, if (len == 0) return 0; + if (!strcmp(linebuf, "enabled\n")) + return 1; + if (!strcmp(linebuf, "disabled\n")) + return 0; value = strtoul(linebuf, &endp, 0); if (endp == linebuf || errno == ERANGE) From 6d4a2987f96b9f281b07286eeb1d4022054e1ecd Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:52 -0600 Subject: [PATCH 57/70] cpupower: Add support for amd-pstate preferred core rankings The rankings are useful information to determine if the scheduler is placing tasks appropriately for the hardware. Link: https://lore.kernel.org/r/20241218191144.3440854-4-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 0a56e22240fc..090fdd6d1551 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -177,6 +177,8 @@ enum amd_pstate_value { AMD_PSTATE_HIGHEST_PERF, AMD_PSTATE_MAX_FREQ, AMD_PSTATE_LOWEST_NONLINEAR_FREQ, + AMD_PSTATE_HW_PREFCORE, + AMD_PSTATE_PREFCORE_RANKING, MAX_AMD_PSTATE_VALUE_READ_FILES, }; @@ -184,6 +186,8 @@ static const char *amd_pstate_value_files[MAX_AMD_PSTATE_VALUE_READ_FILES] = { [AMD_PSTATE_HIGHEST_PERF] = "amd_pstate_highest_perf", [AMD_PSTATE_MAX_FREQ] = "amd_pstate_max_freq", [AMD_PSTATE_LOWEST_NONLINEAR_FREQ] = "amd_pstate_lowest_nonlinear_freq", + [AMD_PSTATE_HW_PREFCORE] = "amd_pstate_hw_prefcore", + [AMD_PSTATE_PREFCORE_RANKING] = "amd_pstate_prefcore_ranking", }; static unsigned long amd_pstate_get_data(unsigned int cpu, @@ -240,6 +244,10 @@ void amd_pstate_show_perf_and_freq(unsigned int cpu, int no_rounding) acpi_cppc_get_data(cpu, LOWEST_PERF)); print_speed(acpi_cppc_get_data(cpu, LOWEST_FREQ) * 1000, no_rounding); printf(".\n"); + + printf(_(" Preferred Core Support: %lu. Preferred Core Ranking: %lu.\n"), + amd_pstate_get_data(cpu, AMD_PSTATE_HW_PREFCORE), + amd_pstate_get_data(cpu, AMD_PSTATE_PREFCORE_RANKING)); } /* AMD P-State Helper Functions ************************************/ From 26e16174f54d40a3774614c4d43966572ed79dc1 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:53 -0600 Subject: [PATCH 58/70] cpupower: Don't try to read frequency from hardware when kernel uses aperfmperf When the amd-pstate is in use frequency is set by the hardware and measured by the kernel through using the aperf and mperf registers. There is no direct call to the hardware to indicate current frequency. Detect that this feature is in use and skip the check. Link: https://lore.kernel.org/r/20241218191144.3440854-5-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index 5f092f3c729e..3df28e45be42 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -254,7 +254,12 @@ static int get_freq_kernel(unsigned int cpu, unsigned int human) static int get_freq_hardware(unsigned int cpu, unsigned int human) { - unsigned long freq = cpufreq_get_freq_hardware(cpu); + unsigned long freq; + + if (cpupower_cpu_info.caps & CPUPOWER_CAP_APERF) + return -EINVAL; + + freq = cpufreq_get_freq_hardware(cpu); printf(_(" current CPU frequency: ")); if (!freq) { printf("Unable to call hardware\n"); From 5f567afc283fc9e7c6a34d013c4fc6c5e8d6afae Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:54 -0600 Subject: [PATCH 59/70] cpupower: Add support for showing energy performance preference The EPP value is useful for characterization of performance. Show it in cpupower frequency-info output. Link: https://lore.kernel.org/r/20241218191144.3440854-6-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpufreq.c | 14 +++++++++++++ tools/power/cpupower/lib/cpufreq.h | 8 ++++++++ tools/power/cpupower/utils/cpufreq-info.c | 25 ++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/tools/power/cpupower/lib/cpufreq.c b/tools/power/cpupower/lib/cpufreq.c index f27ee6d4b000..8dda3db2dff0 100644 --- a/tools/power/cpupower/lib/cpufreq.c +++ b/tools/power/cpupower/lib/cpufreq.c @@ -127,12 +127,14 @@ static unsigned long sysfs_cpufreq_get_one_value(unsigned int cpu, enum cpufreq_string { SCALING_DRIVER, SCALING_GOVERNOR, + ENERGY_PERFORMANCE_PREFERENCE, MAX_CPUFREQ_STRING_FILES }; static const char *cpufreq_string_files[MAX_CPUFREQ_STRING_FILES] = { [SCALING_DRIVER] = "scaling_driver", [SCALING_GOVERNOR] = "scaling_governor", + [ENERGY_PERFORMANCE_PREFERENCE] = "energy_performance_preference", }; @@ -207,6 +209,18 @@ unsigned long cpufreq_get_transition_latency(unsigned int cpu) return sysfs_cpufreq_get_one_value(cpu, CPUINFO_LATENCY); } +char *cpufreq_get_energy_performance_preference(unsigned int cpu) +{ + return sysfs_cpufreq_get_one_string(cpu, ENERGY_PERFORMANCE_PREFERENCE); +} + +void cpufreq_put_energy_performance_preference(char *ptr) +{ + if (!ptr) + return; + free(ptr); +} + int cpufreq_get_hardware_limits(unsigned int cpu, unsigned long *min, unsigned long *max) diff --git a/tools/power/cpupower/lib/cpufreq.h b/tools/power/cpupower/lib/cpufreq.h index 2f3c84035806..bfc617311ebd 100644 --- a/tools/power/cpupower/lib/cpufreq.h +++ b/tools/power/cpupower/lib/cpufreq.h @@ -68,6 +68,14 @@ unsigned long cpufreq_get_freq_hardware(unsigned int cpu); unsigned long cpufreq_get_transition_latency(unsigned int cpu); +/* determine energy performance preference + * + * returns NULL on failure, else the string that represents the energy performance + * preference requested. + */ +char *cpufreq_get_energy_performance_preference(unsigned int cpu); +void cpufreq_put_energy_performance_preference(char *ptr); + /* determine hardware CPU frequency limits * * These may be limited further by thermal, energy or other diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index 3df28e45be42..eb9cc0f10634 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -422,6 +422,23 @@ static int get_freq_stats(unsigned int cpu, unsigned int human) return 0; } +/* --epp / -z */ + +static int get_epp(unsigned int cpu, bool interactive) +{ + char *epp; + + epp = cpufreq_get_energy_performance_preference(cpu); + if (!epp) + return -EINVAL; + if (interactive) + printf(_(" energy performance preference: %s\n"), epp); + + cpufreq_put_energy_performance_preference(epp); + + return 0; +} + /* --latency / -y */ static int get_latency(unsigned int cpu, unsigned int human) @@ -461,6 +478,7 @@ static void debug_output_one(unsigned int cpu) get_related_cpus(cpu); get_affected_cpus(cpu); get_latency(cpu, 1); + get_epp(cpu, true); get_hardware_limits(cpu, 1); freqs = cpufreq_get_available_frequencies(cpu); @@ -501,6 +519,7 @@ static struct option info_opts[] = { {"human", no_argument, NULL, 'm'}, {"no-rounding", no_argument, NULL, 'n'}, {"performance", no_argument, NULL, 'c'}, + {"epp", no_argument, NULL, 'z'}, { }, }; @@ -514,7 +533,7 @@ int cmd_freq_info(int argc, char **argv) int output_param = 0; do { - ret = getopt_long(argc, argv, "oefwldpgrasmybnc", info_opts, + ret = getopt_long(argc, argv, "oefwldpgrasmybncz", info_opts, NULL); switch (ret) { case '?': @@ -538,6 +557,7 @@ int cmd_freq_info(int argc, char **argv) case 's': case 'y': case 'c': + case 'z': if (output_param) { output_param = -1; cont = 0; @@ -647,6 +667,9 @@ int cmd_freq_info(int argc, char **argv) case 'c': ret = get_perf_cap(cpu); break; + case 'z': + ret = get_epp(cpu, true); + break; } if (ret) return ret; From acf71265e4c0289e23ee1b66fc0977478edea9a5 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:55 -0600 Subject: [PATCH 60/70] cpupower: Don't fetch maximum latency when EPP is enabled When EPP has been enabled the hardware will autonomously change frequencies on it's own and thus there is no latency with changing from the kernel. Avoid doing the maximum latency check when EPP is found. This will apply to both amd-pstate and intel-pstate drivers. Link: https://lore.kernel.org/r/20241218191144.3440854-7-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index eb9cc0f10634..fc750e127404 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -445,6 +445,9 @@ static int get_latency(unsigned int cpu, unsigned int human) { unsigned long latency = cpufreq_get_transition_latency(cpu); + if (!get_epp(cpu, false)) + return -EINVAL; + printf(_(" maximum transition latency: ")); if (!latency || latency == UINT_MAX) { printf(_(" Cannot determine or is not supported.\n")); From 8395d43949790f1671621975730a07c264ef3e6f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:56 -0600 Subject: [PATCH 61/70] cpupower: Adjust whitespace for amd-pstate specific prints The amd-pstate section is grouped under boost, which isn't appropriate. Adjust the indentation so that it is it's own section. Link: https://lore.kernel.org/r/20241218191144.3440854-8-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 090fdd6d1551..795562e879de 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -219,7 +219,9 @@ void amd_pstate_boost_init(unsigned int cpu, int *support, int *active) void amd_pstate_show_perf_and_freq(unsigned int cpu, int no_rounding) { - printf(_(" AMD PSTATE Highest Performance: %lu. Maximum Frequency: "), + + printf(_(" amd-pstate limits:\n")); + printf(_(" Highest Performance: %lu. Maximum Frequency: "), amd_pstate_get_data(cpu, AMD_PSTATE_HIGHEST_PERF)); /* * If boost isn't active, the cpuinfo_max doesn't indicate real max @@ -228,19 +230,19 @@ void amd_pstate_show_perf_and_freq(unsigned int cpu, int no_rounding) print_speed(amd_pstate_get_data(cpu, AMD_PSTATE_MAX_FREQ), no_rounding); printf(".\n"); - printf(_(" AMD PSTATE Nominal Performance: %lu. Nominal Frequency: "), + printf(_(" Nominal Performance: %lu. Nominal Frequency: "), acpi_cppc_get_data(cpu, NOMINAL_PERF)); print_speed(acpi_cppc_get_data(cpu, NOMINAL_FREQ) * 1000, no_rounding); printf(".\n"); - printf(_(" AMD PSTATE Lowest Non-linear Performance: %lu. Lowest Non-linear Frequency: "), + printf(_(" Lowest Non-linear Performance: %lu. Lowest Non-linear Frequency: "), acpi_cppc_get_data(cpu, LOWEST_NONLINEAR_PERF)); print_speed(amd_pstate_get_data(cpu, AMD_PSTATE_LOWEST_NONLINEAR_FREQ), no_rounding); printf(".\n"); - printf(_(" AMD PSTATE Lowest Performance: %lu. Lowest Frequency: "), + printf(_(" Lowest Performance: %lu. Lowest Frequency: "), acpi_cppc_get_data(cpu, LOWEST_PERF)); print_speed(acpi_cppc_get_data(cpu, LOWEST_FREQ) * 1000, no_rounding); printf(".\n"); From 6de02569a2bb678db04236fdf29814c0c27f5121 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Wed, 18 Dec 2024 20:26:02 -0500 Subject: [PATCH 62/70] pm: cpupower: Add install and uninstall options to bindings makefile Installs the .so and .py files generated by SWIG to system's site packages directory. This allows the Python bindings to be used system wide. This commit also includes documentation on setting up and installing the Python bindings. Link: https://lore.kernel.org/r/20241219012606.38963-1-jwyatt@redhat.com Signed-off-by: "John B. Wyatt IV" Signed-off-by: "John B. Wyatt IV" Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/Makefile | 10 ++++++++ tools/power/cpupower/bindings/python/README | 25 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/tools/power/cpupower/bindings/python/Makefile b/tools/power/cpupower/bindings/python/Makefile index e1ebb1d60cd4..741f21477432 100644 --- a/tools/power/cpupower/bindings/python/Makefile +++ b/tools/power/cpupower/bindings/python/Makefile @@ -11,6 +11,7 @@ HAVE_PYCONFIG := $(shell if which python-config >/dev/null 2>&1; then echo 1; el LIB_DIR := ../../lib PY_INCLUDE = $(firstword $(shell python-config --includes)) OBJECTS_LIB = $(wildcard $(LIB_DIR)/*.o) +INSTALL_DIR = $(shell python3 -c "import site; print(site.getsitepackages()[0])") all: _raw_pylibcpupower.so @@ -28,6 +29,15 @@ else ifeq ($(HAVE_PYCONFIG),0) endif swig -python raw_pylibcpupower.swg +# Only installs the Python bindings +install: _raw_pylibcpupower.so + install -D _raw_pylibcpupower.so $(INSTALL_DIR)/_raw_pylibcpupower.so + install -D raw_pylibcpupower.py $(INSTALL_DIR)/raw_pylibcpupower.py + +uninstall: + rm -f $(INSTALL_DIR)/_raw_pylibcpupower.so + rm -f $(INSTALL_DIR)/raw_pylibcpupower.py + # Will only clean the bindings folder; will not clean the actual cpupower folder clean: rm -f raw_pylibcpupower.py raw_pylibcpupower_wrap.c raw_pylibcpupower_wrap.o _raw_pylibcpupower.so diff --git a/tools/power/cpupower/bindings/python/README b/tools/power/cpupower/bindings/python/README index 0a4bb2581e8a..952e2e02fd32 100644 --- a/tools/power/cpupower/bindings/python/README +++ b/tools/power/cpupower/bindings/python/README @@ -48,6 +48,31 @@ To run the test script: $ python test_raw_pylibcpupower.py +developing/using the bindings directly +-------------------------------------- + +You need to add the Python bindings directory to your $PYTHONPATH. + +You would set the path in the Bash terminal or in the Bash profile: + +PYTHONPATH=~/linux/tools/power/cpupower/bindings/python:$PYTHONPATH + +This allows you to set a specific repo of the bindings to use. + + +installing/uninstalling +----------------------- + +Python uses a system specific site-packages folder to look up modules to import +by default. You do not need to install cpupower to use the SWIG bindings. + +You can install and uninstall the bindings to the site-packages with: + +sudo make install + +sudo make uninstall + + credits ------- From 8d097444982d7b23a5396169dc9d2923a59b5a79 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Tue, 24 Dec 2024 01:23:28 -0500 Subject: [PATCH 63/70] pm: cpupower: Add header changes for cpufreq.h to SWIG bindings "cpupower: Add support for showing energy performance preference" added two new functions to cpufreq.h. This patch adds them to the bindings. Link: https://lore.kernel.org/linux-pm/8dc731c3-6586-4265-ae6a-d93ed219a963@linuxfoundation.org/T/#t Tested by compiling both libcpupower and the headers; running the test script that does not use the functions as a basic sanity test. Link: https://lore.kernel.org/r/20241224062329.39606-1-jwyatt@redhat.com Signed-off-by: "John B. Wyatt IV" Signed-off-by: "John B. Wyatt IV" Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/raw_pylibcpupower.swg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/power/cpupower/bindings/python/raw_pylibcpupower.swg b/tools/power/cpupower/bindings/python/raw_pylibcpupower.swg index 96556d87a745..a8226c79cfea 100644 --- a/tools/power/cpupower/bindings/python/raw_pylibcpupower.swg +++ b/tools/power/cpupower/bindings/python/raw_pylibcpupower.swg @@ -134,6 +134,9 @@ void cpufreq_put_stats(struct cpufreq_stats *stats); unsigned long cpufreq_get_transitions(unsigned int cpu); +char *cpufreq_get_energy_performance_preference(unsigned int cpu); +void cpufreq_put_energy_performance_preference(char *ptr); + int cpufreq_set_policy(unsigned int cpu, struct cpufreq_policy *policy); int cpufreq_modify_policy_min(unsigned int cpu, unsigned long min_freq); From fd604ae6c261c5a56bb977ae99f875bbd7264a3f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 2 Jan 2025 08:12:04 -0600 Subject: [PATCH 64/70] cpufreq/amd-pstate: Fix prefcore rankings commit 50a062a76200 ("cpufreq/amd-pstate: Store the boost numerator as highest perf again") updated the value stored for highest perf to no longer store the highest perf value but instead the boost numerator. This is a fixed value for systems with preferred cores and not appropriate for use ITMT rankings. Update the value used for ITMT rankings to be the preferred core ranking. Reported-and-tested-by: Sebastian Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219640 Fixes: 50a062a76200 ("cpufreq/amd-pstate: Store the boost numerator as highest perf again") Reviewed-by: Dhananjay Ugwekar Link: https://lore.kernel.org/r/20250102141204.3413202-1-superm1@kernel.org Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d7b1de97727a..2330903a8b45 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -815,7 +815,7 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) * sched_set_itmt_support(true) has been called and it is valid to * update them at any time after it has been called. */ - sched_set_itmt_core_prio((int)READ_ONCE(cpudata->highest_perf), cpudata->cpu); + sched_set_itmt_core_prio((int)READ_ONCE(cpudata->prefcore_ranking), cpudata->cpu); schedule_work(&sched_prefcore_work); } From 857a61c2ce74e30fc3b10bc89d68ddd8d05b188c Mon Sep 17 00:00:00 2001 From: Naresh Solanki Date: Fri, 20 Dec 2024 01:48:32 +0530 Subject: [PATCH 65/70] cpufreq/amd-pstate: Refactor max frequency calculation The previous approach introduced roundoff errors during division when calculating the boost ratio. This, in turn, affected the maximum frequency calculation, often resulting in reporting lower frequency values. For example, on the Glinda SoC based board with the following parameters: max_perf = 208 nominal_perf = 100 nominal_freq = 2600 MHz The Linux kernel previously calculated the frequency as: freq = ((max_perf * 1024 / nominal_perf) * nominal_freq) / 1024 freq = 5405 MHz // Integer arithmetic. With the updated formula: freq = (max_perf * nominal_freq) / nominal_perf freq = 5408 MHz This change ensures more accurate frequency calculations by eliminating unnecessary shifts and divisions, thereby improving precision. Signed-off-by: Naresh Solanki [ML: trim the changelog from commit message] Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20241219201833.2750998-1-naresh.solanki@9elements.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2330903a8b45..dd9b8d6993d6 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -908,9 +908,8 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) { int ret; u32 min_freq, max_freq; - u32 nominal_perf, nominal_freq; + u32 highest_perf, nominal_perf, nominal_freq; u32 lowest_nonlinear_perf, lowest_nonlinear_freq; - u32 boost_ratio, lowest_nonlinear_ratio; struct cppc_perf_caps cppc_perf; ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); @@ -927,16 +926,12 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) else nominal_freq = cppc_perf.nominal_freq; + highest_perf = READ_ONCE(cpudata->highest_perf); nominal_perf = READ_ONCE(cpudata->nominal_perf); - - boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); - max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT); + max_freq = div_u64((u64)highest_perf * nominal_freq, nominal_perf); lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); - lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, - nominal_perf); - lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT); - + lowest_nonlinear_freq = div_u64((u64)nominal_freq * lowest_nonlinear_perf, nominal_perf); WRITE_ONCE(cpudata->min_freq, min_freq * 1000); WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq * 1000); WRITE_ONCE(cpudata->nominal_freq, nominal_freq * 1000); From bff7165da9c1961e44b2fd6c8f80c8704184ae11 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Fri, 3 Jan 2025 09:21:22 +0530 Subject: [PATCH 66/70] ACPI: tables: Use string choice helpers Use str_enabled_disabled string helpers for better readability and to fix cocci warning. Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202501010947.0e3GVHNa-lkp@intel.com/ Reported-by: Julia Lawall Closes: https://lore.kernel.org/r/202501010947.0e3GVHNa-lkp@intel.com/ Signed-off-by: Sunil V L Link: https://patch.msgid.link/20250103035122.50315-1-sunilvl@ventanamicro.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/tables.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 9e1b01c35070..2295abbecd14 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -56,7 +56,7 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) (struct acpi_madt_local_apic *)header; pr_debug("LAPIC (acpi_id[0x%02x] lapic_id[0x%02x] %s)\n", p->processor_id, p->id, - (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); + str_enabled_disabled(p->lapic_flags & ACPI_MADT_ENABLED)); } break; @@ -66,7 +66,7 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) (struct acpi_madt_local_x2apic *)header; pr_debug("X2APIC (apic_id[0x%02x] uid[0x%02x] %s)\n", p->local_apic_id, p->uid, - (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); + str_enabled_disabled(p->lapic_flags & ACPI_MADT_ENABLED)); } break; @@ -160,7 +160,7 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) (struct acpi_madt_local_sapic *)header; pr_debug("LSAPIC (acpi_id[0x%02x] lsapic_id[0x%02x] lsapic_eid[0x%02x] %s)\n", p->processor_id, p->id, p->eid, - (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); + str_enabled_disabled(p->lapic_flags & ACPI_MADT_ENABLED)); } break; @@ -183,7 +183,7 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) pr_debug("GICC (acpi_id[0x%04x] address[%llx] MPIDR[0x%llx] %s)\n", p->uid, p->base_address, p->arm_mpidr, - (p->flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); + str_enabled_disabled(p->flags & ACPI_MADT_ENABLED)); } break; @@ -218,7 +218,7 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) pr_debug("CORE PIC (processor_id[0x%02x] core_id[0x%02x] %s)\n", p->processor_id, p->core_id, - (p->flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); + str_enabled_disabled(p->flags & ACPI_MADT_ENABLED)); } break; @@ -228,7 +228,7 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) pr_debug("RISC-V INTC (acpi_uid[0x%04x] hart_id[0x%llx] %s)\n", p->uid, p->hart_id, - (p->flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); + str_enabled_disabled(p->flags & ACPI_MADT_ENABLED)); } break; From f3253b23535fda2436b2d5a3172260a75ca64091 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Sat, 11 Jan 2025 02:35:20 +0900 Subject: [PATCH 67/70] PM / devfreq: exynos: remove unused function parameter exynos_bus_parse_of() still declares a parameter struct device_node that is not used yet. This parameter is unnecessary and should be removed. Signed-off-by: Jeongjun Park --- drivers/devfreq/exynos-bus.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c index 7d06c476d8e9..b9ea7ad2e51b 100644 --- a/drivers/devfreq/exynos-bus.c +++ b/drivers/devfreq/exynos-bus.c @@ -236,8 +236,7 @@ err_regulator: return ret; } -static int exynos_bus_parse_of(struct device_node *np, - struct exynos_bus *bus) +static int exynos_bus_parse_of(struct exynos_bus *bus) { struct device *dev = bus->dev; struct dev_pm_opp *opp; @@ -408,7 +407,7 @@ static int exynos_bus_probe(struct platform_device *pdev) } /* Parse the device-tree to get the resource information */ - ret = exynos_bus_parse_of(np, bus); + ret = exynos_bus_parse_of(bus); if (ret < 0) goto err_reg; From 56098a4505e7ba54c7942c82eeca8de522c8c2ee Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Jan 2025 13:46:43 +0100 Subject: [PATCH 68/70] cpuidle: menu: Update documentation after previous changes After commit 38f83090f515 ("cpuidle: menu: Remove iowait influence") and other previous changes, the description of the menu governor in the documentation does not match the code any more, so update it as appropriate. Fixes: 38f83090f515 ("cpuidle: menu: Remove iowait influence") Fixes: 5484e31bbbff ("cpuidle: menu: Skip tick_nohz_get_sleep_length() call in some cases") Reported-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/12589281.O9o76ZdvQC@rjwysocki.net --- Documentation/admin-guide/pm/cpuidle.rst | 70 ++++++++++-------------- 1 file changed, 30 insertions(+), 40 deletions(-) diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 19754beb5a4e..eb58d7a5affd 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -269,27 +269,7 @@ Namely, when invoked to select an idle state for a CPU (i.e. an idle state that the CPU will ask the processor hardware to enter), it attempts to predict the idle duration and uses the predicted value for idle state selection. -It first obtains the time until the closest timer event with the assumption -that the scheduler tick will be stopped. That time, referred to as the *sleep -length* in what follows, is the upper bound on the time before the next CPU -wakeup. It is used to determine the sleep length range, which in turn is needed -to get the sleep length correction factor. - -The ``menu`` governor maintains two arrays of sleep length correction factors. -One of them is used when tasks previously running on the given CPU are waiting -for some I/O operations to complete and the other one is used when that is not -the case. Each array contains several correction factor values that correspond -to different sleep length ranges organized so that each range represented in the -array is approximately 10 times wider than the previous one. - -The correction factor for the given sleep length range (determined before -selecting the idle state for the CPU) is updated after the CPU has been woken -up and the closer the sleep length is to the observed idle duration, the closer -to 1 the correction factor becomes (it must fall between 0 and 1 inclusive). -The sleep length is multiplied by the correction factor for the range that it -falls into to obtain the first approximation of the predicted idle duration. - -Next, the governor uses a simple pattern recognition algorithm to refine its +It first uses a simple pattern recognition algorithm to obtain a preliminary idle duration prediction. Namely, it saves the last 8 observed idle duration values and, when predicting the idle duration next time, it computes the average and variance of them. If the variance is small (smaller than 400 square @@ -301,29 +281,39 @@ Again, if the variance of them is small (in the above sense), the average is taken as the "typical interval" value and so on, until either the "typical interval" is determined or too many data points are disregarded, in which case the "typical interval" is assumed to equal "infinity" (the maximum unsigned -integer value). The "typical interval" computed this way is compared with the -sleep length multiplied by the correction factor and the minimum of the two is -taken as the predicted idle duration. +integer value). -Then, the governor computes an extra latency limit to help "interactive" -workloads. It uses the observation that if the exit latency of the selected -idle state is comparable with the predicted idle duration, the total time spent -in that state probably will be very short and the amount of energy to save by -entering it will be relatively small, so likely it is better to avoid the -overhead related to entering that state and exiting it. Thus selecting a -shallower state is likely to be a better option then. The first approximation -of the extra latency limit is the predicted idle duration itself which -additionally is divided by a value depending on the number of tasks that -previously ran on the given CPU and now they are waiting for I/O operations to -complete. The result of that division is compared with the latency limit coming -from the power management quality of service, or `PM QoS `_, -framework and the minimum of the two is taken as the limit for the idle states' -exit latency. +If the "typical interval" computed this way is long enough, the governor obtains +the time until the closest timer event with the assumption that the scheduler +tick will be stopped. That time, referred to as the *sleep length* in what follows, +is the upper bound on the time before the next CPU wakeup. It is used to determine +the sleep length range, which in turn is needed to get the sleep length correction +factor. + +The ``menu`` governor maintains an array containing several correction factor +values that correspond to different sleep length ranges organized so that each +range represented in the array is approximately 10 times wider than the previous +one. + +The correction factor for the given sleep length range (determined before +selecting the idle state for the CPU) is updated after the CPU has been woken +up and the closer the sleep length is to the observed idle duration, the closer +to 1 the correction factor becomes (it must fall between 0 and 1 inclusive). +The sleep length is multiplied by the correction factor for the range that it +falls into to obtain an approximation of the predicted idle duration that is +compared to the "typical interval" determined previously and the minimum of +the two is taken as the idle duration prediction. + +If the "typical interval" value is small, which means that the CPU is likely +to be woken up soon enough, the sleep length computation is skipped as it may +be costly and the idle duration is simply predicted to equal the "typical +interval" value. Now, the governor is ready to walk the list of idle states and choose one of them. For this purpose, it compares the target residency of each state with -the predicted idle duration and the exit latency of it with the computed latency -limit. It selects the state with the target residency closest to the predicted +the predicted idle duration and the exit latency of it with the with the latency +limit coming from the power management quality of service, or `PM QoS `_, +framework. It selects the state with the target residency closest to the predicted idle duration, but still below it, and exit latency that does not exceed the limit. From 5a597a19a2148d1c5cd987907a60c042ab0f62d5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Jan 2025 13:48:10 +0100 Subject: [PATCH 69/70] cpuidle: teo: Update documentation after previous changes After previous changes, the description of the teo governor in the documentation comment does not match the code any more, so update it as appropriate. Fixes: 449914398083 ("cpuidle: teo: Remove recent intercepts metric") Fixes: 2662342079f5 ("cpuidle: teo: Gather statistics regarding whether or not to stop the tick") Fixes: 6da8f9ba5a87 ("cpuidle: teo: Skip tick_nohz_get_sleep_length() call in some cases") Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/6120335.lOV4Wx5bFT@rjwysocki.net [ rjw: Corrected 3 typos found by Christian ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 91 +++++++++++++++++---------------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index f2992f92d8db..173ddcac540a 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -10,25 +10,27 @@ * DOC: teo-description * * The idea of this governor is based on the observation that on many systems - * timer events are two or more orders of magnitude more frequent than any - * other interrupts, so they are likely to be the most significant cause of CPU - * wakeups from idle states. Moreover, information about what happened in the - * (relatively recent) past can be used to estimate whether or not the deepest - * idle state with target residency within the (known) time till the closest - * timer event, referred to as the sleep length, is likely to be suitable for - * the upcoming CPU idle period and, if not, then which of the shallower idle - * states to choose instead of it. + * timer interrupts are two or more orders of magnitude more frequent than any + * other interrupt types, so they are likely to dominate CPU wakeup patterns. + * Moreover, in principle, the time when the next timer event is going to occur + * can be determined at the idle state selection time, although doing that may + * be costly, so it can be regarded as the most reliable source of information + * for idle state selection. * - * Of course, non-timer wakeup sources are more important in some use cases - * which can be covered by taking a few most recent idle time intervals of the - * CPU into account. However, even in that context it is not necessary to - * consider idle duration values greater than the sleep length, because the - * closest timer will ultimately wake up the CPU anyway unless it is woken up - * earlier. + * Of course, non-timer wakeup sources are more important in some use cases, + * but even then it is generally unnecessary to consider idle duration values + * greater than the time time till the next timer event, referred as the sleep + * length in what follows, because the closest timer will ultimately wake up the + * CPU anyway unless it is woken up earlier. * - * Thus this governor estimates whether or not the prospective idle duration of - * a CPU is likely to be significantly shorter than the sleep length and selects - * an idle state for it accordingly. + * However, since obtaining the sleep length may be costly, the governor first + * checks if it can select a shallow idle state using wakeup pattern information + * from recent times, in which case it can do without knowing the sleep length + * at all. For this purpose, it counts CPU wakeup events and looks for an idle + * state whose target residency has not exceeded the idle duration (measured + * after wakeup) in the majority of relevant recent cases. If the target + * residency of that state is small enough, it may be used right away and the + * sleep length need not be determined. * * The computations carried out by this governor are based on using bins whose * boundaries are aligned with the target residency parameter values of the CPU @@ -39,7 +41,11 @@ * idle state 2, the third bin spans from the target residency of idle state 2 * up to, but not including, the target residency of idle state 3 and so on. * The last bin spans from the target residency of the deepest idle state - * supplied by the driver to infinity. + * supplied by the driver to the scheduler tick period length or to infinity if + * the tick period length is less than the target residency of that state. In + * the latter case, the governor also counts events with the measured idle + * duration between the tick period length and the target residency of the + * deepest idle state. * * Two metrics called "hits" and "intercepts" are associated with each bin. * They are updated every time before selecting an idle state for the given CPU @@ -49,47 +55,46 @@ * sleep length and the idle duration measured after CPU wakeup fall into the * same bin (that is, the CPU appears to wake up "on time" relative to the sleep * length). In turn, the "intercepts" metric reflects the relative frequency of - * situations in which the measured idle duration is so much shorter than the - * sleep length that the bin it falls into corresponds to an idle state - * shallower than the one whose bin is fallen into by the sleep length (these - * situations are referred to as "intercepts" below). + * non-timer wakeup events for which the measured idle duration falls into a bin + * that corresponds to an idle state shallower than the one whose bin is fallen + * into by the sleep length (these events are also referred to as "intercepts" + * below). * * In order to select an idle state for a CPU, the governor takes the following * steps (modulo the possible latency constraint that must be taken into account * too): * - * 1. Find the deepest CPU idle state whose target residency does not exceed - * the current sleep length (the candidate idle state) and compute 2 sums as - * follows: + * 1. Find the deepest enabled CPU idle state (the candidate idle state) and + * compute 2 sums as follows: * - * - The sum of the "hits" and "intercepts" metrics for the candidate state - * and all of the deeper idle states (it represents the cases in which the - * CPU was idle long enough to avoid being intercepted if the sleep length - * had been equal to the current one). + * - The sum of the "hits" metric for all of the idle states shallower than + * the candidate one (it represents the cases in which the CPU was likely + * woken up by a timer). * - * - The sum of the "intercepts" metrics for all of the idle states shallower - * than the candidate one (it represents the cases in which the CPU was not - * idle long enough to avoid being intercepted if the sleep length had been - * equal to the current one). + * - The sum of the "intercepts" metric for all of the idle states shallower + * than the candidate one (it represents the cases in which the CPU was + * likely woken up by a non-timer wakeup source). * - * 2. If the second sum is greater than the first one the CPU is likely to wake - * up early, so look for an alternative idle state to select. + * 2. If the second sum computed in step 1 is greater than a half of the sum of + * both metrics for the candidate state bin and all subsequent bins(if any), + * a shallower idle state is likely to be more suitable, so look for it. * - * - Traverse the idle states shallower than the candidate one in the + * - Traverse the enabled idle states shallower than the candidate one in the * descending order. * * - For each of them compute the sum of the "intercepts" metrics over all * of the idle states between it and the candidate one (including the * former and excluding the latter). * - * - If each of these sums that needs to be taken into account (because the - * check related to it has indicated that the CPU is likely to wake up - * early) is greater than a half of the corresponding sum computed in step - * 1 (which means that the target residency of the state in question had - * not exceeded the idle duration in over a half of the relevant cases), - * select the given idle state instead of the candidate one. + * - If this sum is greater than a half of the second sum computed in step 1, + * use the given idle state as the new candidate one. * - * 3. By default, select the candidate state. + * 3. If the current candidate state is state 0 or its target residency is short + * enough, return it and prevent the scheduler tick from being stopped. + * + * 4. Obtain the sleep length value and check if it is below the target + * residency of the current candidate state, in which case a new shallower + * candidate state needs to be found, so look for it. */ #include From 14578923e8c251091d2bb8a2756cde3b662ac316 Mon Sep 17 00:00:00 2001 From: Chris Bainbridge Date: Sat, 11 Jan 2025 18:59:45 +0000 Subject: [PATCH 70/70] ACPI: video: Fix random crashes due to bad kfree() Commit c6a837088bed ("drm/amd/display: Fetch the EDID from _DDC if available for eDP") added function dm_helpers_probe_acpi_edid(), which fetches the EDID from the BIOS by calling acpi_video_get_edid(). acpi_video_get_edid() returns a pointer to the EDID, but this pointer does not originate from kmalloc() - it is actually the internal "pointer" field from an acpi_buffer struct (which did come from kmalloc()). dm_helpers_probe_acpi_edid() then attempts to kfree() the EDID pointer, resulting in memory corruption which leads to random, intermittent crashes (e.g. 4% of boots will fail with some Oops). Fix this by allocating a new array (which can be safely freed) for the EDID data, and correctly freeing the acpi_buffer pointer. The only other caller of acpi_video_get_edid() is nouveau_acpi_edid(): remove the extraneous kmemdup() here as the EDID data is now copied in acpi_video_device_EDID(). Signed-off-by: Chris Bainbridge Fixes: c6a837088bed ("drm/amd/display: Fetch the EDID from _DDC if available for eDP") Reviewed-by: Hans de Goede Reviewed-by: Mario Limonciello Reported-by: Borislav Petkov (AMD) Tested-by: Borislav Petkov (AMD) Closes: https://lore.kernel.org/amd-gfx/20250110175252.GBZ4FedNKqmBRaY4T3@fat_crate.local/T/#m324a23eb4c4c32fa7e89e31f8ba96c781e496fb1 Link: https://patch.msgid.link/Z4K_oQL7eA9Owkbs@debian.local [ rjw: Changed function description comment into a kerneldoc one ] [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_video.c | 49 ++++++++++++++------------ drivers/gpu/drm/nouveau/nouveau_acpi.c | 2 +- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 8274a17872ed..a972831dbd66 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -610,16 +610,28 @@ acpi_video_device_lcd_get_level_current(struct acpi_video_device *device, return 0; } +/** + * acpi_video_device_EDID() - Get EDID from ACPI _DDC + * @device: video output device (LCD, CRT, ..) + * @edid: address for returned EDID pointer + * @length: _DDC length to request (must be a multiple of 128) + * + * Get EDID from ACPI _DDC. On success, a pointer to the EDID data is written + * to the @edid address, and the length of the EDID is returned. The caller is + * responsible for freeing the edid pointer. + * + * Return the length of EDID (positive value) on success or error (negative + * value). + */ static int -acpi_video_device_EDID(struct acpi_video_device *device, - union acpi_object **edid, int length) +acpi_video_device_EDID(struct acpi_video_device *device, void **edid, int length) { - int status; + acpi_status status; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; union acpi_object arg0 = { ACPI_TYPE_INTEGER }; struct acpi_object_list args = { 1, &arg0 }; - + int ret; *edid = NULL; @@ -636,16 +648,17 @@ acpi_video_device_EDID(struct acpi_video_device *device, obj = buffer.pointer; - if (obj && obj->type == ACPI_TYPE_BUFFER) - *edid = obj; - else { + if (obj && obj->type == ACPI_TYPE_BUFFER) { + *edid = kmemdup(obj->buffer.pointer, obj->buffer.length, GFP_KERNEL); + ret = *edid ? obj->buffer.length : -ENOMEM; + } else { acpi_handle_debug(device->dev->handle, "Invalid _DDC data for length %d\n", length); - status = -EFAULT; - kfree(obj); + ret = -EFAULT; } - return status; + kfree(obj); + return ret; } /* bus */ @@ -1435,9 +1448,7 @@ int acpi_video_get_edid(struct acpi_device *device, int type, int device_id, { struct acpi_video_bus *video; struct acpi_video_device *video_device; - union acpi_object *buffer = NULL; - acpi_status status; - int i, length; + int i, length, ret; if (!device || !acpi_driver_data(device)) return -EINVAL; @@ -1477,16 +1488,10 @@ int acpi_video_get_edid(struct acpi_device *device, int type, int device_id, } for (length = 512; length > 0; length -= 128) { - status = acpi_video_device_EDID(video_device, &buffer, - length); - if (ACPI_SUCCESS(status)) - break; + ret = acpi_video_device_EDID(video_device, edid, length); + if (ret > 0) + return ret; } - if (!length) - continue; - - *edid = buffer->buffer.pointer; - return length; } return -ENODEV; diff --git a/drivers/gpu/drm/nouveau/nouveau_acpi.c b/drivers/gpu/drm/nouveau/nouveau_acpi.c index 8f0c69aad248..21b56cc7605c 100644 --- a/drivers/gpu/drm/nouveau/nouveau_acpi.c +++ b/drivers/gpu/drm/nouveau/nouveau_acpi.c @@ -384,7 +384,7 @@ nouveau_acpi_edid(struct drm_device *dev, struct drm_connector *connector) if (ret < 0) return NULL; - return kmemdup(edid, EDID_LENGTH, GFP_KERNEL); + return edid; } bool nouveau_acpi_video_backlight_use_native(void)