From d3911f1639e67fc7b12aae0efa5a540976d7443b Mon Sep 17 00:00:00 2001 From: Nikita Travkin Date: Wed, 5 Jun 2024 18:53:27 +0500 Subject: [PATCH 0001/1052] power: supply: rt5033: Bring back i2c_set_clientdata Commit 3a93da231c12 ("power: supply: rt5033: Use devm_power_supply_register() helper") reworked the driver to use devm. While at it, the i2c_set_clientdata was dropped along with the remove callback. Unfortunately other parts of the driver also rely on i2c clientdata so this causes kernel oops. Bring the call back to fix the driver. Fixes: 3a93da231c12 ("power: supply: rt5033: Use devm_power_supply_register() helper") Tested-by: Raymond Hackley Signed-off-by: Nikita Travkin Link: https://lore.kernel.org/r/20240605-rt5033-null-clientdata-v1-1-558d710eeb4d@trvn.ru Signed-off-by: Sebastian Reichel --- drivers/power/supply/rt5033_battery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/power/supply/rt5033_battery.c b/drivers/power/supply/rt5033_battery.c index 32eafe2c00af..7a27b262fb84 100644 --- a/drivers/power/supply/rt5033_battery.c +++ b/drivers/power/supply/rt5033_battery.c @@ -159,6 +159,7 @@ static int rt5033_battery_probe(struct i2c_client *client) return -EINVAL; } + i2c_set_clientdata(client, battery); psy_cfg.of_node = client->dev.of_node; psy_cfg.drv_data = battery; From 9b69b52cdde74a38c5ab4d89405b2bd384ec0155 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 20 May 2024 23:30:05 +0100 Subject: [PATCH 0002/1052] ARM: 9400/1: Remove unused struct 'mod_unwind_map' I think this has been unused since Commit b6f21d14f1ac ("ARM: 9204/2: module: Add all unwind tables when load module") Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/module.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 677f218f7e84..da488d92e7a0 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -395,11 +395,6 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, return 0; } -struct mod_unwind_map { - const Elf_Shdr *unw_sec; - const Elf_Shdr *txt_sec; -}; - static const Elf_Shdr *find_mod_section(const Elf32_Ehdr *hdr, const Elf_Shdr *sechdrs, const char *name) { From 8ede71e1202011d8bfceeab4737e6d52d88688ab Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 29 May 2024 13:07:42 +0100 Subject: [PATCH 0003/1052] ARM: 9402/1: Kconfig: Spelling s/Cortex A-/Cortex-A/ Fix a misspelling of "Cortex-A9", to make it easier to find which errata are applicable to Cortex-A9 CPU cores. Signed-off-by: Geert Uytterhoeven Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ee5115252aac..9e0749b22446 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -735,7 +735,7 @@ config ARM_ERRATA_764319 bool "ARM errata: Read to DBGPRSR and DBGOSLSR may generate Undefined instruction" depends on CPU_V7 help - This option enables the workaround for the 764319 Cortex A-9 erratum. + This option enables the workaround for the 764319 Cortex-A9 erratum. 
CP14 read accesses to the DBGPRSR and DBGOSLSR registers generate an unexpected Undefined Instruction exception when the DBGSWENABLE external pin is set to 0, even when the CP14 accesses are performed From 9ac7ba12d37b4c24f6db675ff7edb7e54f597a59 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 29 May 2024 13:09:24 +0100 Subject: [PATCH 0004/1052] ARM: 9403/1: Alpine: Spelling s/initialiing/initializing/ Fix a misspelling of "initializing". Signed-off-by: Geert Uytterhoeven Signed-off-by: Russell King (Oracle) --- arch/arm/mach-alpine/alpine_cpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-alpine/alpine_cpu_pm.c b/arch/arm/mach-alpine/alpine_cpu_pm.c index 13ae8412e9ce..b48da6f12b6c 100644 --- a/arch/arm/mach-alpine/alpine_cpu_pm.c +++ b/arch/arm/mach-alpine/alpine_cpu_pm.c @@ -29,7 +29,7 @@ int alpine_cpu_wakeup(unsigned int phys_cpu, uint32_t phys_resume_addr) /* * Set CPU resume address - * secure firmware running on boot will jump to this address - * after setting proper CPU mode, and initialiing e.g. secure + * after setting proper CPU mode, and initializing e.g. secure * regs (the same mode all CPUs are booted to - usually HYP) */ writel(phys_resume_addr, From ed0f941022515ff40473ea5335769a5dc2524a3f Mon Sep 17 00:00:00 2001 From: Yuntao Liu Date: Mon, 3 Jun 2024 16:37:50 +0100 Subject: [PATCH 0005/1052] ARM: 9404/1: arm32: enable HAVE_LD_DEAD_CODE_DATA_ELIMINATION The current arm32 architecture does not yet support the HAVE_LD_DEAD_CODE_DATA_ELIMINATION feature. arm32 is widely used in embedded scenarios, and enabling this feature would be beneficial for reducing the size of the kernel image. In order to make this work, we keep the necessary tables by annotating them with KEEP, also it requires further changes to linker script to KEEP some tables and wildcard compiler generated sections into the right place. When using ld.lld for linking, KEEP is not recognized within the OVERLAY command, and Ard proposed a concise method to solve this problem. It boots normally with defconfig, vexpress_defconfig and tinyconfig. The size comparison of zImage is as follows: defconfig vexpress_defconfig tinyconfig 5137712 5138024 424192 no dce 5032560 4997824 298384 dce 2.0% 2.7% 29.7% shrink When using smaller config file, there is a significant reduction in the size of the zImage. We also tested this patch on a commercially available single-board computer, and the comparison is as follows: a15eb_config 2161384 no dce 2092240 dce 3.2% shrink The zImage size has been reduced by approximately 3.2%, which is 70KB on 2.1M. 
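As background for the KEEP() annotations in the linker-script hunks below, here is a minimal editorial sketch of why such tables must be kept; the section name, symbols and types in it are hypothetical and are not part of this patch. With -ffunction-sections/-fdata-sections and --gc-sections, the linker discards any input section that is not reachable from a kept section, and these tables are only reached through linker-provided boundary symbols rather than direct references, so without KEEP() they would be garbage-collected.

struct board_desc {
	const char *name;
	int (*init)(void);
};

/*
 * Each entry is placed in a dedicated input section and is never
 * referenced directly from code, so --gc-sections would drop it unless
 * the linker script wraps the section in KEEP(*(.board.info.init)).
 * The "used" attribute only stops the compiler from discarding it,
 * not the linker.
 */
#define DECLARE_BOARD(n, fn) \
	static const struct board_desc __board_##n \
	__attribute__((used, section(".board.info.init"))) = { #n, fn }

/* Boundary symbols the linker script defines around the kept section. */
extern const struct board_desc __board_info_start[], __board_info_end[];

void init_all_boards(void)
{
	const struct board_desc *b;

	for (b = __board_info_start; b < __board_info_end; b++)
		b->init();
}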
Signed-off-by: Yuntao Liu Tested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Reviewed-by: Linus Walleij Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 1 + arch/arm/boot/compressed/vmlinux.lds.S | 2 +- arch/arm/include/asm/vmlinux.lds.h | 2 +- arch/arm/kernel/entry-armv.S | 3 +++ arch/arm/kernel/vmlinux-xip.lds.S | 4 ++-- arch/arm/kernel/vmlinux.lds.S | 6 +++--- drivers/firmware/efi/libstub/Makefile | 4 ++++ 7 files changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9e0749b22446..08572703d94d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -115,6 +115,7 @@ config ARM select HAVE_KERNEL_XZ select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M select HAVE_KRETPROBES if HAVE_KPROBES + select HAVE_LD_DEAD_CODE_DATA_ELIMINATION select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPTPROBES if !THUMB2_KERNEL diff --git a/arch/arm/boot/compressed/vmlinux.lds.S b/arch/arm/boot/compressed/vmlinux.lds.S index 3fcb3e62dc56..d411abd4310e 100644 --- a/arch/arm/boot/compressed/vmlinux.lds.S +++ b/arch/arm/boot/compressed/vmlinux.lds.S @@ -125,7 +125,7 @@ SECTIONS . = BSS_START; __bss_start = .; - .bss : { *(.bss) } + .bss : { *(.bss .bss.*) } _end = .; . = ALIGN(8); /* the stack must be 64-bit aligned */ diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h index 4c8632d5c432..d60f6e83a9f7 100644 --- a/arch/arm/include/asm/vmlinux.lds.h +++ b/arch/arm/include/asm/vmlinux.lds.h @@ -42,7 +42,7 @@ #define PROC_INFO \ . = ALIGN(4); \ __proc_info_begin = .; \ - *(.proc.info.init) \ + KEEP(*(.proc.info.init)) \ __proc_info_end = .; #define IDMAP_TEXT \ diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 6150a716828c..f01d23a220e6 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1065,6 +1065,7 @@ vector_addrexcptn: .globl vector_fiq .section .vectors, "ax", %progbits + .reloc .text, R_ARM_NONE, . W(b) vector_rst W(b) vector_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_swi ) @@ -1078,6 +1079,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_swi ) #ifdef CONFIG_HARDEN_BRANCH_HISTORY .section .vectors.bhb.loop8, "ax", %progbits + .reloc .text, R_ARM_NONE, . W(b) vector_rst W(b) vector_bhb_loop8_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_loop8_swi ) @@ -1090,6 +1092,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_loop8_swi ) W(b) vector_bhb_loop8_fiq .section .vectors.bhb.bpiall, "ax", %progbits + .reloc .text, R_ARM_NONE, . W(b) vector_rst W(b) vector_bhb_bpiall_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_bpiall_swi ) diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S index c16d196b5aad..5eddb75a7174 100644 --- a/arch/arm/kernel/vmlinux-xip.lds.S +++ b/arch/arm/kernel/vmlinux-xip.lds.S @@ -63,7 +63,7 @@ SECTIONS . = ALIGN(4); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; - ARM_MMU_KEEP(*(__ex_table)) + ARM_MMU_KEEP(KEEP(*(__ex_table))) __stop___ex_table = .; } @@ -83,7 +83,7 @@ SECTIONS } .init.arch.info : { __arch_info_begin = .; - *(.arch.info.init) + KEEP(*(.arch.info.init)) __arch_info_end = .; } .init.tagtable : { diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index bd9127c4b451..de373c6c2ae8 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -74,7 +74,7 @@ SECTIONS . 
= ALIGN(4); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; - ARM_MMU_KEEP(*(__ex_table)) + ARM_MMU_KEEP(KEEP(*(__ex_table))) __stop___ex_table = .; } @@ -99,7 +99,7 @@ SECTIONS } .init.arch.info : { __arch_info_begin = .; - *(.arch.info.init) + KEEP(*(.arch.info.init)) __arch_info_end = .; } .init.tagtable : { @@ -116,7 +116,7 @@ SECTIONS #endif .init.pv_table : { __pv_table_begin = .; - *(.pv_table) + KEEP(*(.pv_table)) __pv_table_end = .; } diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 06f0428a723c..5a8a9b3429b5 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -56,6 +56,10 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) # disable LTO KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) +# The .data section would be renamed to .data.efistub, therefore, remove +# `-fdata-sections` flag from KBUILD_CFLAGS_KERNEL +KBUILD_CFLAGS_KERNEL := $(filter-out -fdata-sections, $(KBUILD_CFLAGS_KERNEL)) + lib-y := efi-stub-helper.o gop.o secureboot.o tpm.o \ file.o mem.o random.o randomalloc.o pci.o \ skip_spaces.o lib-cmdline.o lib-ctype.o \ From 4e7b4ff2dcaed228cb2fb7bfe720262c98ec1bb9 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 27 Jun 2024 08:29:59 +0100 Subject: [PATCH 0006/1052] ARM: 9406/1: Fix callchain_trace() return value perf_callchain_store() returns 0 on success and -1 otherwise, so fix callchain_trace() to return the correct bool value. This gives walk_stackframe() a chance to stop walking the stack early. Fixes: 70ccc7c0667b ("ARM: 9258/1: stacktrace: Make stack walk callback consistent with generic code") Signed-off-by: Jinjie Ruan Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/perf_callchain.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 7147edbe56c6..1d230ac9d0eb 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -85,8 +85,7 @@ static bool callchain_trace(void *data, unsigned long pc) { struct perf_callchain_entry_ctx *entry = data; - perf_callchain_store(entry, pc); - return true; + return perf_callchain_store(entry, pc) == 0; } void From 2335c9cb831faba1a4efcc612886073b6f175fe4 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 27 Jun 2024 08:38:44 +0100 Subject: [PATCH 0007/1052] ARM: 9407/1: Add support for STACKLEAK gcc plugin Add the STACKLEAK gcc plugin to arm32 by adding the helper used by stackleak common code: on_thread_stack(). It initializes the stack with the poison value before returning from system calls, which improves kernel security. Additionally, this disables the plugin in EFI stub code and decompress code, which are out of scope for the protection. 
Before the test on Qemu versatilepb board: # echo STACKLEAK_ERASING > /sys/kernel/debug/provoke-crash/DIRECT lkdtm: Performing direct entry STACKLEAK_ERASING lkdtm: XFAIL: stackleak is not supported on this arch (HAVE_ARCH_STACKLEAK=n) After: # echo STACKLEAK_ERASING > /sys/kernel/debug/provoke-crash/DIRECT lkdtm: Performing direct entry STACKLEAK_ERASING lkdtm: stackleak stack usage: high offset: 80 bytes current: 280 bytes lowest: 696 bytes tracked: 696 bytes untracked: 192 bytes poisoned: 7220 bytes low offset: 4 bytes lkdtm: OK: the rest of the thread stack is properly erased Signed-off-by: Jinjie Ruan Acked-by: Ard Biesheuvel Reviewed-by: Linus Walleij Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 1 + arch/arm/boot/compressed/Makefile | 1 + arch/arm/include/asm/stacktrace.h | 7 +++++++ arch/arm/kernel/entry-common.S | 3 +++ drivers/firmware/efi/libstub/Makefile | 3 ++- 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 08572703d94d..26a01ad4087c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -86,6 +86,7 @@ config ARM select HAVE_ARCH_PFN_VALID select HAVE_ARCH_SECCOMP select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT + select HAVE_ARCH_STACKLEAK select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 6bca03c0c7f0..945b5975fce2 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -9,6 +9,7 @@ OBJS = HEAD = head.o OBJS += misc.o decompress.o +CFLAGS_decompress.o += $(DISABLE_STACKLEAK_PLUGIN) ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y) OBJS += debug.o AFLAGS_head.o += -DDEBUG diff --git a/arch/arm/include/asm/stacktrace.h b/arch/arm/include/asm/stacktrace.h index 360f0d2406bf..f80a85b091d6 100644 --- a/arch/arm/include/asm/stacktrace.h +++ b/arch/arm/include/asm/stacktrace.h @@ -26,6 +26,13 @@ struct stackframe { #endif }; +static inline bool on_thread_stack(void) +{ + unsigned long delta = current_stack_pointer ^ (unsigned long)current->stack; + + return delta < THREAD_SIZE; +} + static __always_inline void arm_get_current_stackframe(struct pt_regs *regs, struct stackframe *frame) { diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 5c31e9de7a60..f379c852dcb7 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -119,6 +119,9 @@ no_work_pending: ct_user_enter save = 0 +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + bl stackleak_erase_on_task_stack +#endif restore_user_regs fast = 0, offset = 0 ENDPROC(ret_to_user_from_irq) ENDPROC(ret_to_user) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 5a8a9b3429b5..59d3dccaa8e5 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -27,7 +27,8 @@ cflags-$(CONFIG_ARM64) += -fpie $(DISABLE_STACKLEAK_PLUGIN) \ cflags-$(CONFIG_ARM) += -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \ -DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \ -DEFI_HAVE_STRCMP -fno-builtin -fpic \ - $(call cc-option,-mno-single-pic-base) + $(call cc-option,-mno-single-pic-base) \ + $(DISABLE_STACKLEAK_PLUGIN) cflags-$(CONFIG_RISCV) += -fpic -DNO_ALTERNATIVE -mno-relax cflags-$(CONFIG_LOONGARCH) += -fpie From 657a292d679ae3a6c733ab0e939e24ae44b20faf Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 27 Jun 2024 09:22:09 +0100 Subject: [PATCH 0008/1052] ARM: 9408/1: mm: CFI: Fix some erroneous 
reset prototypes I somehow got a few cpu_nn_reset() signatures wrong in my patch. Fix it up. Closes: https://lore.kernel.org/oe-kbuild-all/202406260432.6WGV2jCk-lkp@intel.com/ Fixes: 393999fa9627 ("ARM: 9389/2: mm: Define prototypes for all per-processor calls") Reported-by: kernel test robot Reported-by: Nathan Chancellor Reviewed-by: Sami Tolvanen Signed-off-by: Linus Walleij Signed-off-by: Russell King (Oracle) --- arch/arm/mm/proc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm/mm/proc.c b/arch/arm/mm/proc.c index bdbbf65d1b36..2027845efefb 100644 --- a/arch/arm/mm/proc.c +++ b/arch/arm/mm/proc.c @@ -17,7 +17,7 @@ void cpu_arm7tdmi_proc_init(void); __ADDRESSABLE(cpu_arm7tdmi_proc_init); void cpu_arm7tdmi_proc_fin(void); __ADDRESSABLE(cpu_arm7tdmi_proc_fin); -void cpu_arm7tdmi_reset(void); +void cpu_arm7tdmi_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm7tdmi_reset); int cpu_arm7tdmi_do_idle(void); __ADDRESSABLE(cpu_arm7tdmi_do_idle); @@ -32,7 +32,7 @@ void cpu_arm720_proc_init(void); __ADDRESSABLE(cpu_arm720_proc_init); void cpu_arm720_proc_fin(void); __ADDRESSABLE(cpu_arm720_proc_fin); -void cpu_arm720_reset(void); +void cpu_arm720_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm720_reset); int cpu_arm720_do_idle(void); __ADDRESSABLE(cpu_arm720_do_idle); @@ -49,7 +49,7 @@ void cpu_arm740_proc_init(void); __ADDRESSABLE(cpu_arm740_proc_init); void cpu_arm740_proc_fin(void); __ADDRESSABLE(cpu_arm740_proc_fin); -void cpu_arm740_reset(void); +void cpu_arm740_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm740_reset); int cpu_arm740_do_idle(void); __ADDRESSABLE(cpu_arm740_do_idle); @@ -64,7 +64,7 @@ void cpu_arm9tdmi_proc_init(void); __ADDRESSABLE(cpu_arm9tdmi_proc_init); void cpu_arm9tdmi_proc_fin(void); __ADDRESSABLE(cpu_arm9tdmi_proc_fin); -void cpu_arm9tdmi_reset(void); +void cpu_arm9tdmi_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm9tdmi_reset); int cpu_arm9tdmi_do_idle(void); __ADDRESSABLE(cpu_arm9tdmi_do_idle); @@ -79,7 +79,7 @@ void cpu_arm920_proc_init(void); __ADDRESSABLE(cpu_arm920_proc_init); void cpu_arm920_proc_fin(void); __ADDRESSABLE(cpu_arm920_proc_fin); -void cpu_arm920_reset(void); +void cpu_arm920_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm920_reset); int cpu_arm920_do_idle(void); __ADDRESSABLE(cpu_arm920_do_idle); @@ -102,7 +102,7 @@ void cpu_arm922_proc_init(void); __ADDRESSABLE(cpu_arm922_proc_init); void cpu_arm922_proc_fin(void); __ADDRESSABLE(cpu_arm922_proc_fin); -void cpu_arm922_reset(void); +void cpu_arm922_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm922_reset); int cpu_arm922_do_idle(void); __ADDRESSABLE(cpu_arm922_do_idle); @@ -119,7 +119,7 @@ void cpu_arm925_proc_init(void); __ADDRESSABLE(cpu_arm925_proc_init); void cpu_arm925_proc_fin(void); __ADDRESSABLE(cpu_arm925_proc_fin); -void cpu_arm925_reset(void); +void cpu_arm925_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm925_reset); int cpu_arm925_do_idle(void); __ADDRESSABLE(cpu_arm925_do_idle); @@ -159,7 +159,7 @@ void cpu_arm940_proc_init(void); __ADDRESSABLE(cpu_arm940_proc_init); void cpu_arm940_proc_fin(void); __ADDRESSABLE(cpu_arm940_proc_fin); -void cpu_arm940_reset(void); +void cpu_arm940_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm940_reset); int cpu_arm940_do_idle(void); __ADDRESSABLE(cpu_arm940_do_idle); @@ -174,7 +174,7 @@ void cpu_arm946_proc_init(void); __ADDRESSABLE(cpu_arm946_proc_init); void cpu_arm946_proc_fin(void); 
__ADDRESSABLE(cpu_arm946_proc_fin); -void cpu_arm946_reset(void); +void cpu_arm946_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm946_reset); int cpu_arm946_do_idle(void); __ADDRESSABLE(cpu_arm946_do_idle); @@ -429,7 +429,7 @@ void cpu_v7_proc_init(void); __ADDRESSABLE(cpu_v7_proc_init); void cpu_v7_proc_fin(void); __ADDRESSABLE(cpu_v7_proc_fin); -void cpu_v7_reset(void); +void cpu_v7_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_v7_reset); int cpu_v7_do_idle(void); __ADDRESSABLE(cpu_v7_do_idle); From f666604321f1da2b9bd237ef8a1afdd47460e74b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 30 May 2024 00:47:22 +0100 Subject: [PATCH 0009/1052] USB: serial: spcp8x5: remove unused struct 'spcp8x5_usb_ctrl_arg' 'spcp8x5_usb_ctrl_arg' has been unused since the original commit 619a6f1d1423 ("USB: add usb-serial spcp8x5 driver"). Remove it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Johan Hovold --- drivers/usb/serial/spcp8x5.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c index 09a972a838ee..6b294bf8bc43 100644 --- a/drivers/usb/serial/spcp8x5.c +++ b/drivers/usb/serial/spcp8x5.c @@ -49,16 +49,6 @@ static const struct usb_device_id id_table[] = { }; MODULE_DEVICE_TABLE(usb, id_table); -struct spcp8x5_usb_ctrl_arg { - u8 type; - u8 cmd; - u8 cmd_type; - u16 value; - u16 index; - u16 length; -}; - - /* spcp8x5 spec register define */ #define MCR_CONTROL_LINE_RTS 0x02 #define MCR_CONTROL_LINE_DTR 0x01 From 9f4dc05107a6db3743e6b9ea4014cbdc3795682d Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 11 Jun 2024 10:52:54 -0700 Subject: [PATCH 0010/1052] USB: serial: add missing MODULE_DESCRIPTION() macros Since commit 1fffe7a34c89 ("script: modpost: emit a warning when the description is missing"), ARCH=x86 make allmodconfig && make W=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/usb/serial/ch341.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/usb/serial/usb_debug.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/usb/serial/mxuport.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/usb/serial/navman.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/usb/serial/qcaux.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/usb/serial/usb-serial-simple.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/usb/serial/symbolserial.o Add the missing invocations of the MODULE_DESCRIPTION() macro. 
Signed-off-by: Jeff Johnson [ johan: amend commit message with commit introducing W=1 warning; tweak some descriptions ] Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 1 + drivers/usb/serial/mxuport.c | 1 + drivers/usb/serial/navman.c | 1 + drivers/usb/serial/qcaux.c | 1 + drivers/usb/serial/symbolserial.c | 1 + drivers/usb/serial/usb-serial-simple.c | 1 + drivers/usb/serial/usb_debug.c | 1 + 7 files changed, 7 insertions(+) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 612bea504d7a..0870c6533f80 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -863,4 +863,5 @@ static struct usb_serial_driver * const serial_drivers[] = { module_usb_serial_driver(serial_drivers, id_table); +MODULE_DESCRIPTION("Winchiphead CH341 USB Serial driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/usb/serial/mxuport.c b/drivers/usb/serial/mxuport.c index 1f7bb3e4fcf2..942cb0153423 100644 --- a/drivers/usb/serial/mxuport.c +++ b/drivers/usb/serial/mxuport.c @@ -1315,4 +1315,5 @@ module_usb_serial_driver(serial_drivers, mxuport_idtable); MODULE_AUTHOR("Andrew Lunn "); MODULE_AUTHOR(""); +MODULE_DESCRIPTION("Moxa UPORT USB Serial driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/usb/serial/navman.c b/drivers/usb/serial/navman.c index 20277c52dded..82791fd67c46 100644 --- a/drivers/usb/serial/navman.c +++ b/drivers/usb/serial/navman.c @@ -112,4 +112,5 @@ static struct usb_serial_driver * const serial_drivers[] = { module_usb_serial_driver(serial_drivers, id_table); +MODULE_DESCRIPTION("Navman USB Serial driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/usb/serial/qcaux.c b/drivers/usb/serial/qcaux.c index 929ffba663f2..015bb7c5d19d 100644 --- a/drivers/usb/serial/qcaux.c +++ b/drivers/usb/serial/qcaux.c @@ -84,4 +84,5 @@ static struct usb_serial_driver * const serial_drivers[] = { }; module_usb_serial_driver(serial_drivers, id_table); +MODULE_DESCRIPTION("Qualcomm USB Auxiliary Serial Port driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/usb/serial/symbolserial.c b/drivers/usb/serial/symbolserial.c index d7f73ad6e778..9aabb087f733 100644 --- a/drivers/usb/serial/symbolserial.c +++ b/drivers/usb/serial/symbolserial.c @@ -190,4 +190,5 @@ static struct usb_serial_driver * const serial_drivers[] = { module_usb_serial_driver(serial_drivers, id_table); +MODULE_DESCRIPTION("Symbol USB barcode to serial driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/usb/serial/usb-serial-simple.c b/drivers/usb/serial/usb-serial-simple.c index 24b8772a345e..82f4f0b992aa 100644 --- a/drivers/usb/serial/usb-serial-simple.c +++ b/drivers/usb/serial/usb-serial-simple.c @@ -163,4 +163,5 @@ static const struct usb_device_id id_table[] = { MODULE_DEVICE_TABLE(usb, id_table); module_usb_serial_driver(serial_drivers, id_table); +MODULE_DESCRIPTION("USB Serial 'Simple' driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/usb/serial/usb_debug.c b/drivers/usb/serial/usb_debug.c index 6934970f180d..8188776b57d1 100644 --- a/drivers/usb/serial/usb_debug.c +++ b/drivers/usb/serial/usb_debug.c @@ -104,4 +104,5 @@ static struct usb_serial_driver * const serial_drivers[] = { }; module_usb_serial_driver(serial_drivers, id_table_combined); +MODULE_DESCRIPTION("USB Debug cable driver"); MODULE_LICENSE("GPL v2"); From 55a15b3a713a3f24360cf9d8dcfd2a3e337321d6 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Wed, 19 Jun 2024 21:42:44 +0200 Subject: [PATCH 0011/1052] USB: serial: garmin_gps: annotate struct garmin_packet with __counted_by Use the __counted_by 
compiler attribute for the data[] flexible array member to improve the results of array bound sanitizers. Reviewed-by: Nathan Chancellor Signed-off-by: Javier Carrasco Reviewed-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Johan Hovold --- drivers/usb/serial/garmin_gps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index 670e942fdaaa..57df6ad183ff 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -104,7 +104,7 @@ struct garmin_packet { int seq; /* the real size of the data array, always > 0 */ int size; - __u8 data[]; + __u8 data[] __counted_by(size); }; /* structure used to keep the current state of the driver */ From df8c0b8a03e871431587a13a6765cb4c601e1573 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Wed, 19 Jun 2024 21:42:45 +0200 Subject: [PATCH 0012/1052] USB: serial: garmin_gps: use struct_size() to allocate pkt Use the struct_size macro to calculate the size of the pkt, which includes a trailing flexible array. Suggested-by: Nathan Chancellor Signed-off-by: Javier Carrasco Reviewed-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Johan Hovold --- drivers/usb/serial/garmin_gps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index 57df6ad183ff..6d6ec7eed87c 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -267,8 +267,7 @@ static int pkt_add(struct garmin_data *garmin_data_p, /* process only packets containing data ... */ if (data_length) { - pkt = kmalloc(sizeof(struct garmin_packet)+data_length, - GFP_ATOMIC); + pkt = kmalloc(struct_size(pkt, data, data_length), GFP_ATOMIC); if (!pkt) return 0; From 833cd3e9ad8360785b6c23c82dd3856df00732d9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Wed, 3 Jul 2024 22:17:37 +0800 Subject: [PATCH 0013/1052] drm/fb-helper: Don't schedule_work() to flush frame buffer during panic() Sometimes the system [1] hangs on x86 I/O machine checks. However, the expected behavior is to reboot the system, as the machine check handler ultimately triggers a panic(), initiating a reboot in the last step. The root cause is that sometimes the panic() is blocked when drm_fb_helper_damage() invokes schedule_work() to flush the frame buffer. This occurs during the process of flushing all messages to the frame buffer driver as shown in the following call trace: Machine check occurs [2]: panic() console_flush_on_panic() console_flush_all() console_emit_next_record() con->write() vt_console_print() hide_cursor() vc->vc_sw->con_cursor() fbcon_cursor() ops->cursor() bit_cursor() soft_cursor() info->fbops->fb_imageblit() drm_fbdev_generic_defio_imageblit() drm_fb_helper_damage_area() drm_fb_helper_damage() schedule_work() // <--- blocked here ... emergency_restart() // wasn't invoked, so no reboot. During panic(), all CPUs except the panic CPU are stopped. In schedule_work(), the panic CPU requires the lock of worker_pool to queue the work on that pool, while the lock may have been taken by some other stopped CPU. So schedule_work() is blocked. Additionally, during a panic(), since there is no opportunity to execute any scheduled work, it's safe to fix this issue by skipping schedule_work() on 'oops_in_progress' in drm_fb_helper_damage(). 
[1] Enable the kernel option CONFIG_FRAMEBUFFER_CONSOLE, CONFIG_DRM_FBDEV_EMULATION, and boot with the 'console=tty0' kernel command line parameter. [2] Set 'panic_timeout' to a non-zero value before calling panic(). Acked-by: Thomas Zimmermann Reported-by: Yudong Wang Tested-by: Yudong Wang Signed-off-by: Qiuxu Zhuo Link: https://patchwork.freedesktop.org/patch/msgid/20240703141737.75378-1-qiuxu.zhuo@intel.com Signed-off-by: Maarten Lankhorst,,, --- drivers/gpu/drm/drm_fb_helper.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 117237d3528b..618b04523033 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -631,6 +631,17 @@ static void drm_fb_helper_add_damage_clip(struct drm_fb_helper *helper, u32 x, u static void drm_fb_helper_damage(struct drm_fb_helper *helper, u32 x, u32 y, u32 width, u32 height) { + /* + * This function may be invoked by panic() to flush the frame + * buffer, where all CPUs except the panic CPU are stopped. + * During the following schedule_work(), the panic CPU needs + * the worker_pool lock, which might be held by a stopped CPU, + * causing schedule_work() and panic() to block. Return early on + * oops_in_progress to prevent this blocking. + */ + if (oops_in_progress) + return; + drm_fb_helper_add_damage_clip(helper, x, y, width, height); schedule_work(&helper->damage_work); From 84f78178b6fe37b5eb8b90b5bb1239abce0b64d8 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Wed, 10 Jul 2024 10:36:13 -0500 Subject: [PATCH 0014/1052] arm64: dts: ti: k3-j784s4-evm: Assign only lanes 0 and 1 to PCIe1 Currently PCIe1 is setup to use SERDES0 lanes 0 thru 3, and USB0 is setup to use SERDES0 lane 3 as well. This overlap in lanes causes the following reset related lane splat: [ 4.846266] WARNING: CPU: 4 PID: 308 at drivers/reset/core.c:792 __reset_control_get_internal+0x128/0x160 ... [ 4.846405] Call trace: [ 4.846407] __reset_control_get_internal+0x128/0x160 [ 4.846413] __of_reset_control_get+0x4e0/0x528 [ 4.846418] of_reset_control_array_get+0xa4/0x1f8 [ 4.846423] cdns_torrent_phy_probe+0xbc8/0x1068 [phy_cadence_torrent] [ 4.846445] platform_probe+0xb4/0xe8 ... [ 4.846577] cdns-torrent-phy 5060000.serdes: phy@0: failed to get reset Let's limit the PCIe1 SERDES0 lanes to 0 and 1 to avoid overlap here. This works since PCIe1 operates in x2 mode and doesn't need 4 SERDES0 lanes. 
Fixes: 27ce26fe52d4 ("arm64: dts: ti: k3-j784s4-evm: Enable PCIe0 and PCIe1 in RC Mode") Suggested-by: Siddharth Vadapalli Signed-off-by: Andrew Halaney Reviewed-by: Siddharth Vadapalli Link: https://lore.kernel.org/r/20240710-k3-j784s4-evm-serdes0-cleanup-v1-1-03850fe33922@redhat.com Signed-off-by: Vignesh Raghavendra --- arch/arm64/boot/dts/ti/k3-j784s4-evm.dts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/ti/k3-j784s4-evm.dts b/arch/arm64/boot/dts/ti/k3-j784s4-evm.dts index 9338d987180d..e54ccf4f3795 100644 --- a/arch/arm64/boot/dts/ti/k3-j784s4-evm.dts +++ b/arch/arm64/boot/dts/ti/k3-j784s4-evm.dts @@ -1391,11 +1391,10 @@ &serdes0 { serdes0_pcie1_link: phy@0 { reg = <0>; - cdns,num-lanes = <4>; + cdns,num-lanes = <2>; #phy-cells = <0>; cdns,phy-type = ; - resets = <&serdes_wiz0 1>, <&serdes_wiz0 2>, - <&serdes_wiz0 3>, <&serdes_wiz0 4>; + resets = <&serdes_wiz0 1>, <&serdes_wiz0 2>; }; }; From cc5049007d722364bca4a4eeb619d5629733a004 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Wed, 10 Jul 2024 10:36:14 -0500 Subject: [PATCH 0015/1052] arm64: dts: ti: k3-j784s4-evm: Consolidate serdes0 references Subnodes were added to serdes0 in two different spots (due to independent development of their consumer usage). Let's go ahead and combine those into one reference for readability's sake. Signed-off-by: Andrew Halaney Reviewed-by: Siddharth Vadapalli Link: https://lore.kernel.org/r/20240710-k3-j784s4-evm-serdes0-cleanup-v1-2-03850fe33922@redhat.com Signed-off-by: Vignesh Raghavendra --- arch/arm64/boot/dts/ti/k3-j784s4-evm.dts | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/arm64/boot/dts/ti/k3-j784s4-evm.dts b/arch/arm64/boot/dts/ti/k3-j784s4-evm.dts index e54ccf4f3795..ffa38f41679d 100644 --- a/arch/arm64/boot/dts/ti/k3-j784s4-evm.dts +++ b/arch/arm64/boot/dts/ti/k3-j784s4-evm.dts @@ -1262,6 +1262,14 @@ &dss { &serdes0 { status = "okay"; + serdes0_pcie1_link: phy@0 { + reg = <0>; + cdns,num-lanes = <2>; + #phy-cells = <0>; + cdns,phy-type = ; + resets = <&serdes_wiz0 1>, <&serdes_wiz0 2>; + }; + serdes0_usb_link: phy@3 { reg = <3>; cdns,num-lanes = <1>; @@ -1386,22 +1394,6 @@ &main_mcan4 { phys = <&transceiver3>; }; -&serdes0 { - status = "okay"; - - serdes0_pcie1_link: phy@0 { - reg = <0>; - cdns,num-lanes = <2>; - #phy-cells = <0>; - cdns,phy-type = ; - resets = <&serdes_wiz0 1>, <&serdes_wiz0 2>; - }; -}; - -&serdes_wiz0 { - status = "okay"; -}; - &pcie1_rc { status = "okay"; num-lanes = <2>; From d60c429610a14560085d98fa6f4cdb43040ca8f0 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Mon, 15 Jul 2024 11:57:49 +0700 Subject: [PATCH 0016/1052] drm: panel-orientation-quirks: Add quirk for OrangePi Neo This adds a DMI orientation quirk for the OrangePi Neo Linux Gaming Handheld. 
Signed-off-by: Philip Mueller Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20240715045818.1019979-1-philm@manjaro.org --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index 3860a8ce1e2d..903f4bfea7e8 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -414,6 +414,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "ONE XPLAYER"), }, .driver_data = (void *)&lcd1600x2560_leftside_up, + }, { /* OrangePi Neo */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "OrangePi"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "NEO-01"), + }, + .driver_data = (void *)&lcd1200x1920_rightside_up, }, { /* Samsung GalaxyBook 10.6 */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), From 0c60eb0cc320fffbb8b10329d276af14f6f5e6bf Mon Sep 17 00:00:00 2001 From: Kyoungrul Kim Date: Wed, 10 Jul 2024 08:25:20 +0900 Subject: [PATCH 0017/1052] scsi: ufs: core: Check LSDBS cap when !mcq If the user sets use_mcq_mode to 0, the host will try to activate the LSDB mode unconditionally even when the LSDBS of device HCI cap is 1. This makes commands time out and causes device probing to fail. To prevent that problem, check the LSDBS cap when MCQ is not supported. Signed-off-by: Kyoungrul Kim Link: https://lore.kernel.org/r/20240709232520epcms2p8ebdb5c4fccc30a6221390566589bf122@epcms2p8 Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 16 ++++++++++++++++ include/ufs/ufshcd.h | 1 + include/ufs/ufshci.h | 1 + 3 files changed, 18 insertions(+) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index bea00e069e9a..480be64acf24 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -2416,7 +2416,17 @@ static inline int ufshcd_hba_capabilities(struct ufs_hba *hba) return err; } + /* + * The UFSHCI 3.0 specification does not define MCQ_SUPPORT and + * LSDB_SUPPORT, but [31:29] as reserved bits with reset value 0s, which + * means we can simply read values regardless of version. 
+ */ hba->mcq_sup = FIELD_GET(MASK_MCQ_SUPPORT, hba->capabilities); + /* + * 0h: legacy single doorbell support is available + * 1h: indicate that legacy single doorbell support has been removed + */ + hba->lsdb_sup = !FIELD_GET(MASK_LSDB_SUPPORT, hba->capabilities); if (!hba->mcq_sup) return 0; @@ -10494,6 +10504,12 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) } if (!is_mcq_supported(hba)) { + if (!hba->lsdb_sup) { + dev_err(hba->dev, "%s: failed to initialize (legacy doorbell mode not supported)\n", + __func__); + err = -EINVAL; + goto out_disable; + } err = scsi_add_host(host, hba->dev); if (err) { dev_err(hba->dev, "scsi_add_host failed\n"); diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index a43b14276bc3..cac0cdb9a916 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1109,6 +1109,7 @@ struct ufs_hba { bool ext_iid_sup; bool scsi_host_added; bool mcq_sup; + bool lsdb_sup; bool mcq_enabled; struct ufshcd_res_info res[RES_MAX]; void __iomem *mcq_base; diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h index 38fe97971a65..9917c7743d80 100644 --- a/include/ufs/ufshci.h +++ b/include/ufs/ufshci.h @@ -77,6 +77,7 @@ enum { MASK_OUT_OF_ORDER_DATA_DELIVERY_SUPPORT = 0x02000000, MASK_UIC_DME_TEST_MODE_SUPPORT = 0x04000000, MASK_CRYPTO_SUPPORT = 0x10000000, + MASK_LSDB_SUPPORT = 0x20000000, MASK_MCQ_SUPPORT = 0x40000000, }; From 022587d8aec3da1d1698ddae9fb8cfe35f3ad49c Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Fri, 12 Jul 2024 17:45:06 +0800 Subject: [PATCH 0018/1052] scsi: ufs: core: Bypass quick recovery if force reset is needed If force_reset is true, bypass quick recovery. This will shorten error recovery time. Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20240712094506.11284-1-peter.wang@mediatek.com Reviewed-by: Bean Huo Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 480be64acf24..6ecaa7d8fa43 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -6561,7 +6561,8 @@ static void ufshcd_err_handler(struct work_struct *work) if (ufshcd_err_handling_should_stop(hba)) goto skip_err_handling; - if (hba->dev_quirks & UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS) { + if ((hba->dev_quirks & UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS) && + !hba->force_reset) { bool ret; spin_unlock_irqrestore(hba->host->host_lock, flags); From 3911af778f208e5f49d43ce739332b91e26bc48e Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Mon, 15 Jul 2024 14:38:31 +0800 Subject: [PATCH 0019/1052] scsi: ufs: core: Fix deadlock during RTC update There is a deadlock when runtime suspend waits for the flush of RTC work, and the RTC work calls ufshcd_rpm_get_sync() to wait for runtime resume. 
Here is deadlock backtrace: kworker/0:1 D 4892.876354 10 10971 4859 0x4208060 0x8 10 0 120 670730152367 ptr f0ffff80c2e40000 0 1 0x00000001 0x000000ff 0x000000ff 0x000000ff __switch_to+0x1a8/0x2d4 __schedule+0x684/0xa98 schedule+0x48/0xc8 schedule_timeout+0x48/0x170 do_wait_for_common+0x108/0x1b0 wait_for_completion+0x44/0x60 __flush_work+0x39c/0x424 __cancel_work_sync+0xd8/0x208 cancel_delayed_work_sync+0x14/0x28 __ufshcd_wl_suspend+0x19c/0x480 ufshcd_wl_runtime_suspend+0x3c/0x1d4 scsi_runtime_suspend+0x78/0xc8 __rpm_callback+0x94/0x3e0 rpm_suspend+0x2d4/0x65c __pm_runtime_suspend+0x80/0x114 scsi_runtime_idle+0x38/0x6c rpm_idle+0x264/0x338 __pm_runtime_idle+0x80/0x110 ufshcd_rtc_work+0x128/0x1e4 process_one_work+0x26c/0x650 worker_thread+0x260/0x3d8 kthread+0x110/0x134 ret_from_fork+0x10/0x20 Skip updating RTC if RPM state is not RPM_ACTIVE. Fixes: 6bf999e0eb41 ("scsi: ufs: core: Add UFS RTC support") Cc: stable@vger.kernel.org # 6.9.x Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20240715063831.29792-1-peter.wang@mediatek.com Reviewed-by: Bean Huo Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd-priv.h | 5 +++++ drivers/ufs/core/ufshcd.c | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h index ce36154ce963..7aea8fbaeee8 100644 --- a/drivers/ufs/core/ufshcd-priv.h +++ b/drivers/ufs/core/ufshcd-priv.h @@ -316,6 +316,11 @@ static inline int ufshcd_rpm_get_sync(struct ufs_hba *hba) return pm_runtime_get_sync(&hba->ufs_device_wlun->sdev_gendev); } +static inline int ufshcd_rpm_get_if_active(struct ufs_hba *hba) +{ + return pm_runtime_get_if_active(&hba->ufs_device_wlun->sdev_gendev); +} + static inline int ufshcd_rpm_put_sync(struct ufs_hba *hba) { return pm_runtime_put_sync(&hba->ufs_device_wlun->sdev_gendev); diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 6ecaa7d8fa43..e2bc3553f7df 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8220,7 +8220,10 @@ static void ufshcd_update_rtc(struct ufs_hba *hba) */ val = ts64.tv_sec - hba->dev_info.rtc_time_baseline; - ufshcd_rpm_get_sync(hba); + /* Skip update RTC if RPM state is not RPM_ACTIVE */ + if (ufshcd_rpm_get_if_active(hba) <= 0) + return; + err = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_WRITE_ATTR, QUERY_ATTR_IDN_SECONDS_PASSED, 0, 0, &val); ufshcd_rpm_put_sync(hba); From eeb1f825b5dc68047a0556e5ae86d1467920db41 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 15 Jul 2024 15:51:33 +0200 Subject: [PATCH 0020/1052] drm/gpuvm: fix missing dependency to DRM_EXEC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 50c1a36f594b ("drm/gpuvm: track/lock/validate external/evicted objects") we started using drm_exec, but did not select DRM_EXEC in the Kconfig for DRM_GPUVM, fix this. 
Cc: Christian König Cc: Boris Brezillon Cc: Thomas Hellström Fixes: 50c1a36f594b ("drm/gpuvm: track/lock/validate external/evicted objects") Signed-off-by: Danilo Krummrich Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20240715135158.133287-1-dakr@redhat.com --- drivers/gpu/drm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index d0aa277fc3bf..d08d79bbb0f6 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -254,6 +254,7 @@ config DRM_EXEC config DRM_GPUVM tristate depends on DRM + select DRM_EXEC help GPU-VM representation providing helpers to manage a GPUs virtual address space From 69eced9eb49aa6a55c9ef5745b0826fe0b74ff0f Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 13:25:28 -0700 Subject: [PATCH 0021/1052] virtio: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/virtio/virtio_dma_buf.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Message-Id: <20240602-md-virtio_dma_buf-v1-1-ce602d47e257@quicinc.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_dma_buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/virtio/virtio_dma_buf.c b/drivers/virtio/virtio_dma_buf.c index 2521a75009c3..3034a2f605c8 100644 --- a/drivers/virtio/virtio_dma_buf.c +++ b/drivers/virtio/virtio_dma_buf.c @@ -85,5 +85,6 @@ int virtio_dma_buf_get_uuid(struct dma_buf *dma_buf, } EXPORT_SYMBOL(virtio_dma_buf_get_uuid); +MODULE_DESCRIPTION("dma-bufs for virtio exported objects"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS(DMA_BUF); From e14f480df7fbcb0c84ce9142c861701495d9b6e2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 12 Jul 2024 09:06:12 -0500 Subject: [PATCH 0022/1052] vdpa/octeon_ep: Fix error code in octep_process_mbox() Return -EINVAL for invalid signatures. Don't return success. Fixes: 8b6c724cdab8 ("virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices") Signed-off-by: Dan Carpenter Message-Id: <623e885b-1a05-479e-ab97-01bcf10bf5b8@stanley.mountain> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/octeon_ep/octep_vdpa_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c index 7fa0491bb201..11bd76ae18cf 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c +++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c @@ -140,7 +140,7 @@ static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *bu val = octep_read_sig(mbox); if ((val & 0xFFFF) != MBOX_RSP_SIG) { dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id); - return ret; + return -EINVAL; } val = octep_read_sts(mbox); From de128f3f6317b8568311d9f9eb5e0425ebc23a03 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:40 +0200 Subject: [PATCH 0023/1052] virtio_pci: push out single vq find code to vp_find_one_vq_msix() In order to be reused for admin queue setup, push out common code to setup and configure irq for one vq into a separate helper. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-2-jiri@resnulli.us> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_pci_common.c | 68 ++++++++++++++++++------------ 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 7d82facafd75..277591a6ab96 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -284,6 +284,44 @@ void vp_del_vqs(struct virtio_device *vdev) vp_dev->vqs = NULL; } +static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, + int queue_idx, + vq_callback_t *callback, + const char *name, bool ctx, + int *allocated_vectors) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtqueue *vq; + u16 msix_vec; + int err; + + if (!callback) + msix_vec = VIRTIO_MSI_NO_VECTOR; + else if (vp_dev->per_vq_vectors) + msix_vec = (*allocated_vectors)++; + else + msix_vec = VP_MSIX_VQ_VECTOR; + vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec); + if (IS_ERR(vq)) + return vq; + + if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR) + return vq; + + /* allocate per-vq irq if available and necessary */ + snprintf(vp_dev->msix_names[msix_vec], sizeof(*vp_dev->msix_names), + "%s-%s", dev_name(&vp_dev->vdev.dev), name); + err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), + vring_interrupt, 0, + vp_dev->msix_names[msix_vec], vq); + if (err) { + vp_del_vq(vq); + return ERR_PTR(err); + } + + return vq; +} + static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], @@ -292,7 +330,6 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue_info *vqi; - u16 msix_vec; int i, err, nvectors, allocated_vectors, queue_idx = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); @@ -325,36 +362,13 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, vqs[i] = NULL; continue; } - - if (!vqi->callback) - msix_vec = VIRTIO_MSI_NO_VECTOR; - else if (vp_dev->per_vq_vectors) - msix_vec = allocated_vectors++; - else - msix_vec = VP_MSIX_VQ_VECTOR; - vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, - vqi->name, vqi->ctx, msix_vec); + vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, + vqi->name, vqi->ctx, + &allocated_vectors); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } - - if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR) - continue; - - /* allocate per-vq irq if available and necessary */ - snprintf(vp_dev->msix_names[msix_vec], - sizeof *vp_dev->msix_names, - "%s-%s", - dev_name(&vp_dev->vdev.dev), vqi->name); - err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), - vring_interrupt, 0, - vp_dev->msix_names[msix_vec], - vqs[i]); - if (err) { - vp_del_vq(vqs[i]); - goto error_find; - } } return 0; From 572864d45d61c69aa99461e944f43d3b95c5e159 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:41 +0200 Subject: [PATCH 0024/1052] virtio_pci: simplify vp_request_msix_vectors() call a bit Pass desc arg as it is always to vp_request_msix_vectors(). There rely on per_vq_vectors arg and null desc in case it is false. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-3-jiri@resnulli.us> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_pci_common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 277591a6ab96..cda0d35dd9e1 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -125,6 +125,9 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, GFP_KERNEL)) goto error; + if (!per_vq_vectors) + desc = NULL; + if (desc) { flags |= PCI_IRQ_AFFINITY; desc->pre_vectors++; /* virtio config vector */ @@ -349,8 +352,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, nvectors = 2; } - err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, - per_vq_vectors ? desc : NULL); + err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, desc); if (err) goto error_find; From ef775b2627c83f7c521c38fdcf972567c55e4550 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:42 +0200 Subject: [PATCH 0025/1052] virtio_pci: pass vector policy enum to vp_find_vqs_msix() In preparation for another irq allocation fallback, introduce vector policy enum and pass the values to vp_find_vqs_msix() instead of bool arg. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-4-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index cda0d35dd9e1..7fec2258d125 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -287,6 +287,11 @@ void vp_del_vqs(struct virtio_device *vdev) vp_dev->vqs = NULL; } +enum vp_vq_vector_policy { + VP_VQ_VECTOR_POLICY_EACH, + VP_VQ_VECTOR_POLICY_SHARED, +}; + static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, @@ -328,17 +333,20 @@ static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], - bool per_vq_vectors, + enum vp_vq_vector_policy vector_policy, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; + bool per_vq_vectors; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; + per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; + if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. */ nvectors = 1; @@ -427,11 +435,13 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, int err; /* Try MSI-X with one vector per queue. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, true, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, + VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, false, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, + VP_VQ_VECTOR_POLICY_SHARED, desc); if (!err) return 0; /* Is there an interrupt? If not give up. 
*/ From b8b4e1a05d41c282e63a1772f00e376f6fb84410 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:43 +0200 Subject: [PATCH 0026/1052] virtio_pci: pass vector policy enum to vp_find_one_vq_msix() Instead of accessing vp_dev->per_vq_vectors, pass vector policy enum as an argument of vp_find_one_vq_msix() in preparation for another irq allocation policy. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-5-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 7fec2258d125..628f003db79b 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -292,11 +292,11 @@ enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_SHARED, }; -static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, - int queue_idx, - vq_callback_t *callback, - const char *name, bool ctx, - int *allocated_vectors) +static struct virtqueue * +vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, + vq_callback_t *callback, const char *name, bool ctx, + int *allocated_vectors, + enum vp_vq_vector_policy vector_policy) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; @@ -305,7 +305,7 @@ static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; - else if (vp_dev->per_vq_vectors) + else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH) msix_vec = (*allocated_vectors)++; else msix_vec = VP_MSIX_VQ_VECTOR; @@ -313,7 +313,8 @@ static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, if (IS_ERR(vq)) return vq; - if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR) + if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || + msix_vec == VIRTIO_MSI_NO_VECTOR) return vq; /* allocate per-vq irq if available and necessary */ @@ -374,7 +375,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, - &allocated_vectors); + &allocated_vectors, vector_policy); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; From 06909a4427d6d57757c0a97a4a4be0aea363caab Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:44 +0200 Subject: [PATCH 0027/1052] virtio_pci: introduce vector allocation fallback for slow path virtqueues If there is not enough vectors to satisfy all virtqueues, currently the fallback is to use one vector for all virtqueues. That may be unnecessary in some cases, when there is enough vectors per data queues. Introduce another fallback policy that tries to allocate vector for all data queues, however for slow path queues (control/admin) it shares config vector. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-6-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 53 ++++++++++++++++++++++++++---- drivers/virtio/virtio_pci_common.h | 7 ++-- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 628f003db79b..cb8bbe6c1a9f 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -46,12 +46,26 @@ bool vp_notify(struct virtqueue *vq) return true; } +/* Notify all slow path virtqueues on an interrupt. 
*/ +static void vp_vring_slow_path_interrupt(int irq, + struct virtio_pci_device *vp_dev) +{ + struct virtio_pci_vq_info *info; + unsigned long flags; + + spin_lock_irqsave(&vp_dev->lock, flags); + list_for_each_entry(info, &vp_dev->slow_virtqueues, node) + vring_interrupt(irq, info->vq); + spin_unlock_irqrestore(&vp_dev->lock, flags); +} + /* Handle a configuration change: Tell driver if it wants to know. */ static irqreturn_t vp_config_changed(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; virtio_config_changed(&vp_dev->vdev); + vp_vring_slow_path_interrupt(irq, vp_dev); return IRQ_HANDLED; } @@ -174,6 +188,11 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, return err; } +static bool vp_is_slow_path_vector(u16 msix_vec) +{ + return msix_vec == VP_MSIX_CONFIG_VECTOR; +} + static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, @@ -197,7 +216,10 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int in info->vq = vq; if (callback) { spin_lock_irqsave(&vp_dev->lock, flags); - list_add(&info->node, &vp_dev->virtqueues); + if (!vp_is_slow_path_vector(msix_vec)) + list_add(&info->node, &vp_dev->virtqueues); + else + list_add(&info->node, &vp_dev->slow_virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); } else { INIT_LIST_HEAD(&info->node); @@ -245,7 +267,8 @@ void vp_del_vqs(struct virtio_device *vdev) if (vp_dev->per_vq_vectors) { int v = vp_dev->vqs[vq->index]->msix_vector; - if (v != VIRTIO_MSI_NO_VECTOR) { + if (v != VIRTIO_MSI_NO_VECTOR && + !vp_is_slow_path_vector(v)) { int irq = pci_irq_vector(vp_dev->pci_dev, v); irq_update_affinity_hint(irq, NULL); @@ -289,13 +312,14 @@ void vp_del_vqs(struct virtio_device *vdev) enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_EACH, + VP_VQ_VECTOR_POLICY_SHARED_SLOW, VP_VQ_VECTOR_POLICY_SHARED, }; static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, - int *allocated_vectors, + bool slow_path, int *allocated_vectors, enum vp_vq_vector_policy vector_policy) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -305,8 +329,13 @@ vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; - else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH) + else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH || + (vector_policy == VP_VQ_VECTOR_POLICY_SHARED_SLOW && + !slow_path)) msix_vec = (*allocated_vectors)++; + else if (vector_policy != VP_VQ_VECTOR_POLICY_EACH && + slow_path) + msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec); @@ -314,7 +343,8 @@ vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, return vq; if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || - msix_vec == VIRTIO_MSI_NO_VECTOR) + msix_vec == VIRTIO_MSI_NO_VECTOR || + vp_is_slow_path_vector(msix_vec)) return vq; /* allocate per-vq irq if available and necessary */ @@ -374,7 +404,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, continue; } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, - vqi->name, vqi->ctx, + vqi->name, vqi->ctx, false, &allocated_vectors, vector_policy); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); @@ -440,6 +470,13 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 
0; + /* Fallback: MSI-X with one shared vector for config and + * slow path queues, one vector per queue for the rest. + */ + err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, + VP_VQ_VECTOR_POLICY_SHARED_SLOW, desc); + if (!err) + return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED, desc); @@ -493,7 +530,8 @@ const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!vp_dev->per_vq_vectors || - vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR) + vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR || + vp_is_slow_path_vector(vp_dev->vqs[index]->msix_vector)) return NULL; return pci_irq_get_affinity(vp_dev->pci_dev, @@ -601,6 +639,7 @@ static int virtio_pci_probe(struct pci_dev *pci_dev, vp_dev->vdev.dev.release = virtio_pci_release_dev; vp_dev->pci_dev = pci_dev; INIT_LIST_HEAD(&vp_dev->virtqueues); + INIT_LIST_HEAD(&vp_dev->slow_virtqueues); spin_lock_init(&vp_dev->lock); /* enable the device */ diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 3c4bb2d6163a..0cb6fdc8c354 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -35,7 +35,7 @@ struct virtio_pci_vq_info { /* the actual virtqueue */ struct virtqueue *vq; - /* the list node for the virtqueues list */ + /* the list node for the virtqueues or slow_virtqueues list */ struct list_head node; /* MSI-X vector (or none) */ @@ -66,9 +66,12 @@ struct virtio_pci_device { /* Where to read and clear interrupt */ u8 __iomem *isr; - /* a list of queues so we can dispatch IRQs */ + /* Lists of queues and potentially slow path queues + * so we can dispatch IRQs. + */ spinlock_t lock; struct list_head virtqueues; + struct list_head slow_virtqueues; /* Array of all virtqueues reported in the * PCI common config num_queues field From 4199107e39122e49bbddafb50932943127b63aee Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:45 +0200 Subject: [PATCH 0028/1052] virtio_pci_modern: treat vp_dev->admin_vq.info.vq pointer as static It is guaranteed by the virtio_pci and PCI layers that none of the VFs is probed before setup_vq() is called for admin queue and after del_vq() is called for admin queue. Therefore treat vp_dev->admin_vq.info.vq as static, don't null it and don't take cmd lock during assign. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-7-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.h | 2 +- drivers/virtio/virtio_pci_modern.c | 11 +---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 0cb6fdc8c354..aa43875b6353 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -45,7 +45,7 @@ struct virtio_pci_vq_info { struct virtio_pci_admin_vq { /* Virtqueue info associated with this admin queue. */ struct virtio_pci_vq_info info; - /* serializing admin commands execution and virtqueue deletion */ + /* serializing admin commands execution. */ struct mutex cmd_lock; u64 supported_cmds; /* Name of the admin queue: avq.$vq_index. 
*/ diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 3b5b9499a53a..6d653bea9a87 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -580,11 +580,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, goto err; } - if (is_avq) { - mutex_lock(&vp_dev->admin_vq.cmd_lock); + if (is_avq) vp_dev->admin_vq.info.vq = vq; - mutex_unlock(&vp_dev->admin_vq.cmd_lock); - } return vq; @@ -620,12 +617,6 @@ static void del_vq(struct virtio_pci_vq_info *info) struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - if (vp_is_avq(&vp_dev->vdev, vq->index)) { - mutex_lock(&vp_dev->admin_vq.cmd_lock); - vp_dev->admin_vq.info.vq = NULL; - mutex_unlock(&vp_dev->admin_vq.cmd_lock); - } - if (vp_dev->msix_enabled) vp_modern_queue_vector(mdev, vq->index, VIRTIO_MSI_NO_VECTOR); From 7e5d9f556edca687acb04f171af1061042a35927 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:46 +0200 Subject: [PATCH 0029/1052] virtio: push out code to vp_avq_index() To prepare for the follow-up patch to use the code as an op, push out the code that gets admin virtqueue base index and count to a separate helper. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-8-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 31 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 6d653bea9a87..d704947b1aec 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -28,6 +28,21 @@ static u64 vp_get_features(struct virtio_device *vdev) return vp_modern_get_features(&vp_dev->mdev); } +static int vp_avq_index(struct virtio_device *vdev, u16 *index, u16 *num) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + *num = 0; + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return 0; + + *num = vp_modern_avq_num(&vp_dev->mdev); + if (!(*num)) + return -EINVAL; + *index = vp_modern_avq_index(&vp_dev->mdev); + return 0; +} + static bool vp_is_avq(struct virtio_device *vdev, unsigned int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -729,19 +744,15 @@ static bool vp_get_shm_region(struct virtio_device *vdev, static int vp_modern_create_avq(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_admin_vq *avq; + struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue *vq; - u16 admin_q_num; + u16 num; + int err; - if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) - return 0; + err = vp_avq_index(vdev, &avq->vq_index, &num); + if (err || !num) + return err; - admin_q_num = vp_modern_avq_num(&vp_dev->mdev); - if (!admin_q_num) - return -EINVAL; - - avq = &vp_dev->admin_vq; - avq->vq_index = vp_modern_avq_index(&vp_dev->mdev); sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_dev->setup_vq(vp_dev, &vp_dev->admin_vq.info, avq->vq_index, NULL, avq->name, NULL, VIRTIO_MSI_NO_VECTOR); From 89a1c435aec269d109ed46ca7bbb10fa8edf7ace Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:47 +0200 Subject: [PATCH 0030/1052] virtio_pci: pass vq info as an argument to vp_setup_vq() Instead vp_setup_vq() storing vq info directly to vp_dev->vqs, let the caller provide a pointer to store the info to. This prepares vp_setup_vq() to be able to store admin queue info as well. 
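As a rough sketch of the resulting calling convention (illustrative only, not the literal hunks), the regular path hands in its per-index slot, while the admin path added later in the series hands in the admin queue's own info pointer:

	/* regular virtqueue: info is stored into the per-index slot */
	vq = vp_setup_vq(vdev, queue_idx++, vqi->callback, vqi->name,
			 vqi->ctx, VIRTIO_MSI_NO_VECTOR, &vp_dev->vqs[i]);

	/* admin virtqueue (follow-up patch): info is stored into admin_vq.info */
	vq = vp_setup_vq(vdev, queue_idx++, NULL, avq->name, false,
			 VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info);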
Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-9-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index cb8bbe6c1a9f..7c5516ae1f8a 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -197,7 +197,8 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int in void (*callback)(struct virtqueue *vq), const char *name, bool ctx, - u16 msix_vec) + u16 msix_vec, + struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); @@ -225,7 +226,7 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int in INIT_LIST_HEAD(&info->node); } - vp_dev->vqs[index] = info; + *p_info = info; return vq; out_info: @@ -320,7 +321,8 @@ static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, bool slow_path, int *allocated_vectors, - enum vp_vq_vector_policy vector_policy) + enum vp_vq_vector_policy vector_policy, + struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; @@ -338,7 +340,8 @@ vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; - vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec); + vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec, + p_info); if (IS_ERR(vq)) return vq; @@ -405,7 +408,8 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, false, - &allocated_vectors, vector_policy); + &allocated_vectors, vector_policy, + &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; @@ -445,7 +449,7 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, } vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, - VIRTIO_MSI_NO_VECTOR); + VIRTIO_MSI_NO_VECTOR, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto out_del_vqs; From af22bbe1f4a514c80b89a27252beef033168f4e9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:48 +0200 Subject: [PATCH 0031/1052] virtio: create admin queues alongside other virtqueues The admin virtqueue is just another virtqueue; there is nothing that special about it. The current implementation nevertheless treats it separately in terms of creation and deletion. Unify the admin virtqueue creation and deletion flows to align with the rest of the virtqueues, creating it from the vp_find_vqs_*() helpers. Let the admin virtqueue be deleted by vp_del_vqs() like the rest. Call vp_find_one_vq_msix() with the slow_path argument set to "true" to make sure that in case of limited interrupt vectors the config vector is used for the admin queue. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-10-jiri@resnulli.us> Signed-off-by: Michael S.
Tsirkin --- drivers/virtio/virtio.c | 28 +------------ drivers/virtio/virtio_pci_common.c | 43 ++++++++++++++++++-- drivers/virtio/virtio_pci_common.h | 4 +- drivers/virtio/virtio_pci_modern.c | 63 +----------------------------- include/linux/virtio_config.h | 4 -- 5 files changed, 46 insertions(+), 96 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 396d3cd49a1b..835cfbcb59c8 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -305,15 +305,9 @@ static int virtio_dev_probe(struct device *_d) if (err) goto err; - if (dev->config->create_avq) { - err = dev->config->create_avq(dev); - if (err) - goto err; - } - err = drv->probe(dev); if (err) - goto err_probe; + goto err; /* If probe didn't do it, mark device DRIVER_OK ourselves. */ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) @@ -326,9 +320,6 @@ static int virtio_dev_probe(struct device *_d) return 0; -err_probe: - if (dev->config->destroy_avq) - dev->config->destroy_avq(dev); err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; @@ -344,9 +335,6 @@ static void virtio_dev_remove(struct device *_d) drv->remove(dev); - if (dev->config->destroy_avq) - dev->config->destroy_avq(dev); - /* Driver should have reset device. */ WARN_ON_ONCE(dev->config->get_status(dev)); @@ -524,9 +512,6 @@ int virtio_device_freeze(struct virtio_device *dev) } } - if (dev->config->destroy_avq) - dev->config->destroy_avq(dev); - return 0; } EXPORT_SYMBOL_GPL(virtio_device_freeze); @@ -562,16 +547,10 @@ int virtio_device_restore(struct virtio_device *dev) if (ret) goto err; - if (dev->config->create_avq) { - ret = dev->config->create_avq(dev); - if (ret) - goto err; - } - if (drv->restore) { ret = drv->restore(dev); if (ret) - goto err_restore; + goto err; } /* If restore didn't do it, mark device DRIVER_OK ourselves. 
*/ @@ -582,9 +561,6 @@ int virtio_device_restore(struct virtio_device *dev) return 0; -err_restore: - if (dev->config->destroy_avq) - dev->config->destroy_avq(dev); err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return ret; diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 7c5516ae1f8a..267643bb1cd5 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -262,9 +262,6 @@ void vp_del_vqs(struct virtio_device *vdev) int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { - if (vp_dev->is_avq && vp_dev->is_avq(vdev, vq->index)) - continue; - if (vp_dev->per_vq_vectors) { int v = vp_dev->vqs[vq->index]->msix_vector; @@ -371,14 +368,23 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; + struct virtqueue *vq; bool per_vq_vectors; + u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; + if (vp_dev->avq_index) { + err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); + if (err) + goto error_find; + } + per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; if (per_vq_vectors) { @@ -415,6 +421,18 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, goto error_find; } } + + if (!avq_num) + return 0; + sprintf(avq->name, "avq.%u", avq->vq_index); + vq = vp_find_one_vq_msix(vdev, avq->vq_index, NULL, avq->name, false, + true, &allocated_vectors, vector_policy, + &vp_dev->admin_vq.info); + if (IS_ERR(vq)) { + err = PTR_ERR(vq); + goto error_find; + } + return 0; error_find: @@ -427,12 +445,21 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue_info vqs_info[]) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; int i, err, queue_idx = 0; + struct virtqueue *vq; + u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; + if (vp_dev->avq_index) { + err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); + if (err) + goto out_del_vqs; + } + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vp_dev); if (err) @@ -456,6 +483,16 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, } } + if (!avq_num) + return 0; + sprintf(avq->name, "avq.%u", avq->vq_index); + vq = vp_setup_vq(vdev, queue_idx++, NULL, avq->name, false, + VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); + if (IS_ERR(vq)) { + err = PTR_ERR(vq); + goto out_del_vqs; + } + return 0; out_del_vqs: vp_del_vqs(vdev); diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index aa43875b6353..de59bb06ec3c 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -44,7 +44,7 @@ struct virtio_pci_vq_info { struct virtio_pci_admin_vq { /* Virtqueue info associated with this admin queue. */ - struct virtio_pci_vq_info info; + struct virtio_pci_vq_info *info; /* serializing admin commands execution. 
*/ struct mutex cmd_lock; u64 supported_cmds; @@ -105,7 +105,7 @@ struct virtio_pci_device { void (*del_vq)(struct virtio_pci_vq_info *info); u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector); - bool (*is_avq)(struct virtio_device *vdev, unsigned int index); + int (*avq_index)(struct virtio_device *vdev, u16 *index, u16 *num); }; /* Constants for MSI-X */ diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index d704947b1aec..5ceb4b2c18df 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -63,7 +63,7 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, struct virtqueue *vq; int ret, len; - vq = admin_vq->info.vq; + vq = admin_vq->info->vq; if (!vq) return -EIO; @@ -203,27 +203,12 @@ static void virtio_pci_admin_cmd_list_init(struct virtio_device *virtio_dev) static void vp_modern_avq_activate(struct virtio_device *vdev) { - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq; - if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) return; - __virtqueue_unbreak(admin_vq->info.vq); virtio_pci_admin_cmd_list_init(vdev); } -static void vp_modern_avq_deactivate(struct virtio_device *vdev) -{ - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq; - - if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) - return; - - __virtqueue_break(admin_vq->info.vq); -} - static void vp_transport_features(struct virtio_device *vdev, u64 features) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -418,8 +403,6 @@ static void vp_reset(struct virtio_device *vdev) while (vp_modern_get_status(mdev)) msleep(1); - vp_modern_avq_deactivate(vdev); - /* Flush pending VQ/configuration callbacks. 
*/ vp_synchronize_vectors(vdev); } @@ -595,9 +578,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, goto err; } - if (is_avq) - vp_dev->admin_vq.info.vq = vq; - return vq; err: @@ -741,41 +721,6 @@ static bool vp_get_shm_region(struct virtio_device *vdev, return true; } -static int vp_modern_create_avq(struct virtio_device *vdev) -{ - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; - struct virtqueue *vq; - u16 num; - int err; - - err = vp_avq_index(vdev, &avq->vq_index, &num); - if (err || !num) - return err; - - sprintf(avq->name, "avq.%u", avq->vq_index); - vq = vp_dev->setup_vq(vp_dev, &vp_dev->admin_vq.info, avq->vq_index, NULL, - avq->name, NULL, VIRTIO_MSI_NO_VECTOR); - if (IS_ERR(vq)) { - dev_err(&vdev->dev, "failed to setup admin virtqueue, err=%ld", - PTR_ERR(vq)); - return PTR_ERR(vq); - } - - vp_modern_set_queue_enable(&vp_dev->mdev, avq->info.vq->index, true); - return 0; -} - -static void vp_modern_destroy_avq(struct virtio_device *vdev) -{ - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - - if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) - return; - - vp_dev->del_vq(&vp_dev->admin_vq.info); -} - static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .get = NULL, .set = NULL, @@ -794,8 +739,6 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .get_shm_region = vp_get_shm_region, .disable_vq_and_reset = vp_modern_disable_vq_and_reset, .enable_vq_after_reset = vp_modern_enable_vq_after_reset, - .create_avq = vp_modern_create_avq, - .destroy_avq = vp_modern_destroy_avq, }; static const struct virtio_config_ops virtio_pci_config_ops = { @@ -816,8 +759,6 @@ static const struct virtio_config_ops virtio_pci_config_ops = { .get_shm_region = vp_get_shm_region, .disable_vq_and_reset = vp_modern_disable_vq_and_reset, .enable_vq_after_reset = vp_modern_enable_vq_after_reset, - .create_avq = vp_modern_create_avq, - .destroy_avq = vp_modern_destroy_avq, }; /* the PCI probing function */ @@ -841,7 +782,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) vp_dev->config_vector = vp_config_vector; vp_dev->setup_vq = setup_vq; vp_dev->del_vq = del_vq; - vp_dev->is_avq = vp_is_avq; + vp_dev->avq_index = vp_avq_index; vp_dev->isr = mdev->isr; vp_dev->vdev.id = mdev->id; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index ab4b9a3fef6b..169c7d367fac 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -104,8 +104,6 @@ struct virtqueue_info { * Returns 0 on success or error status * If disable_vq_and_reset is set, then enable_vq_after_reset must also be * set. - * @create_avq: create admin virtqueue resource. - * @destroy_avq: destroy admin virtqueue resource. */ struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, @@ -133,8 +131,6 @@ struct virtio_config_ops { struct virtio_shm_region *region, u8 id); int (*disable_vq_and_reset)(struct virtqueue *vq); int (*enable_vq_after_reset)(struct virtqueue *vq); - int (*create_avq)(struct virtio_device *vdev); - void (*destroy_avq)(struct virtio_device *vdev); }; /* If driver didn't advertise the feature, it will never appear. 
*/ From b00c4150c435eed6e80ab302ae3a02a3d41460c3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:49 +0200 Subject: [PATCH 0032/1052] virtio_pci_modern: create admin queue of queried size Don't limit the admin queue size to VIRTIO_AVQ_SGS_MAX; rather, rely on the queried queue size. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-11-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 5ceb4b2c18df..a649c9dc436d 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -550,8 +550,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (index >= vp_modern_get_num_queues(mdev) && !is_avq) return ERR_PTR(-EINVAL); - num = is_avq ? - VIRTIO_AVQ_SGS_MAX : vp_modern_get_queue_size(mdev, index); + num = vp_modern_get_queue_size(mdev, index); /* Check if queue is either not available or already active. */ if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); From 7090f2b5ad33e9b6ba68eb927b02e8a286d21fb4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:50 +0200 Subject: [PATCH 0033/1052] virtio_pci_modern: pass cmd as an identification token In preparation for asynchronous admin queue processing, pass the cmd pointer as the data arg to virtqueue_add_sgs(). Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-12-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index a649c9dc436d..0fd344d1eaf9 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -58,7 +58,7 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, struct scatterlist **sgs, unsigned int out_num, unsigned int in_num, - void *data) + struct virtio_admin_cmd *cmd) { struct virtqueue *vq; int ret, len; @@ -72,7 +72,7 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, !((1ULL << opcode) & admin_vq->supported_cmds)) return -EOPNOTSUPP; - ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, data, GFP_KERNEL); + ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_KERNEL); if (ret < 0) return -EIO; @@ -140,7 +140,7 @@ int vp_modern_admin_cmd_exec(struct virtio_device *vdev, mutex_lock(&vp_dev->admin_vq.cmd_lock); ret = virtqueue_exec_admin_cmd(&vp_dev->admin_vq, le16_to_cpu(cmd->opcode), - sgs, out_num, in_num, sgs); + sgs, out_num, in_num, cmd); mutex_unlock(&vp_dev->admin_vq.cmd_lock); if (ret) { From 4c3b54af907e709609d3d8beca92d65e2f0cfd83 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:51 +0200 Subject: [PATCH 0034/1052] virtio_pci_modern: use completion instead of busy loop to wait on admin cmd result Currently, the code waits in a busy loop on every issued admin virtqueue command to get a reply. That prevents callers from issuing multiple commands in parallel. To overcome this limitation, introduce a virtqueue event callback for the admin virtqueue. For every issued command, use a completion to wait on the reply; in the event callback, trigger the completion for every incoming reply. Alongside that, introduce a spin lock to protect the admin virtqueue operations.
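Roughly, the resulting flow looks like the following sketch (condensed from the hunks below; error handling and the -ENOSPC retry are trimmed):

	/* submission side (virtqueue_exec_admin_cmd, simplified) */
	init_completion(&cmd->completion);
	spin_lock_irqsave(&admin_vq->lock, flags);
	ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_KERNEL);
	virtqueue_kick(vq);
	spin_unlock_irqrestore(&admin_vq->lock, flags);
	wait_for_completion(&cmd->completion);

	/* completion side (vp_modern_avq_done callback, simplified) */
	while ((cmd = virtqueue_get_buf(vq, &len)))
		complete(&cmd->completion);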
Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-13-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 13 +++-- drivers/virtio/virtio_pci_common.h | 3 ++ drivers/virtio/virtio_pci_modern.c | 76 +++++++++++++++++++++++++----- include/linux/virtio.h | 3 ++ 4 files changed, 78 insertions(+), 17 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 267643bb1cd5..c44d8ba00c02 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -395,6 +395,8 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, if (vqi->name && vqi->callback) ++nvectors; } + if (avq_num && vector_policy == VP_VQ_VECTOR_POLICY_EACH) + ++nvectors; } else { /* Second best: one for change, shared for all vqs. */ nvectors = 2; @@ -425,9 +427,9 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); - vq = vp_find_one_vq_msix(vdev, avq->vq_index, NULL, avq->name, false, - true, &allocated_vectors, vector_policy, - &vp_dev->admin_vq.info); + vq = vp_find_one_vq_msix(vdev, avq->vq_index, vp_modern_avq_done, + avq->name, false, true, &allocated_vectors, + vector_policy, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto error_find; @@ -486,8 +488,9 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); - vq = vp_setup_vq(vdev, queue_idx++, NULL, avq->name, false, - VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); + vq = vp_setup_vq(vdev, queue_idx++, vp_modern_avq_done, avq->name, + false, VIRTIO_MSI_NO_VECTOR, + &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto out_del_vqs; diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index de59bb06ec3c..90df381fbbcf 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -47,6 +47,8 @@ struct virtio_pci_admin_vq { struct virtio_pci_vq_info *info; /* serializing admin commands execution. */ struct mutex cmd_lock; + /* Protects virtqueue access. */ + spinlock_t lock; u64 supported_cmds; /* Name of the admin queue: avq.$vq_index. 
*/ char name[10]; @@ -178,6 +180,7 @@ struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev); #define VIRTIO_ADMIN_CMD_BITMAP 0 #endif +void vp_modern_avq_done(struct virtqueue *vq); int vp_modern_admin_cmd_exec(struct virtio_device *vdev, struct virtio_admin_cmd *cmd); diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 0fd344d1eaf9..608df3263df1 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -53,6 +53,23 @@ static bool vp_is_avq(struct virtio_device *vdev, unsigned int index) return index == vp_dev->admin_vq.vq_index; } +void vp_modern_avq_done(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq; + struct virtio_admin_cmd *cmd; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&admin_vq->lock, flags); + do { + virtqueue_disable_cb(vq); + while ((cmd = virtqueue_get_buf(vq, &len))) + complete(&cmd->completion); + } while (!virtqueue_enable_cb(vq)); + spin_unlock_irqrestore(&admin_vq->lock, flags); +} + static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, u16 opcode, struct scatterlist **sgs, @@ -61,7 +78,8 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, struct virtio_admin_cmd *cmd) { struct virtqueue *vq; - int ret, len; + unsigned long flags; + int ret; vq = admin_vq->info->vq; if (!vq) @@ -72,21 +90,33 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, !((1ULL << opcode) & admin_vq->supported_cmds)) return -EOPNOTSUPP; - ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_KERNEL); - if (ret < 0) - return -EIO; - - if (unlikely(!virtqueue_kick(vq))) - return -EIO; - - while (!virtqueue_get_buf(vq, &len) && - !virtqueue_is_broken(vq)) - cpu_relax(); + init_completion(&cmd->completion); +again: if (virtqueue_is_broken(vq)) return -EIO; - return 0; + spin_lock_irqsave(&admin_vq->lock, flags); + ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_KERNEL); + if (ret < 0) { + if (ret == -ENOSPC) { + spin_unlock_irqrestore(&admin_vq->lock, flags); + cpu_relax(); + goto again; + } + goto unlock_err; + } + if (!virtqueue_kick(vq)) + goto unlock_err; + spin_unlock_irqrestore(&admin_vq->lock, flags); + + wait_for_completion(&cmd->completion); + + return cmd->ret; + +unlock_err: + spin_unlock_irqrestore(&admin_vq->lock, flags); + return -EIO; } int vp_modern_admin_cmd_exec(struct virtio_device *vdev, @@ -209,6 +239,25 @@ static void vp_modern_avq_activate(struct virtio_device *vdev) virtio_pci_admin_cmd_list_init(vdev); } +static void vp_modern_avq_cleanup(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_admin_cmd *cmd; + struct virtqueue *vq; + + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return; + + vq = vp_dev->vqs[vp_dev->admin_vq.vq_index]->vq; + if (!vq) + return; + + while ((cmd = virtqueue_detach_unused_buf(vq))) { + cmd->ret = -EIO; + complete(&cmd->completion); + } +} + static void vp_transport_features(struct virtio_device *vdev, u64 features) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -403,6 +452,8 @@ static void vp_reset(struct virtio_device *vdev) while (vp_modern_get_status(mdev)) msleep(1); + vp_modern_avq_cleanup(vdev); + /* Flush pending VQ/configuration callbacks. 
*/ vp_synchronize_vectors(vdev); } @@ -785,6 +836,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) vp_dev->isr = mdev->isr; vp_dev->vdev.id = mdev->id; + spin_lock_init(&vp_dev->admin_vq.lock); mutex_init(&vp_dev->admin_vq.cmd_lock); return 0; } diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 96fea920873b..999ff5934392 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -10,6 +10,7 @@ #include #include #include +#include /** * struct virtqueue - a queue to register buffers for sending or receiving. @@ -109,6 +110,8 @@ struct virtio_admin_cmd { __le64 group_member_id; struct scatterlist *data_sg; struct scatterlist *result_sg; + struct completion completion; + int ret; }; /** From 6d834691da474ed1c648753d3d3a3ef8379fa1c1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:52 +0200 Subject: [PATCH 0035/1052] virtio_pci_modern: remove admin queue serialization lock The admin queue operations are protected by newly introduced spin lock. To make it possible to issue parallel commands, remove the admin queue serialization lock. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-14-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.h | 2 -- drivers/virtio/virtio_pci_modern.c | 5 ----- 2 files changed, 7 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 90df381fbbcf..1d9c49947f52 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -45,8 +45,6 @@ struct virtio_pci_vq_info { struct virtio_pci_admin_vq { /* Virtqueue info associated with this admin queue. */ struct virtio_pci_vq_info *info; - /* serializing admin commands execution. */ - struct mutex cmd_lock; /* Protects virtqueue access. */ spinlock_t lock; u64 supported_cmds; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 608df3263df1..9193c30d640a 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -167,12 +167,9 @@ int vp_modern_admin_cmd_exec(struct virtio_device *vdev, in_num++; } - mutex_lock(&vp_dev->admin_vq.cmd_lock); ret = virtqueue_exec_admin_cmd(&vp_dev->admin_vq, le16_to_cpu(cmd->opcode), sgs, out_num, in_num, cmd); - mutex_unlock(&vp_dev->admin_vq.cmd_lock); - if (ret) { dev_err(&vdev->dev, "Failed to execute command on admin vq: %d\n.", ret); @@ -837,7 +834,6 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) vp_dev->vdev.id = mdev->id; spin_lock_init(&vp_dev->admin_vq.lock); - mutex_init(&vp_dev->admin_vq.cmd_lock); return 0; } @@ -845,6 +841,5 @@ void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) { struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - mutex_destroy(&vp_dev->admin_vq.cmd_lock); vp_modern_remove(mdev); } From f0d17d696dfce77c9abc830e4ac2d677890a2dad Mon Sep 17 00:00:00 2001 From: Tatsunosuke Tobita Date: Tue, 9 Jul 2024 14:57:28 +0900 Subject: [PATCH 0036/1052] HID: wacom: Modify pen IDs The pen ID, 0x80842, was not the correct ID for wacom driver to treat. The ID was corrected to 0x8842. Also, 0x4200 was not the expected ID used on any Wacom device. Therefore, 0x4200 was removed. 
Signed-off-by: Tatsunosuke Tobita Signed-off-by: Tatsunosuke Tobita Fixes: bfdc750c4cb2 ("HID: wacom: add three styli to wacom_intuos_get_tool_type") Cc: stable@kernel.org #6.2 Reviewed-by: Ping Cheng Link: https://patch.msgid.link/20240709055729.17158-1-tatsunosuke.wacom@gmail.com Signed-off-by: Benjamin Tissoires --- drivers/hid/wacom_wac.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index a44367aef621..20de97ce0f5e 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -714,13 +714,12 @@ static int wacom_intuos_get_tool_type(int tool_id) case 0x8e2: /* IntuosHT2 pen */ case 0x022: case 0x200: /* Pro Pen 3 */ - case 0x04200: /* Pro Pen 3 */ case 0x10842: /* MobileStudio Pro Pro Pen slim */ case 0x14802: /* Intuos4/5 13HD/24HD Classic Pen */ case 0x16802: /* Cintiq 13HD Pro Pen */ case 0x18802: /* DTH2242 Pen */ case 0x10802: /* Intuos4/5 13HD/24HD General Pen */ - case 0x80842: /* Intuos Pro and Cintiq Pro 3D Pen */ + case 0x8842: /* Intuos Pro and Cintiq Pro 3D Pen */ tool_type = BTN_TOOL_PEN; break; From 9c2913b962daf3e5a947babf93f2125765eeca09 Mon Sep 17 00:00:00 2001 From: Tatsunosuke Tobita Date: Tue, 9 Jul 2024 14:57:29 +0900 Subject: [PATCH 0037/1052] HID: wacom: more appropriate tool type categorization Recent eraser and pen tools are able to be distinguished by looking at its least significant nibble. This modification applies it to wacom_intuos_get_tool_type function and reduces its code lines. Signed-off-by: Tatsunosuke Tobita Signed-off-by: Tatsunosuke Tobita Reviewed-by: Ping Cheng Link: https://patch.msgid.link/20240709055729.17158-2-tatsunosuke.wacom@gmail.com Signed-off-by: Benjamin Tissoires --- drivers/hid/wacom_wac.c | 66 +++++++---------------------------------- 1 file changed, 10 insertions(+), 56 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 20de97ce0f5e..1f4564982b95 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -692,77 +692,28 @@ static bool wacom_is_art_pen(int tool_id) static int wacom_intuos_get_tool_type(int tool_id) { - int tool_type = BTN_TOOL_PEN; - - if (wacom_is_art_pen(tool_id)) - return tool_type; - switch (tool_id) { case 0x812: /* Inking pen */ case 0x801: /* Intuos3 Inking pen */ case 0x12802: /* Intuos4/5 Inking Pen */ case 0x012: - tool_type = BTN_TOOL_PENCIL; - break; - - case 0x822: /* Pen */ - case 0x842: - case 0x852: - case 0x823: /* Intuos3 Grip Pen */ - case 0x813: /* Intuos3 Classic Pen */ - case 0x802: /* Intuos4/5 13HD/24HD General Pen */ - case 0x8e2: /* IntuosHT2 pen */ - case 0x022: - case 0x200: /* Pro Pen 3 */ - case 0x10842: /* MobileStudio Pro Pro Pen slim */ - case 0x14802: /* Intuos4/5 13HD/24HD Classic Pen */ - case 0x16802: /* Cintiq 13HD Pro Pen */ - case 0x18802: /* DTH2242 Pen */ - case 0x10802: /* Intuos4/5 13HD/24HD General Pen */ - case 0x8842: /* Intuos Pro and Cintiq Pro 3D Pen */ - tool_type = BTN_TOOL_PEN; - break; + return BTN_TOOL_PENCIL; case 0x832: /* Stroke pen */ case 0x032: - tool_type = BTN_TOOL_BRUSH; - break; + return BTN_TOOL_BRUSH; case 0x007: /* Mouse 4D and 2D */ case 0x09c: case 0x094: case 0x017: /* Intuos3 2D Mouse */ case 0x806: /* Intuos4 Mouse */ - tool_type = BTN_TOOL_MOUSE; - break; + return BTN_TOOL_MOUSE; case 0x096: /* Lens cursor */ case 0x097: /* Intuos3 Lens cursor */ case 0x006: /* Intuos4 Lens cursor */ - tool_type = BTN_TOOL_LENS; - break; - - case 0x82a: /* Eraser */ - case 0x84a: - case 0x85a: - case 0x91a: - case 0xd1a: - case 0x0fa: - 
case 0x82b: /* Intuos3 Grip Pen Eraser */ - case 0x81b: /* Intuos3 Classic Pen Eraser */ - case 0x91b: /* Intuos3 Airbrush Eraser */ - case 0x80c: /* Intuos4/5 13HD/24HD Marker Pen Eraser */ - case 0x80a: /* Intuos4/5 13HD/24HD General Pen Eraser */ - case 0x90a: /* Intuos4/5 13HD/24HD Airbrush Eraser */ - case 0x1480a: /* Intuos4/5 13HD/24HD Classic Pen Eraser */ - case 0x1090a: /* Intuos4/5 13HD/24HD Airbrush Eraser */ - case 0x1080c: /* Intuos4/5 13HD/24HD Art Pen Eraser */ - case 0x1084a: /* MobileStudio Pro Pro Pen slim Eraser */ - case 0x1680a: /* Cintiq 13HD Pro Pen Eraser */ - case 0x1880a: /* DTH2242 Eraser */ - case 0x1080a: /* Intuos4/5 13HD/24HD General Pen Eraser */ - tool_type = BTN_TOOL_RUBBER; - break; + return BTN_TOOL_LENS; case 0xd12: case 0x912: @@ -770,10 +721,13 @@ static int wacom_intuos_get_tool_type(int tool_id) case 0x913: /* Intuos3 Airbrush */ case 0x902: /* Intuos4/5 13HD/24HD Airbrush */ case 0x10902: /* Intuos4/5 13HD/24HD Airbrush */ - tool_type = BTN_TOOL_AIRBRUSH; - break; + return BTN_TOOL_AIRBRUSH; + + default: + if (tool_id & 0x0008) + return BTN_TOOL_RUBBER; + return BTN_TOOL_PEN; } - return tool_type; } static void wacom_exit_report(struct wacom_wac *wacom) From 6ce9efd12ae81cf46bf44eb0348594558dfbb9d2 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 11 Jul 2024 14:53:30 +0100 Subject: [PATCH 0038/1052] drm/v3d: Prevent out of bounds access in performance query extensions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check that the number of perfmons userspace is passing in the copy and reset extensions is not greater than the internal kernel storage where the ids will be copied into. Signed-off-by: Tvrtko Ursulin Fixes: bae7cb5d6800 ("drm/v3d: Create a CPU job extension for the reset performance query job") Cc: Maíra Canal Cc: Iago Toral Quiroga Cc: stable@vger.kernel.org # v6.8+ Reviewed-by: Iago Toral Quiroga Reviewed-by: Maíra Canal Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20240711135340.84617-2-tursulin@igalia.com (cherry picked from commit f32b5128d2c440368b5bf3a7a356823e235caabb) Signed-off-by: Thomas Zimmermann --- drivers/gpu/drm/v3d/v3d_submit.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c index 88f63d526b22..263fefc1d04f 100644 --- a/drivers/gpu/drm/v3d/v3d_submit.c +++ b/drivers/gpu/drm/v3d/v3d_submit.c @@ -637,6 +637,9 @@ v3d_get_cpu_reset_performance_params(struct drm_file *file_priv, if (copy_from_user(&reset, ext, sizeof(reset))) return -EFAULT; + if (reset.nperfmons > V3D_MAX_PERFMONS) + return -EINVAL; + job->job_type = V3D_CPU_JOB_TYPE_RESET_PERFORMANCE_QUERY; job->performance_query.queries = kvmalloc_array(reset.count, @@ -708,6 +711,9 @@ v3d_get_cpu_copy_performance_query_params(struct drm_file *file_priv, if (copy.pad) return -EINVAL; + if (copy.nperfmons > V3D_MAX_PERFMONS) + return -EINVAL; + job->job_type = V3D_CPU_JOB_TYPE_COPY_PERFORMANCE_QUERY; job->performance_query.queries = kvmalloc_array(copy.count, From 0e50fcc20bd87584840266e8004f9064a8985b4f Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 11 Jul 2024 14:53:31 +0100 Subject: [PATCH 0039/1052] drm/v3d: Fix potential memory leak in the timestamp extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If fetching of userspace memory fails during the main loop, all drm sync objs looked up until that point will be leaked because of the missing 
drm_syncobj_put. Fix it by exporting and using a common cleanup helper. Signed-off-by: Tvrtko Ursulin Fixes: 9ba0ff3e083f ("drm/v3d: Create a CPU job extension for the timestamp query job") Cc: Maíra Canal Cc: Iago Toral Quiroga Cc: stable@vger.kernel.org # v6.8+ Reviewed-by: Maíra Canal Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20240711135340.84617-3-tursulin@igalia.com (cherry picked from commit 753ce4fea62182c77e1691ab4f9022008f25b62e) Signed-off-by: Thomas Zimmermann --- drivers/gpu/drm/v3d/v3d_drv.h | 2 ++ drivers/gpu/drm/v3d/v3d_sched.c | 22 +++++++++++----- drivers/gpu/drm/v3d/v3d_submit.c | 43 ++++++++++++++++++++++---------- 3 files changed, 48 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index a2c516fe6d79..c46eed35d26b 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -556,6 +556,8 @@ void v3d_mmu_insert_ptes(struct v3d_bo *bo); void v3d_mmu_remove_ptes(struct v3d_bo *bo); /* v3d_sched.c */ +void v3d_timestamp_query_info_free(struct v3d_timestamp_query_info *query_info, + unsigned int count); void v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue); int v3d_sched_init(struct v3d_dev *v3d); void v3d_sched_fini(struct v3d_dev *v3d); diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 7cd8c335cd9b..3da4fa49552b 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -73,18 +73,28 @@ v3d_sched_job_free(struct drm_sched_job *sched_job) v3d_job_cleanup(job); } +void +v3d_timestamp_query_info_free(struct v3d_timestamp_query_info *query_info, + unsigned int count) +{ + if (query_info->queries) { + unsigned int i; + + for (i = 0; i < count; i++) + drm_syncobj_put(query_info->queries[i].syncobj); + + kvfree(query_info->queries); + } +} + static void v3d_cpu_job_free(struct drm_sched_job *sched_job) { struct v3d_cpu_job *job = to_cpu_job(sched_job); - struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query; struct v3d_performance_query_info *performance_query = &job->performance_query; - if (timestamp_query->queries) { - for (int i = 0; i < timestamp_query->count; i++) - drm_syncobj_put(timestamp_query->queries[i].syncobj); - kvfree(timestamp_query->queries); - } + v3d_timestamp_query_info_free(&job->timestamp_query, + job->timestamp_query.count); if (performance_query->queries) { for (int i = 0; i < performance_query->count; i++) diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c index 263fefc1d04f..121bf1314b80 100644 --- a/drivers/gpu/drm/v3d/v3d_submit.c +++ b/drivers/gpu/drm/v3d/v3d_submit.c @@ -452,6 +452,8 @@ v3d_get_cpu_timestamp_query_params(struct drm_file *file_priv, { u32 __user *offsets, *syncs; struct drm_v3d_timestamp_query timestamp; + unsigned int i; + int err; if (!job) { DRM_DEBUG("CPU job extension was attached to a GPU job.\n"); @@ -480,19 +482,19 @@ v3d_get_cpu_timestamp_query_params(struct drm_file *file_priv, offsets = u64_to_user_ptr(timestamp.offsets); syncs = u64_to_user_ptr(timestamp.syncs); - for (int i = 0; i < timestamp.count; i++) { + for (i = 0; i < timestamp.count; i++) { u32 offset, sync; if (copy_from_user(&offset, offsets++, sizeof(offset))) { - kvfree(job->timestamp_query.queries); - return -EFAULT; + err = -EFAULT; + goto error; } job->timestamp_query.queries[i].offset = offset; if (copy_from_user(&sync, syncs++, sizeof(sync))) { - kvfree(job->timestamp_query.queries); - return -EFAULT; + err = -EFAULT; 
+ goto error; } job->timestamp_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); @@ -500,6 +502,10 @@ v3d_get_cpu_timestamp_query_params(struct drm_file *file_priv, job->timestamp_query.count = timestamp.count; return 0; + +error: + v3d_timestamp_query_info_free(&job->timestamp_query, i); + return err; } static int @@ -509,6 +515,8 @@ v3d_get_cpu_reset_timestamp_params(struct drm_file *file_priv, { u32 __user *syncs; struct drm_v3d_reset_timestamp_query reset; + unsigned int i; + int err; if (!job) { DRM_DEBUG("CPU job extension was attached to a GPU job.\n"); @@ -533,14 +541,14 @@ v3d_get_cpu_reset_timestamp_params(struct drm_file *file_priv, syncs = u64_to_user_ptr(reset.syncs); - for (int i = 0; i < reset.count; i++) { + for (i = 0; i < reset.count; i++) { u32 sync; job->timestamp_query.queries[i].offset = reset.offset + 8 * i; if (copy_from_user(&sync, syncs++, sizeof(sync))) { - kvfree(job->timestamp_query.queries); - return -EFAULT; + err = -EFAULT; + goto error; } job->timestamp_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); @@ -548,6 +556,10 @@ v3d_get_cpu_reset_timestamp_params(struct drm_file *file_priv, job->timestamp_query.count = reset.count; return 0; + +error: + v3d_timestamp_query_info_free(&job->timestamp_query, i); + return err; } /* Get data for the copy timestamp query results job submission. */ @@ -558,7 +570,8 @@ v3d_get_cpu_copy_query_results_params(struct drm_file *file_priv, { u32 __user *offsets, *syncs; struct drm_v3d_copy_timestamp_query copy; - int i; + unsigned int i; + int err; if (!job) { DRM_DEBUG("CPU job extension was attached to a GPU job.\n"); @@ -591,15 +604,15 @@ v3d_get_cpu_copy_query_results_params(struct drm_file *file_priv, u32 offset, sync; if (copy_from_user(&offset, offsets++, sizeof(offset))) { - kvfree(job->timestamp_query.queries); - return -EFAULT; + err = -EFAULT; + goto error; } job->timestamp_query.queries[i].offset = offset; if (copy_from_user(&sync, syncs++, sizeof(sync))) { - kvfree(job->timestamp_query.queries); - return -EFAULT; + err = -EFAULT; + goto error; } job->timestamp_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); @@ -613,6 +626,10 @@ v3d_get_cpu_copy_query_results_params(struct drm_file *file_priv, job->copy.stride = copy.stride; return 0; + +error: + v3d_timestamp_query_info_free(&job->timestamp_query, i); + return err; } static int From 32df4abc44f24dbec239d43e2b26d5768c5d1a78 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 11 Jul 2024 14:53:32 +0100 Subject: [PATCH 0040/1052] drm/v3d: Fix potential memory leak in the performance extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If fetching of userspace memory fails during the main loop, all drm sync objs looked up until that point will be leaked because of the missing drm_syncobj_put. Fix it by exporting and using a common cleanup helper. 
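Condensed, the unwind pattern is the same as in the timestamp case (sketch only, not the full hunks): the parser tracks how many queries have been populated so far, and the shared helper drops exactly those syncobj references:

	for (i = 0; i < count; i++) {
		if (copy_from_user(&sync, syncs++, sizeof(sync))) {
			err = -EFAULT;
			goto error;	/* only i entries populated so far */
		}
		/* ... look up and store queries[i] ... */
	}
	return 0;

error:
	v3d_performance_query_info_free(&job->performance_query, i);
	return err;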
Signed-off-by: Tvrtko Ursulin Fixes: bae7cb5d6800 ("drm/v3d: Create a CPU job extension for the reset performance query job") Cc: Maíra Canal Cc: Iago Toral Quiroga Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20240711135340.84617-4-tursulin@igalia.com (cherry picked from commit 484de39fa5f5b7bd0c5f2e2c5265167250ef7501) Signed-off-by: Thomas Zimmermann --- drivers/gpu/drm/v3d/v3d_drv.h | 2 ++ drivers/gpu/drm/v3d/v3d_sched.c | 22 ++++++++++---- drivers/gpu/drm/v3d/v3d_submit.c | 52 ++++++++++++++++++++------------ 3 files changed, 50 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index c46eed35d26b..1d535abedc57 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -558,6 +558,8 @@ void v3d_mmu_remove_ptes(struct v3d_bo *bo); /* v3d_sched.c */ void v3d_timestamp_query_info_free(struct v3d_timestamp_query_info *query_info, unsigned int count); +void v3d_performance_query_info_free(struct v3d_performance_query_info *query_info, + unsigned int count); void v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue); int v3d_sched_init(struct v3d_dev *v3d); void v3d_sched_fini(struct v3d_dev *v3d); diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 3da4fa49552b..30d5366d6288 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -87,20 +87,30 @@ v3d_timestamp_query_info_free(struct v3d_timestamp_query_info *query_info, } } +void +v3d_performance_query_info_free(struct v3d_performance_query_info *query_info, + unsigned int count) +{ + if (query_info->queries) { + unsigned int i; + + for (i = 0; i < count; i++) + drm_syncobj_put(query_info->queries[i].syncobj); + + kvfree(query_info->queries); + } +} + static void v3d_cpu_job_free(struct drm_sched_job *sched_job) { struct v3d_cpu_job *job = to_cpu_job(sched_job); - struct v3d_performance_query_info *performance_query = &job->performance_query; v3d_timestamp_query_info_free(&job->timestamp_query, job->timestamp_query.count); - if (performance_query->queries) { - for (int i = 0; i < performance_query->count; i++) - drm_syncobj_put(performance_query->queries[i].syncobj); - kvfree(performance_query->queries); - } + v3d_performance_query_info_free(&job->performance_query, + job->performance_query.count); v3d_job_cleanup(&job->base); } diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c index 121bf1314b80..50be4e8a7512 100644 --- a/drivers/gpu/drm/v3d/v3d_submit.c +++ b/drivers/gpu/drm/v3d/v3d_submit.c @@ -640,6 +640,8 @@ v3d_get_cpu_reset_performance_params(struct drm_file *file_priv, u32 __user *syncs; u64 __user *kperfmon_ids; struct drm_v3d_reset_performance_query reset; + unsigned int i, j; + int err; if (!job) { DRM_DEBUG("CPU job extension was attached to a GPU job.\n"); @@ -668,39 +670,43 @@ v3d_get_cpu_reset_performance_params(struct drm_file *file_priv, syncs = u64_to_user_ptr(reset.syncs); kperfmon_ids = u64_to_user_ptr(reset.kperfmon_ids); - for (int i = 0; i < reset.count; i++) { + for (i = 0; i < reset.count; i++) { u32 sync; u64 ids; u32 __user *ids_pointer; u32 id; if (copy_from_user(&sync, syncs++, sizeof(sync))) { - kvfree(job->performance_query.queries); - return -EFAULT; + err = -EFAULT; + goto error; } - job->performance_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); - if (copy_from_user(&ids, kperfmon_ids++, sizeof(ids))) { - kvfree(job->performance_query.queries); - 
return -EFAULT; + err = -EFAULT; + goto error; } ids_pointer = u64_to_user_ptr(ids); - for (int j = 0; j < reset.nperfmons; j++) { + for (j = 0; j < reset.nperfmons; j++) { if (copy_from_user(&id, ids_pointer++, sizeof(id))) { - kvfree(job->performance_query.queries); - return -EFAULT; + err = -EFAULT; + goto error; } job->performance_query.queries[i].kperfmon_ids[j] = id; } + + job->performance_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); } job->performance_query.count = reset.count; job->performance_query.nperfmons = reset.nperfmons; return 0; + +error: + v3d_performance_query_info_free(&job->performance_query, i); + return err; } static int @@ -711,6 +717,8 @@ v3d_get_cpu_copy_performance_query_params(struct drm_file *file_priv, u32 __user *syncs; u64 __user *kperfmon_ids; struct drm_v3d_copy_performance_query copy; + unsigned int i, j; + int err; if (!job) { DRM_DEBUG("CPU job extension was attached to a GPU job.\n"); @@ -742,34 +750,34 @@ v3d_get_cpu_copy_performance_query_params(struct drm_file *file_priv, syncs = u64_to_user_ptr(copy.syncs); kperfmon_ids = u64_to_user_ptr(copy.kperfmon_ids); - for (int i = 0; i < copy.count; i++) { + for (i = 0; i < copy.count; i++) { u32 sync; u64 ids; u32 __user *ids_pointer; u32 id; if (copy_from_user(&sync, syncs++, sizeof(sync))) { - kvfree(job->performance_query.queries); - return -EFAULT; + err = -EFAULT; + goto error; } - job->performance_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); - if (copy_from_user(&ids, kperfmon_ids++, sizeof(ids))) { - kvfree(job->performance_query.queries); - return -EFAULT; + err = -EFAULT; + goto error; } ids_pointer = u64_to_user_ptr(ids); - for (int j = 0; j < copy.nperfmons; j++) { + for (j = 0; j < copy.nperfmons; j++) { if (copy_from_user(&id, ids_pointer++, sizeof(id))) { - kvfree(job->performance_query.queries); - return -EFAULT; + err = -EFAULT; + goto error; } job->performance_query.queries[i].kperfmon_ids[j] = id; } + + job->performance_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); } job->performance_query.count = copy.count; job->performance_query.nperfmons = copy.nperfmons; @@ -782,6 +790,10 @@ v3d_get_cpu_copy_performance_query_params(struct drm_file *file_priv, job->copy.stride = copy.stride; return 0; + +error: + v3d_performance_query_info_free(&job->performance_query, i); + return err; } /* Whenever userspace sets ioctl extensions, v3d_get_extensions parses data From 023d22e8bb0cdd6900382ad1ed06df3b6c2ea791 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 11 Jul 2024 14:53:33 +0100 Subject: [PATCH 0041/1052] drm/v3d: Validate passed in drm syncobj handles in the timestamp extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If userspace provides an unknown or invalid handle anywhere in the handle array the rest of the driver will not handle that well. Fix it by checking handle was looked up successfully or otherwise fail the extension by jumping into the existing unwind. 
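drm_syncobj_find() returns NULL when the handle does not resolve to a syncobj, so each lookup now gets a check of roughly this shape (sketch; the hunks below add it to all three timestamp parsers), reusing the unwind introduced by the earlier leak fixes:

	job->timestamp_query.queries[i].syncobj =
		drm_syncobj_find(file_priv, sync);
	if (!job->timestamp_query.queries[i].syncobj) {
		err = -ENOENT;
		goto error;
	}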
Signed-off-by: Tvrtko Ursulin Fixes: 9ba0ff3e083f ("drm/v3d: Create a CPU job extension for the timestamp query job") Cc: Maíra Canal Cc: Iago Toral Quiroga Cc: stable@vger.kernel.org # v6.8+ Reviewed-by: Maíra Canal Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20240711135340.84617-5-tursulin@igalia.com (cherry picked from commit 8d1276d1b8f738c3afe1457d4dff5cc66fc848a3) Signed-off-by: Thomas Zimmermann --- drivers/gpu/drm/v3d/v3d_submit.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c index 50be4e8a7512..9a3e32075ebe 100644 --- a/drivers/gpu/drm/v3d/v3d_submit.c +++ b/drivers/gpu/drm/v3d/v3d_submit.c @@ -498,6 +498,10 @@ v3d_get_cpu_timestamp_query_params(struct drm_file *file_priv, } job->timestamp_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); + if (!job->timestamp_query.queries[i].syncobj) { + err = -ENOENT; + goto error; + } } job->timestamp_query.count = timestamp.count; @@ -552,6 +556,10 @@ v3d_get_cpu_reset_timestamp_params(struct drm_file *file_priv, } job->timestamp_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); + if (!job->timestamp_query.queries[i].syncobj) { + err = -ENOENT; + goto error; + } } job->timestamp_query.count = reset.count; @@ -616,6 +624,10 @@ v3d_get_cpu_copy_query_results_params(struct drm_file *file_priv, } job->timestamp_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); + if (!job->timestamp_query.queries[i].syncobj) { + err = -ENOENT; + goto error; + } } job->timestamp_query.count = copy.count; From 4ecc24a84d7e0254efd150ec23e0b89638386516 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 11 Jul 2024 14:53:34 +0100 Subject: [PATCH 0042/1052] drm/v3d: Validate passed in drm syncobj handles in the performance extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If userspace provides an unknown or invalid handle anywhere in the handle array the rest of the driver will not handle that well. Fix it by checking handle was looked up successfully or otherwise fail the extension by jumping into the existing unwind. 
Signed-off-by: Tvrtko Ursulin Fixes: bae7cb5d6800 ("drm/v3d: Create a CPU job extension for the reset performance query job") Cc: Maíra Canal Cc: Iago Toral Quiroga Cc: stable@vger.kernel.org # v6.8+ Reviewed-by: Maíra Canal Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20240711135340.84617-6-tursulin@igalia.com (cherry picked from commit a546b7e4d73c23838d7e4d2c92882b3ca902d213) Signed-off-by: Thomas Zimmermann --- drivers/gpu/drm/v3d/v3d_submit.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c index 9a3e32075ebe..4cdfabbf4964 100644 --- a/drivers/gpu/drm/v3d/v3d_submit.c +++ b/drivers/gpu/drm/v3d/v3d_submit.c @@ -710,6 +710,10 @@ v3d_get_cpu_reset_performance_params(struct drm_file *file_priv, } job->performance_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); + if (!job->performance_query.queries[i].syncobj) { + err = -ENOENT; + goto error; + } } job->performance_query.count = reset.count; job->performance_query.nperfmons = reset.nperfmons; @@ -790,6 +794,10 @@ v3d_get_cpu_copy_performance_query_params(struct drm_file *file_priv, } job->performance_query.queries[i].syncobj = drm_syncobj_find(file_priv, sync); + if (!job->performance_query.queries[i].syncobj) { + err = -ENOENT; + goto error; + } } job->performance_query.count = copy.count; job->performance_query.nperfmons = copy.nperfmons; From 12c35c5582acb0fd8f7713ffa75f450766022ff1 Mon Sep 17 00:00:00 2001 From: Jammy Huang Date: Thu, 18 Jul 2024 11:03:52 +0800 Subject: [PATCH 0043/1052] drm/ast: Fix black screen after resume Suspend will disable pcie device. Thus, resume should do full hw initialization again. Add some APIs to ast_drm_thaw() before ast_post_gpu() to fix the issue. v2: - fix function-call arguments Fixes: 5b71707dd13c ("drm/ast: Enable and unlock device access early during init") Reported-by: Cary Garrett Closes: https://lore.kernel.org/dri-devel/8ce1e1cc351153a890b65e62fed93b54ccd43f6a.camel@gmail.com/ Cc: Thomas Zimmermann Cc: Jocelyn Falempe Cc: Dave Airlie Cc: dri-devel@lists.freedesktop.org Cc: # v6.6+ Signed-off-by: Jammy Huang Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20240718030352.654155-1-jammy_huang@aspeedtech.com --- drivers/gpu/drm/ast/ast_drv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/ast/ast_drv.c b/drivers/gpu/drm/ast/ast_drv.c index f8c49ba68e78..af2368f6f00f 100644 --- a/drivers/gpu/drm/ast/ast_drv.c +++ b/drivers/gpu/drm/ast/ast_drv.c @@ -391,6 +391,11 @@ static int ast_drm_freeze(struct drm_device *dev) static int ast_drm_thaw(struct drm_device *dev) { + struct ast_device *ast = to_ast_device(dev); + + ast_enable_vga(ast->ioregs); + ast_open_key(ast->ioregs); + ast_enable_mmio(dev->dev, ast->ioregs); ast_post_gpu(dev); return drm_mode_config_helper_resume(dev); From 785280973472dbdee2c31cb740633c4b6460a8ee Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 12:54:22 +0200 Subject: [PATCH 0044/1052] sunrpc: avoid -Wformat-security warning Using a non-constant string as an sprintf-style is potentially dangerous: net/sunrpc/svc.c: In function 'param_get_pool_mode': net/sunrpc/svc.c:164:32: error: format not a string literal and no format arguments [-Werror=format-security] Use a literal "%s" format instead. 
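As a minimal illustration of the class of bug the warning points at (hypothetical buffer content, not the actual pool-mode strings): when the buffer itself is used as the format string, any '%' sequence in it is parsed as a conversion and may consume varargs that were never passed, whereas "%s" emits it verbatim:

	char str[] = "50% done\n";	/* hypothetical value */

	sysfs_emit(buf, str);		/* "% d" parsed as a conversion, reads a nonexistent vararg */
	sysfs_emit(buf, "%s", str);	/* printed as-is */

In this case the emitted pool-mode string is not expected to contain '%', so the change is mainly about robustness and keeping the build clean under -Werror=format-security.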
Fixes: 5f71f3c32553 ("sunrpc: refactor pool_mode setting code") Signed-off-by: Arnd Bergmann Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e03f14024e47..88a59cfa5583 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -161,7 +161,7 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp) str[len] = '\n'; str[len + 1] = '\0'; - return sysfs_emit(buf, str); + return sysfs_emit(buf, "%s", str); } module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode, From 66558537cb8c7485cf2d43b3880395c81218ec12 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 22 Jul 2024 11:52:26 +0000 Subject: [PATCH 0045/1052] media: uvcvideo: Fix custom control mapping probing Custom control mapping introduced a bug, where the filter function was applied to every single control. Fix it so it is only applied to the matching controls. The following dmesg errors during probe are now fixed: usb 1-5: Found UVC 1.00 device Integrated_Webcam_HD (0c45:670c) usb 1-5: Failed to query (GET_CUR) UVC control 2 on unit 2: -75 (exp. 1). usb 1-5: Failed to query (GET_CUR) UVC control 3 on unit 2: -75 (exp. 1). usb 1-5: Failed to query (GET_CUR) UVC control 6 on unit 2: -75 (exp. 1). usb 1-5: Failed to query (GET_CUR) UVC control 7 on unit 2: -75 (exp. 1). usb 1-5: Failed to query (GET_CUR) UVC control 8 on unit 2: -75 (exp. 1). usb 1-5: Failed to query (GET_CUR) UVC control 9 on unit 2: -75 (exp. 1). usb 1-5: Failed to query (GET_CUR) UVC control 10 on unit 2: -75 (exp. 1). Reported-by: Paul Menzel Closes: https://lore.kernel.org/linux-media/518cd6b4-68a8-4895-b8fc-97d4dae1ddc4@molgen.mpg.de/T/#t Cc: stable@vger.kernel.org Fixes: 8f4362a8d42b ("media: uvcvideo: Allow custom control mapping") Signed-off-by: Ricardo Ribalda Link: https://lore.kernel.org/r/20240722-fix-filter-mapping-v2-1-7ed5bb6c1185@chromium.org Tested-by: Paul Menzel Reviewed-by: Laurent Pinchart Signed-off-by: Laurent Pinchart --- drivers/media/usb/uvc/uvc_ctrl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/media/usb/uvc/uvc_ctrl.c b/drivers/media/usb/uvc/uvc_ctrl.c index 0136df5732ba..4fe26e82e3d1 100644 --- a/drivers/media/usb/uvc/uvc_ctrl.c +++ b/drivers/media/usb/uvc/uvc_ctrl.c @@ -2680,6 +2680,10 @@ static void uvc_ctrl_init_ctrl(struct uvc_video_chain *chain, for (i = 0; i < ARRAY_SIZE(uvc_ctrl_mappings); ++i) { const struct uvc_control_mapping *mapping = &uvc_ctrl_mappings[i]; + if (!uvc_entity_match_guid(ctrl->entity, mapping->entity) || + ctrl->info.selector != mapping->selector) + continue; + /* Let the device provide a custom mapping. */ if (mapping->filter_mapping) { mapping = mapping->filter_mapping(chain, ctrl); @@ -2687,9 +2691,7 @@ static void uvc_ctrl_init_ctrl(struct uvc_video_chain *chain, continue; } - if (uvc_entity_match_guid(ctrl->entity, mapping->entity) && - ctrl->info.selector == mapping->selector) - __uvc_ctrl_add_mapping(chain, ctrl, mapping); + __uvc_ctrl_add_mapping(chain, ctrl, mapping); } } From 91da337e5d506f2c065d20529d105ca40090e320 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 19 Jul 2024 14:55:53 -0400 Subject: [PATCH 0046/1052] nfsd: don't set SVC_SOCK_ANONYMOUS when creating nfsd sockets When creating nfsd sockets via the netlink interface, we do want to register with the portmapper. Don't set SVC_SOCK_ANONYMOUS. 
Reported-by: Steve Dickson Fixes: 16a471177496 ("NFSD: add listener-{set,get} netlink command") Cc: Lorenzo Bianconi Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9e0ea6fc2aa3..34eb2c2cbcde 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2069,8 +2069,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) continue; } - ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, - SVC_SOCK_ANONYMOUS, + ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, 0, get_current_cred()); /* always save the latest error */ if (ret < 0) From bacc15e010fc5a235fb2020b06a29a9961b5db82 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 11:51:07 +0200 Subject: [PATCH 0047/1052] hid: bpf: add BPF_JIT dependency The module does not do anything when the JIT is disabled, but instead causes a warning: In file included from include/linux/bpf_verifier.h:7, from drivers/hid/bpf/hid_bpf_struct_ops.c:10: drivers/hid/bpf/hid_bpf_struct_ops.c: In function 'hid_bpf_struct_ops_init': include/linux/bpf.h:1853:50: error: statement with no effect [-Werror=unused-value] 1853 | #define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; }) | ^~~~~~~~~~~~~~~~ drivers/hid/bpf/hid_bpf_struct_ops.c:305:16: note: in expansion of macro 'register_bpf_struct_ops' 305 | return register_bpf_struct_ops(&bpf_hid_bpf_ops, hid_bpf_ops); | ^~~~~~~~~~~~~~~~~~~~~~~ Add a Kconfig dependency to only allow building the HID-BPF support when a JIT is enabled. Fixes: ebc0d8093e8c ("HID: bpf: implement HID-BPF through bpf_struct_ops") Signed-off-by: Arnd Bergmann Link: https://patch.msgid.link/96a00b6f-eb81-4c67-8c4b-6b1f3f045034@app.fastmail.com Signed-off-by: Benjamin Tissoires --- drivers/hid/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/bpf/Kconfig b/drivers/hid/bpf/Kconfig index 83214bae6768..d65482e02a6c 100644 --- a/drivers/hid/bpf/Kconfig +++ b/drivers/hid/bpf/Kconfig @@ -3,7 +3,7 @@ menu "HID-BPF support" config HID_BPF bool "HID-BPF support" - depends on BPF + depends on BPF_JIT depends on BPF_SYSCALL depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS help From a9bf3efc33f1fbf88787a277f7349459283c9b95 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Thu, 18 Jul 2024 18:58:46 +0200 Subject: [PATCH 0048/1052] drm/nouveau: prime: fix refcount underflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling nouveau_bo_ref() on a nouveau_bo without initializing it (and hence the backing ttm_bo) leads to a refcount underflow. Instead of calling nouveau_bo_ref() in the unwind path of drm_gem_object_init(), clean things up manually. 
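When drm_gem_object_init() fails, the underlying ttm_bo was never fully set up, so the unwind must only undo what actually happened. Reassembled from the hunk below, the corrected error path reads:

    ret = drm_gem_object_init(dev, &nvbo->bo.base, size);
    if (ret) {
            /* nouveau_bo_ref(NULL, &nvbo) would drop a ttm_bo refcount
             * that was never taken; release the GEM base and free the
             * object by hand instead. */
            drm_gem_object_release(&nvbo->bo.base);
            kfree(nvbo);
            obj = ERR_PTR(-ENOMEM);
            goto unlock;
    }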
Fixes: ab9ccb96a6e6 ("drm/nouveau: use prime helpers") Reviewed-by: Ben Skeggs Reviewed-by: Christian König Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240718165959.3983-2-dakr@kernel.org (cherry picked from commit 1b93f3e89d03cfc576636e195466a0d728ad8de5) Signed-off-by: Danilo Krummrich --- drivers/gpu/drm/nouveau/nouveau_prime.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_prime.c b/drivers/gpu/drm/nouveau/nouveau_prime.c index b58ab595faf8..cd95446d6851 100644 --- a/drivers/gpu/drm/nouveau/nouveau_prime.c +++ b/drivers/gpu/drm/nouveau/nouveau_prime.c @@ -64,7 +64,8 @@ struct drm_gem_object *nouveau_gem_prime_import_sg_table(struct drm_device *dev, * to the caller, instead of a normal nouveau_bo ttm reference. */ ret = drm_gem_object_init(dev, &nvbo->bo.base, size); if (ret) { - nouveau_bo_ref(NULL, &nvbo); + drm_gem_object_release(&nvbo->bo.base); + kfree(nvbo); obj = ERR_PTR(-ENOMEM); goto unlock; } From da3e19ef0b3de0aa4b25595bdc214c02a04f19b8 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 16 Jul 2024 18:11:01 +0200 Subject: [PATCH 0049/1052] scsi: Revert "scsi: sd: Do not repeat the starting disk message" This reverts commit 7a6bbc2829d4ab592c7e440a6f6f5deb3cd95db4. The offending commit tried to suppress a double "Starting disk" message for some drivers, but instead started spamming the log with bogus messages every five seconds: [ 311.798956] sd 0:0:0:0: [sda] Starting disk [ 316.919103] sd 0:0:0:0: [sda] Starting disk [ 322.040775] sd 0:0:0:0: [sda] Starting disk [ 327.161140] sd 0:0:0:0: [sda] Starting disk [ 332.281352] sd 0:0:0:0: [sda] Starting disk [ 337.401878] sd 0:0:0:0: [sda] Starting disk [ 342.521527] sd 0:0:0:0: [sda] Starting disk [ 345.850401] sd 0:0:0:0: [sda] Starting disk [ 350.967132] sd 0:0:0:0: [sda] Starting disk [ 356.090454] sd 0:0:0:0: [sda] Starting disk ... on machines that do not actually stop the disk on runtime suspend (e.g. the Qualcomm sc8280xp CRD with UFS). Let's just revert for now to address the regression. Fixes: 7a6bbc2829d4 ("scsi: sd: Do not repeat the starting disk message") Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240716161101.30692-1-johan+linaro@kernel.org Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 6203915945a4..fe82baa924f8 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -4117,6 +4117,8 @@ static int sd_resume(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); + sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); + if (opal_unlock_from_suspend(sdkp->opal_dev)) { sd_printk(KERN_NOTICE, sdkp, "OPAL unlock failed\n"); return -EIO; @@ -4133,13 +4135,12 @@ static int sd_resume_common(struct device *dev, bool runtime) if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; - sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); - if (!sd_do_start_stop(sdkp->device, runtime)) { sdkp->suspended = false; return 0; } + sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); ret = sd_start_stop_device(sdkp, 1); if (!ret) { sd_resume(dev); From ac6efb12ca64156f4a94e964acdb96ee7d59630d Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 18 Jul 2024 22:36:59 +0530 Subject: [PATCH 0050/1052] scsi: ufs: core: Do not set link to OFF state while waking up from hibernation UFS link is just put into hibern8 state during the 'freeze' process of the hibernation. Afterwards, the system may get powered down. But that doesn't matter during wakeup. Because during wakeup from hibernation, UFS link is again put into hibern8 state by the restore kernel and then the control is handed over to the to image kernel. So in both the places, UFS link is never turned OFF. But ufshcd_system_restore() just assumes that the link will be in OFF state and sets the link state accordingly. And this breaks hibernation wakeup: [ 2445.371335] phy phy-1d87000.phy.3: phy_power_on was called before phy_init [ 2445.427883] ufshcd-qcom 1d84000.ufshc: Controller enable failed [ 2445.427890] ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 [ 2445.427906] ufs_device_wlun 0:0:0:49488: ufshcd_wl_resume failed: -5 [ 2445.427918] ufs_device_wlun 0:0:0:49488: PM: dpm_run_callback(): scsi_bus_restore returns -5 [ 2445.427973] ufs_device_wlun 0:0:0:49488: PM: failed to restore async: error -5 So fix the issue by removing the code that sets the link to OFF state. Cc: Anjana Hari Cc: stable@vger.kernel.org # 6.3 Fixes: 88441a8d355d ("scsi: ufs: core: Add hibernation callbacks") Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240718170659.201647-1-manivannan.sadhasivam@linaro.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index e2bc3553f7df..526f9cb11a81 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10277,9 +10277,6 @@ int ufshcd_system_restore(struct device *dev) */ ufshcd_readl(hba, REG_UTP_TASK_REQ_LIST_BASE_H); - /* Resuming from hibernate, assume that link was OFF */ - ufshcd_set_link_off(hba); - return 0; } From 1abc900ddda8ad2ef739fedf498d415655b6c3b8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 19 Jul 2024 16:39:11 +0900 Subject: [PATCH 0051/1052] scsi: mpi3mr: Avoid IOMMU page faults on REPORT ZONES Some firmware versions of the 9600 series SAS HBA byte-swap the REPORT ZONES command reply buffer from ATA-ZAC devices by directly accessing the buffer in the host memory. 
This does not respect the default command DMA direction and causes IOMMU page faults on architectures with an IOMMU enforcing write-only mappings for DMA_FROM_DEVICE DMA direction (e.g. AMD hosts), leading to the device capacity to be dropped to 0: scsi 18:0:58:0: Direct-Access-ZBC ATA WDC WSH722626AL W930 PQ: 0 ANSI: 7 scsi 18:0:58:0: Power-on or device reset occurred sd 18:0:58:0: Attached scsi generic sg9 type 20 sd 18:0:58:0: [sdj] Host-managed zoned block device mpi3mr 0000:c1:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0001 address=0xfec0c400 flags=0x0050] mpi3mr 0000:c1:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0001 address=0xfec0c500 flags=0x0050] sd 18:0:58:0: [sdj] REPORT ZONES start lba 0 failed sd 18:0:58:0: [sdj] REPORT ZONES: Result: hostbyte=DID_SOFT_ERROR driverbyte=DRIVER_OK sd 18:0:58:0: [sdj] 0 4096-byte logical blocks: (0 B/0 B) sd 18:0:58:0: [sdj] Write Protect is off sd 18:0:58:0: [sdj] Mode Sense: 6b 00 10 08 sd 18:0:58:0: [sdj] Write cache: enabled, read cache: enabled, supports DPO and FUA sd 18:0:58:0: [sdj] Attached SCSI disk Avoid this issue by always mapping the buffer of REPORT ZONES commands using DMA_BIDIRECTIONAL, that is, using a read-write IOMMU mapping. Suggested-by: Christoph Hellwig Fixes: 023ab2a9b4ed ("scsi: mpi3mr: Add support for queue command processing") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20240719073913.179559-2-dlemoal@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_os.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index 69b14918de59..ca8f132e03ae 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -3575,6 +3575,17 @@ static int mpi3mr_prepare_sg_scmd(struct mpi3mr_ioc *mrioc, scmd->sc_data_direction); priv->meta_sg_valid = 1; /* To unmap meta sg DMA */ } else { + /* + * Some firmware versions byte-swap the REPORT ZONES command + * reply from ATA-ZAC devices by directly accessing in the host + * buffer. This does not respect the default command DMA + * direction and causes IOMMU page faults on some architectures + * with an IOMMU enforcing write mappings (e.g. AMD hosts). + * Avoid such issue by making the REPORT ZONES buffer mapping + * bi-directional. + */ + if (scmd->cmnd[0] == ZBC_IN && scmd->cmnd[1] == ZI_REPORT_ZONES) + scmd->sc_data_direction = DMA_BIDIRECTIONAL; sg_scmd = scsi_sglist(scmd); sges_left = scsi_dma_map(scmd); } From 82dbb57ac8d06dfe8227ba9ab11a49de2b475ae5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 19 Jul 2024 16:39:12 +0900 Subject: [PATCH 0052/1052] scsi: mpt3sas: Avoid IOMMU page faults on REPORT ZONES Some firmware versions of the 9600 series SAS HBA byte-swap the REPORT ZONES command reply buffer from ATA-ZAC devices by directly accessing the buffer in the host memory. This does not respect the default command DMA direction and causes IOMMU page faults on architectures with an IOMMU enforcing write-only mappings for DMA_FROM_DEVICE DMA driection (e.g. AMD hosts). 
scsi 18:0:0:0: Direct-Access-ZBC ATA WDC WSH722020AL W870 PQ: 0 ANSI: 6 scsi 18:0:0:0: SATA: handle(0x0027), sas_addr(0x300062b2083e7c40), phy(0), device_name(0x5000cca29dc35e11) scsi 18:0:0:0: enclosure logical id (0x300062b208097c40), slot(0) scsi 18:0:0:0: enclosure level(0x0000), connector name( C0.0) scsi 18:0:0:0: atapi(n), ncq(y), asyn_notify(n), smart(y), fua(y), sw_preserve(y) scsi 18:0:0:0: qdepth(32), tagged(1), scsi_level(7), cmd_que(1) sd 18:0:0:0: Attached scsi generic sg2 type 20 sd 18:0:0:0: [sdc] Host-managed zoned block device mpt3sas 0000:41:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0021 address=0xfff9b200 flags=0x0050] mpt3sas 0000:41:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0021 address=0xfff9b300 flags=0x0050] mpt3sas_cm0: mpt3sas_ctl_pre_reset_handler: Releasing the trace buffer due to adapter reset. mpt3sas_cm0 fault info from func: mpt3sas_base_make_ioc_ready mpt3sas_cm0: fault_state(0x2666)! mpt3sas_cm0: sending diag reset !! mpt3sas_cm0: diag reset: SUCCESS sd 18:0:0:0: [sdc] REPORT ZONES start lba 0 failed sd 18:0:0:0: [sdc] REPORT ZONES: Result: hostbyte=DID_RESET driverbyte=DRIVER_OK sd 18:0:0:0: [sdc] 0 4096-byte logical blocks: (0 B/0 B) Avoid such issue by always mapping the buffer of REPORT ZONES commands using DMA_BIDIRECTIONAL (read+write IOMMU mapping). This is done by introducing the helper function _base_scsi_dma_map() and using this helper in _base_build_sg_scmd() and _base_build_sg_scmd_ieee() instead of calling directly scsi_dma_map(). Fixes: 471ef9d4e498 ("mpt3sas: Build MPI SGL LIST on GEN2 HBAs and IEEE SGL LIST on GEN3 HBAs") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20240719073913.179559-3-dlemoal@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 1092497563b2..c8fb965a6bf0 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -2671,6 +2671,22 @@ _base_build_zero_len_sge_ieee(struct MPT3SAS_ADAPTER *ioc, void *paddr) _base_add_sg_single_ieee(paddr, sgl_flags, 0, 0, -1); } +static inline int _base_scsi_dma_map(struct scsi_cmnd *cmd) +{ + /* + * Some firmware versions byte-swap the REPORT ZONES command reply from + * ATA-ZAC devices by directly accessing in the host buffer. This does + * not respect the default command DMA direction and causes IOMMU page + * faults on some architectures with an IOMMU enforcing write mappings + * (e.g. AMD hosts). Avoid such issue by making the report zones buffer + * mapping bi-directional. + */ + if (cmd->cmnd[0] == ZBC_IN && cmd->cmnd[1] == ZI_REPORT_ZONES) + cmd->sc_data_direction = DMA_BIDIRECTIONAL; + + return scsi_dma_map(cmd); +} + /** * _base_build_sg_scmd - main sg creation routine * pcie_device is unused here! 
@@ -2717,7 +2733,7 @@ _base_build_sg_scmd(struct MPT3SAS_ADAPTER *ioc, sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; sg_scmd = scsi_sglist(scmd); - sges_left = scsi_dma_map(scmd); + sges_left = _base_scsi_dma_map(scmd); if (sges_left < 0) return -ENOMEM; @@ -2861,7 +2877,7 @@ _base_build_sg_scmd_ieee(struct MPT3SAS_ADAPTER *ioc, } sg_scmd = scsi_sglist(scmd); - sges_left = scsi_dma_map(scmd); + sges_left = _base_scsi_dma_map(scmd); if (sges_left < 0) return -ENOMEM; From 47398f49dab8326bb652fa2d7a51ae5ec78775b5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 21 Jul 2024 11:38:40 -0700 Subject: [PATCH 0053/1052] scsi: ufs: exynos: Don't resume FMP when crypto support is disabled If exynos_ufs_fmp_init() did not enable FMP support, then exynos_ufs_fmp_resume() should not execute the FMP-related SMC calls. Fixes: c96499fcb403 ("scsi: ufs: exynos: Add support for Flash Memory Protector (FMP)") Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240721183840.209284-1-ebiggers@kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-exynos.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ufs/host/ufs-exynos.c b/drivers/ufs/host/ufs-exynos.c index 16ad3528d80b..9ec318ef52bf 100644 --- a/drivers/ufs/host/ufs-exynos.c +++ b/drivers/ufs/host/ufs-exynos.c @@ -1293,6 +1293,9 @@ static void exynos_ufs_fmp_resume(struct ufs_hba *hba) { struct arm_smccc_res res; + if (!(hba->caps & UFSHCD_CAP_CRYPTO)) + return; + arm_smccc_smc(SMC_CMD_FMP_SECURITY, 0, SMU_EMBEDDED, CFG_DESCTYPE_3, 0, 0, 0, 0, &res); if (res.a0) From 8031b001da700474c11d28629581480b12a0d8d4 Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Thu, 18 Jul 2024 16:46:16 +0530 Subject: [PATCH 0054/1052] HID: amd_sfh: Move sensor discovery before HID device initialization Sensors discovery is independent of HID device initialization. If sensor discovery fails after HID initialization, then the HID device needs to be deinitialized. Therefore, sensors discovery should be moved before HID device initialization. 
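Condensed, the reordered probe flow after this change looks roughly as follows (details elided; the exact hunk is below):

    /* 1. start the sensors and record their status */
    for (i = 0; i < cl_data->num_hid_devices; i++) {
            ...
            if (cl_data->sensor_sts[i] == SENSOR_ENABLED)
                    cl_data->is_any_sensor_enabled = true;
    }

    /* 2. give up before any HID device exists */
    if (!cl_data->is_any_sensor_enabled ||
        (mp2_ops->discovery_status &&
         mp2_ops->discovery_status(privdata) == 0)) {
            rc = -EOPNOTSUPP;
            goto cleanup;
    }

    /* 3. only now create the HID devices */
    for (i = 0; i < cl_data->num_hid_devices; i++)
            if (cl_data->sensor_sts[i] == SENSOR_ENABLED)
                    rc = amdtp_hid_probe(i, cl_data);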
Fixes: 7bcfdab3f0c6 ("HID: amd_sfh: if no sensors are enabled, clean up") Tested-by: Aurinko Signed-off-by: Basavaraj Natikar Link: https://patch.msgid.link/20240718111616.3012155-1-Basavaraj.Natikar@amd.com Signed-off-by: Benjamin Tissoires --- drivers/hid/amd-sfh-hid/amd_sfh_client.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_client.c b/drivers/hid/amd-sfh-hid/amd_sfh_client.c index bdb578e0899f..4b59687ff5d8 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_client.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_client.c @@ -288,12 +288,22 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) mp2_ops->start(privdata, info); cl_data->sensor_sts[i] = amd_sfh_wait_for_response (privdata, cl_data->sensor_idx[i], SENSOR_ENABLED); + + if (cl_data->sensor_sts[i] == SENSOR_ENABLED) + cl_data->is_any_sensor_enabled = true; + } + + if (!cl_data->is_any_sensor_enabled || + (mp2_ops->discovery_status && mp2_ops->discovery_status(privdata) == 0)) { + dev_warn(dev, "Failed to discover, sensors not enabled is %d\n", + cl_data->is_any_sensor_enabled); + rc = -EOPNOTSUPP; + goto cleanup; } for (i = 0; i < cl_data->num_hid_devices; i++) { cl_data->cur_hid_dev = i; if (cl_data->sensor_sts[i] == SENSOR_ENABLED) { - cl_data->is_any_sensor_enabled = true; rc = amdtp_hid_probe(i, cl_data); if (rc) goto cleanup; @@ -305,12 +315,6 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) cl_data->sensor_sts[i]); } - if (!cl_data->is_any_sensor_enabled || - (mp2_ops->discovery_status && mp2_ops->discovery_status(privdata) == 0)) { - dev_warn(dev, "Failed to discover, sensors not enabled is %d\n", cl_data->is_any_sensor_enabled); - rc = -EOPNOTSUPP; - goto cleanup; - } schedule_delayed_work(&cl_data->work_buffer, msecs_to_jiffies(AMD_SFH_IDLE_LOOP)); return 0; From ff9fbcafbaf13346c742c0d672a22f5ac20b9d92 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 23 Jul 2024 18:21:51 +0200 Subject: [PATCH 0055/1052] selftests/hid: fix bpf_wq new API Since commit f56f4d541eab ("bpf: helpers: fix bpf_wq_set_callback_impl signature"), the API for bpf_wq changed a bit. We need to update the selftests/hid code to reflect that or the bpf program will not load. 
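For reference, the callback's last parameter is now an opaque pointer rather than a typed struct bpf_wq *, so the selftest only has to match the new prototype:

    /* before: int (*cb)(void *map, int *key, struct bpf_wq *wq)
     * after:  int (*cb)(void *map, int *key, void *wq)
     */
    static int wq_cb_sleepable(void *map, int *key, void *work)
    {
            ...
    }

    extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
            int (callback_fn)(void *map, int *key, void *wq),
            unsigned int flags__k, void *aux__ign) __ksym;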
Link: https://patch.msgid.link/20240723-fix-6-11-bpf-v1-1-b9d770346784@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/progs/hid.c | 2 +- tools/testing/selftests/hid/progs/hid_bpf_helpers.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c index ee9bbbcf751b..5ecc845ef792 100644 --- a/tools/testing/selftests/hid/progs/hid.c +++ b/tools/testing/selftests/hid/progs/hid.c @@ -455,7 +455,7 @@ struct { __type(value, struct elem); } hmap SEC(".maps"); -static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +static int wq_cb_sleepable(void *map, int *key, void *work) { __u8 buf[9] = {2, 3, 4, 5, 6, 7, 8, 9, 10}; struct hid_bpf_ctx *hid_ctx; diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index cfe37f491906..e5db897586bb 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -114,7 +114,7 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + int (callback_fn)(void *map, int *key, void *wq), unsigned int flags__k, void *aux__ign) __ksym; #define bpf_wq_set_callback(timer, cb, flags) \ bpf_wq_set_callback_impl(timer, cb, flags, NULL) From f64c1a4593391c57accf32693a14ef45f8162b5c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 23 Jul 2024 18:21:52 +0200 Subject: [PATCH 0056/1052] selftests/hid: disable struct_ops auto-attach Since commit 08ac454e258e ("libbpf: Auto-attach struct_ops BPF maps in BPF skeleton"), libbpf automatically calls bpf_map__attach_struct_ops() on every struct_ops it sees in the bpf object. The problem is that our test bpf object has many of them but only one should be manually loaded at a time, or we end up locking the syscall. 
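The workaround is to turn off auto-attach for every struct_ops map in the skeleton before loading it, and keep attaching only the map under test by hand; roughly (the attached map name is illustrative, the real change is in the diff below):

    struct bpf_map *iter_map;

    bpf_object__for_each_map(iter_map, *self->skel->skeleton->obj)
            bpf_map__set_autoattach(iter_map, false);

    err = hid__load(self->skel);
    /* ... later, attach only the struct_ops under test (name illustrative) */
    link = bpf_map__attach_struct_ops(self->skel->maps.some_test_map);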
Link: https://patch.msgid.link/20240723-fix-6-11-bpf-v1-2-b9d770346784@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/hid_bpf.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index dc0408a831d0..9c935fd0dffc 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -532,6 +532,7 @@ static void load_programs(const struct test_program programs[], FIXTURE_DATA(hid_bpf) * self, const FIXTURE_VARIANT(hid_bpf) * variant) { + struct bpf_map *iter_map; int err = -EINVAL; ASSERT_LE(progs_count, ARRAY_SIZE(self->hid_links)) @@ -564,6 +565,13 @@ static void load_programs(const struct test_program programs[], *ops_hid_id = self->hid_id; } + /* we disable the auto-attach feature of all maps because we + * only want the tested one to be manually attached in the next + * call to bpf_map__attach_struct_ops() + */ + bpf_object__for_each_map(iter_map, *self->skel->skeleton->obj) + bpf_map__set_autoattach(iter_map, false); + err = hid__load(self->skel); ASSERT_OK(err) TH_LOG("hid_skel_load failed: %d", err); From acd34cfc48b3dd46e5e4c4bdc99cc0c15568bac0 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 23 Jul 2024 18:21:53 +0200 Subject: [PATCH 0057/1052] HID: bpf: prevent the same struct_ops to be attached more than once If the struct_ops is already attached, we should bail out or we will end up in various locks and pointer issues while unregistering. Link: https://patch.msgid.link/20240723-fix-6-11-bpf-v1-3-b9d770346784@kernel.org Signed-off-by: Benjamin Tissoires --- drivers/hid/bpf/hid_bpf_struct_ops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hid/bpf/hid_bpf_struct_ops.c b/drivers/hid/bpf/hid_bpf_struct_ops.c index f59cce6e437f..cd696c59ba0f 100644 --- a/drivers/hid/bpf/hid_bpf_struct_ops.c +++ b/drivers/hid/bpf/hid_bpf_struct_ops.c @@ -183,6 +183,10 @@ static int hid_bpf_reg(void *kdata, struct bpf_link *link) struct hid_device *hdev; int count, err = 0; + /* prevent multiple attach of the same struct_ops */ + if (ops->hdev) + return -EINVAL; + hdev = hid_get_device(ops->hid_id); if (IS_ERR(hdev)) return PTR_ERR(hdev); @@ -248,6 +252,7 @@ static void hid_bpf_unreg(void *kdata, struct bpf_link *link) list_del_rcu(&ops->list); synchronize_srcu(&hdev->bpf.srcu); + ops->hdev = NULL; reconnect = hdev->bpf.rdesc_ops == ops; if (reconnect) From facdbdfe0e6202d74758387ae9189c39f7b4b16c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 23 Jul 2024 18:21:54 +0200 Subject: [PATCH 0058/1052] selftests/hid: add test for attaching multiple time the same struct_ops Turns out that we would en up in a bad state if we attempt to attach twice the same HID-BPF struct_ops, so have a test for it. 
Link: https://patch.msgid.link/20240723-fix-6-11-bpf-v1-4-b9d770346784@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/hid_bpf.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index 9c935fd0dffc..75b7b4ef6cfa 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -694,6 +694,24 @@ TEST_F(hid_bpf, subprog_raw_event) ASSERT_EQ(buf[2], 52); } +/* + * Attach hid_first_event to the given uhid device, + * attempt at re-attaching it, we should not lock and + * return an invalid struct bpf_link + */ +TEST_F(hid_bpf, multiple_attach) +{ + const struct test_program progs[] = { + { .name = "hid_first_event" }, + }; + struct bpf_link *link; + + LOAD_PROGRAMS(progs); + + link = bpf_map__attach_struct_ops(self->skel->maps.first_event); + ASSERT_NULL(link) TH_LOG("unexpected return value when re-attaching the struct_ops"); +} + /* * Ensures that we can attach/detach programs */ From 445d336cd15860f1efb441e6d694f829fbf679eb Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sun, 14 Jul 2024 23:50:09 +0300 Subject: [PATCH 0059/1052] drm/virtio: Fix type of dma-fence context variable Type of DMA fence context is u64. Fence-waiting code uses u32 for the context variable, fix it. Fixes: e4812ab8e6b1 ("drm/virtio: Refactor and optimize job submission code path") Cc: # v6.4+ Signed-off-by: Dmitry Osipenko Reviewed-by: Rob Clark Link: https://patchwork.freedesktop.org/patch/msgid/20240714205009.3408298-1-dmitry.osipenko@collabora.com --- drivers/gpu/drm/virtio/virtgpu_submit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/virtio/virtgpu_submit.c b/drivers/gpu/drm/virtio/virtgpu_submit.c index 1c7c7f61a222..7d34cf83f5f2 100644 --- a/drivers/gpu/drm/virtio/virtgpu_submit.c +++ b/drivers/gpu/drm/virtio/virtgpu_submit.c @@ -48,7 +48,7 @@ struct virtio_gpu_submit { static int virtio_gpu_do_fence_wait(struct virtio_gpu_submit *submit, struct dma_fence *in_fence) { - u32 context = submit->fence_ctx + submit->ring_idx; + u64 context = submit->fence_ctx + submit->ring_idx; if (dma_fence_match_context(in_fence, context)) return 0; From e58337100721f3cc0c7424a18730e4f39844934f Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Mon, 22 Jul 2024 14:41:13 -0400 Subject: [PATCH 0060/1052] drm/vmwgfx: Fix a deadlock in dma buf fence polling Introduce a version of the fence ops that on release doesn't remove the fence from the pending list, and thus doesn't require a lock to fix poll->fence wait->fence unref deadlocks. vmwgfx overwrites the wait callback to iterate over the list of all fences and update their status, to do that it holds a lock to prevent the list modifcations from other threads. The fence destroy callback both deletes the fence and removes it from the list of pending fences, for which it holds a lock. dma buf polling cb unrefs a fence after it's been signaled: so the poll calls the wait, which signals the fences, which are being destroyed. The destruction tries to acquire the lock on the pending fences list which it can never get because it's held by the wait from which it was called. Old bug, but not a lot of userspace apps were using dma-buf polling interfaces. Fix those, in particular this fixes KDE stalls/deadlock. 
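Spelled out, the cycle is: poll holds a fence reference, the wait callback takes fman->lock to walk and signal the pending fences, the dma-buf poll callback drops its reference from inside that signaling path, and the fence release then tries to take the very lock the waiter still holds in order to unlink the fence. With this change the release callback only touches the lock when the fence is in fact still on the pending list; a fence that has just been signaled has already been unlinked by the signaling code, so its release no longer contends for the waiter's lock. Reassembled from the hunk below:

    static void vmw_fence_obj_destroy(struct dma_fence *f)
    {
            struct vmw_fence_obj *fence =
                    container_of(f, struct vmw_fence_obj, base);
            struct vmw_fence_manager *fman = fman_from_fence(fence);

            if (!list_empty(&fence->head)) {
                    spin_lock(&fman->lock);
                    list_del_init(&fence->head);
                    spin_unlock(&fman->lock);
            }
            fence->destroy(fence);
    }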
Signed-off-by: Zack Rusin Fixes: 2298e804e96e ("drm/vmwgfx: rework to new fence interface, v2") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v6.2+ Reviewed-by: Maaz Mombasawala Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240722184313.181318-2-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c index 5efc6a766f64..588d50ababf6 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c @@ -32,7 +32,6 @@ #define VMW_FENCE_WRAP (1 << 31) struct vmw_fence_manager { - int num_fence_objects; struct vmw_private *dev_priv; spinlock_t lock; struct list_head fence_list; @@ -124,13 +123,13 @@ static void vmw_fence_obj_destroy(struct dma_fence *f) { struct vmw_fence_obj *fence = container_of(f, struct vmw_fence_obj, base); - struct vmw_fence_manager *fman = fman_from_fence(fence); - spin_lock(&fman->lock); - list_del_init(&fence->head); - --fman->num_fence_objects; - spin_unlock(&fman->lock); + if (!list_empty(&fence->head)) { + spin_lock(&fman->lock); + list_del_init(&fence->head); + spin_unlock(&fman->lock); + } fence->destroy(fence); } @@ -257,7 +256,6 @@ static const struct dma_fence_ops vmw_fence_ops = { .release = vmw_fence_obj_destroy, }; - /* * Execute signal actions on fences recently signaled. * This is done from a workqueue so we don't have to execute @@ -355,7 +353,6 @@ static int vmw_fence_obj_init(struct vmw_fence_manager *fman, goto out_unlock; } list_add_tail(&fence->head, &fman->fence_list); - ++fman->num_fence_objects; out_unlock: spin_unlock(&fman->lock); @@ -403,7 +400,7 @@ static bool vmw_fence_goal_new_locked(struct vmw_fence_manager *fman, u32 passed_seqno) { u32 goal_seqno; - struct vmw_fence_obj *fence; + struct vmw_fence_obj *fence, *next_fence; if (likely(!fman->seqno_valid)) return false; @@ -413,7 +410,7 @@ static bool vmw_fence_goal_new_locked(struct vmw_fence_manager *fman, return false; fman->seqno_valid = false; - list_for_each_entry(fence, &fman->fence_list, head) { + list_for_each_entry_safe(fence, next_fence, &fman->fence_list, head) { if (!list_empty(&fence->seq_passed_actions)) { fman->seqno_valid = true; vmw_fence_goal_write(fman->dev_priv, From 09f34a00272d2311f6e5d64ed8ad824ef78f7487 Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Mon, 22 Jul 2024 14:41:14 -0400 Subject: [PATCH 0061/1052] drm/vmwgfx: Make sure the screen surface is ref counted Fix races issues in virtual crc generation by making sure the surface the code uses for crc computation is properly ref counted. Crc generation was trying to be too clever by allowing the surfaces to go in and out of scope, with the hope of always having some kind of screen present. That's not always the code, in particular during atomic disable, so to make sure the surface, when present, is not being actively destroyed at the same time, hold a reference to it. 
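Concretely, the worker now takes its own reference under crc_state_lock and drops it once the CRC has been computed; condensed from the hunk below:

    spin_lock_irq(&du->vkms.crc_state_lock);
    surf = vmw_surface_reference(du->vkms.surface);
    spin_unlock_irq(&du->vkms.crc_state_lock);

    if (surf) {
            if (vmw_surface_sync(vmw, surf)) {
                    drm_warn(crtc->dev,
                             "CRC worker wasn't able to sync the crc surface!\n");
                    return;
            }
            compute_crc(crtc, surf, &crc32);
            vmw_surface_unreference(&surf);
    }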
Signed-off-by: Zack Rusin Fixes: 7b0062036c3b ("drm/vmwgfx: Implement virtual crc generation") Cc: Zack Rusin Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Reviewed-by: Maaz Mombasawala Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240722184313.181318-3-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c | 40 +++++++++++++++------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c index 7e93a45948f7..ac002048d8e5 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c @@ -76,7 +76,7 @@ vmw_surface_sync(struct vmw_private *vmw, return ret; } -static int +static void compute_crc(struct drm_crtc *crtc, struct vmw_surface *surf, u32 *crc) @@ -102,8 +102,6 @@ compute_crc(struct drm_crtc *crtc, } vmw_bo_unmap(bo); - - return 0; } static void @@ -117,7 +115,6 @@ crc_generate_worker(struct work_struct *work) u64 frame_start, frame_end; u32 crc32 = 0; struct vmw_surface *surf = 0; - int ret; spin_lock_irq(&du->vkms.crc_state_lock); crc_pending = du->vkms.crc_pending; @@ -131,22 +128,24 @@ crc_generate_worker(struct work_struct *work) return; spin_lock_irq(&du->vkms.crc_state_lock); - surf = du->vkms.surface; + surf = vmw_surface_reference(du->vkms.surface); spin_unlock_irq(&du->vkms.crc_state_lock); - if (vmw_surface_sync(vmw, surf)) { - drm_warn(crtc->dev, "CRC worker wasn't able to sync the crc surface!\n"); - return; - } + if (surf) { + if (vmw_surface_sync(vmw, surf)) { + drm_warn( + crtc->dev, + "CRC worker wasn't able to sync the crc surface!\n"); + return; + } - ret = compute_crc(crtc, surf, &crc32); - if (ret) - return; + compute_crc(crtc, surf, &crc32); + vmw_surface_unreference(&surf); + } spin_lock_irq(&du->vkms.crc_state_lock); frame_start = du->vkms.frame_start; frame_end = du->vkms.frame_end; - crc_pending = du->vkms.crc_pending; du->vkms.frame_start = 0; du->vkms.frame_end = 0; du->vkms.crc_pending = false; @@ -165,7 +164,7 @@ vmw_vkms_vblank_simulate(struct hrtimer *timer) struct vmw_display_unit *du = container_of(timer, struct vmw_display_unit, vkms.timer); struct drm_crtc *crtc = &du->crtc; struct vmw_private *vmw = vmw_priv(crtc->dev); - struct vmw_surface *surf = NULL; + bool has_surface = false; u64 ret_overrun; bool locked, ret; @@ -180,10 +179,10 @@ vmw_vkms_vblank_simulate(struct hrtimer *timer) WARN_ON(!ret); if (!locked) return HRTIMER_RESTART; - surf = du->vkms.surface; + has_surface = du->vkms.surface != NULL; vmw_vkms_unlock(crtc); - if (du->vkms.crc_enabled && surf) { + if (du->vkms.crc_enabled && has_surface) { u64 frame = drm_crtc_accurate_vblank_count(crtc); spin_lock(&du->vkms.crc_state_lock); @@ -337,6 +336,8 @@ vmw_vkms_crtc_cleanup(struct drm_crtc *crtc) { struct vmw_display_unit *du = vmw_crtc_to_du(crtc); + if (du->vkms.surface) + vmw_surface_unreference(&du->vkms.surface); WARN_ON(work_pending(&du->vkms.crc_generator_work)); hrtimer_cancel(&du->vkms.timer); } @@ -498,9 +499,12 @@ vmw_vkms_set_crc_surface(struct drm_crtc *crtc, struct vmw_display_unit *du = vmw_crtc_to_du(crtc); struct vmw_private *vmw = vmw_priv(crtc->dev); - if (vmw->vkms_enabled) { + if (vmw->vkms_enabled && du->vkms.surface != surf) { WARN_ON(atomic_read(&du->vkms.atomic_lock) != VMW_VKMS_LOCK_MODESET); - du->vkms.surface = surf; + if (du->vkms.surface) + vmw_surface_unreference(&du->vkms.surface); + if (surf) + du->vkms.surface = vmw_surface_reference(surf); } } From 
d6667f0ddf46c671d379cd5fe66ce0a54d2a743a Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Mon, 22 Jul 2024 14:41:15 -0400 Subject: [PATCH 0062/1052] drm/vmwgfx: Fix handling of dumb buffers Dumb buffers can be used in kms but also through prime with gallium's resource_from_handle. In the second case the dumb buffers can be rendered by the GPU where with the regular DRM kms interfaces they are mapped and written to by the CPU. Because the same buffer can be written to by the GPU and CPU vmwgfx needs to use vmw_surface (object which properly tracks dirty state of the guest and gpu memory) instead of vmw_bo (which is just guest side memory). Furthermore the dumb buffer handles are expected to be gem objects by a lot of userspace. Make vmwgfx accept gem handles in prime and kms but internally switch to vmw_surface's to properly track the dirty state of the objects between the GPU and CPU. Fixes new kwin and kde on wayland. Signed-off-by: Zack Rusin Fixes: b32233acceff ("drm/vmwgfx: Fix prime import/export") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v6.9+ Reviewed-by: Maaz Mombasawala Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240722184313.181318-4-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmw_surface_cache.h | 10 +- drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 127 +++--- drivers/gpu/drm/vmwgfx/vmwgfx_bo.h | 15 +- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 40 +- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 504 +++++++++------------ drivers/gpu/drm/vmwgfx/vmwgfx_kms.h | 17 +- drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c | 14 +- drivers/gpu/drm/vmwgfx/vmwgfx_prime.c | 32 +- drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 27 +- drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c | 33 +- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 145 +++--- drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 280 +++++++++++- 12 files changed, 741 insertions(+), 503 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmw_surface_cache.h b/drivers/gpu/drm/vmwgfx/vmw_surface_cache.h index b0d87c5f58d8..1ac3cb151b11 100644 --- a/drivers/gpu/drm/vmwgfx/vmw_surface_cache.h +++ b/drivers/gpu/drm/vmwgfx/vmw_surface_cache.h @@ -1,6 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ /********************************************************** - * Copyright 2021 VMware, Inc. - * SPDX-License-Identifier: GPL-2.0 OR MIT + * + * Copyright (c) 2021-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -31,6 +33,10 @@ #include +#define SVGA3D_FLAGS_UPPER_32(svga3d_flags) ((svga3d_flags) >> 32) +#define SVGA3D_FLAGS_LOWER_32(svga3d_flags) \ + ((svga3d_flags) & ((uint64_t)U32_MAX)) + static inline u32 clamped_umul32(u32 a, u32 b) { uint64_t tmp = (uint64_t) a*b; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c index 00144632c600..f42ebc4a7c22 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR MIT /************************************************************************** * - * Copyright © 2011-2023 VMware, Inc., Palo Alto, CA., USA - * All Rights Reserved. + * Copyright (c) 2011-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -28,15 +28,39 @@ #include "vmwgfx_bo.h" #include "vmwgfx_drv.h" - +#include "vmwgfx_resource_priv.h" #include static void vmw_bo_release(struct vmw_bo *vbo) { + struct vmw_resource *res; + WARN_ON(vbo->tbo.base.funcs && kref_read(&vbo->tbo.base.refcount) != 0); vmw_bo_unmap(vbo); + + xa_destroy(&vbo->detached_resources); + WARN_ON(vbo->is_dumb && !vbo->dumb_surface); + if (vbo->is_dumb && vbo->dumb_surface) { + res = &vbo->dumb_surface->res; + WARN_ON(vbo != res->guest_memory_bo); + WARN_ON(!res->guest_memory_bo); + if (res->guest_memory_bo) { + /* Reserve and switch the backing mob. */ + mutex_lock(&res->dev_priv->cmdbuf_mutex); + (void)vmw_resource_reserve(res, false, true); + vmw_resource_mob_detach(res); + if (res->coherent) + vmw_bo_dirty_release(res->guest_memory_bo); + res->guest_memory_bo = NULL; + res->guest_memory_offset = 0; + vmw_resource_unreserve(res, false, false, false, NULL, + 0); + mutex_unlock(&res->dev_priv->cmdbuf_mutex); + } + vmw_surface_unreference(&vbo->dumb_surface); + } drm_gem_object_release(&vbo->tbo.base); } @@ -325,6 +349,11 @@ void vmw_bo_pin_reserved(struct vmw_bo *vbo, bool pin) * */ void *vmw_bo_map_and_cache(struct vmw_bo *vbo) +{ + return vmw_bo_map_and_cache_size(vbo, vbo->tbo.base.size); +} + +void *vmw_bo_map_and_cache_size(struct vmw_bo *vbo, size_t size) { struct ttm_buffer_object *bo = &vbo->tbo; bool not_used; @@ -335,9 +364,10 @@ void *vmw_bo_map_and_cache(struct vmw_bo *vbo) if (virtual) return virtual; - ret = ttm_bo_kmap(bo, 0, PFN_UP(bo->base.size), &vbo->map); + ret = ttm_bo_kmap(bo, 0, PFN_UP(size), &vbo->map); if (ret) - DRM_ERROR("Buffer object map failed: %d.\n", ret); + DRM_ERROR("Buffer object map failed: %d (size: bo = %zu, map = %zu).\n", + ret, bo->base.size, size); return ttm_kmap_obj_virtual(&vbo->map, ¬_used); } @@ -390,6 +420,7 @@ static int vmw_bo_init(struct vmw_private *dev_priv, BUILD_BUG_ON(TTM_MAX_BO_PRIORITY <= 3); vmw_bo->tbo.priority = 3; vmw_bo->res_tree = RB_ROOT; + xa_init(&vmw_bo->detached_resources); params->size = ALIGN(params->size, PAGE_SIZE); drm_gem_private_object_init(vdev, &vmw_bo->tbo.base, params->size); @@ -654,52 +685,6 @@ void vmw_bo_fence_single(struct ttm_buffer_object *bo, dma_fence_put(&fence->base); } - -/** - * vmw_dumb_create - Create a dumb kms buffer - * - * @file_priv: Pointer to a struct drm_file identifying the caller. - * @dev: Pointer to the drm device. - * @args: Pointer to a struct drm_mode_create_dumb structure - * Return: Zero on success, negative error code on failure. - * - * This is a driver callback for the core drm create_dumb functionality. - * Note that this is very similar to the vmw_bo_alloc ioctl, except - * that the arguments have a different format. - */ -int vmw_dumb_create(struct drm_file *file_priv, - struct drm_device *dev, - struct drm_mode_create_dumb *args) -{ - struct vmw_private *dev_priv = vmw_priv(dev); - struct vmw_bo *vbo; - int cpp = DIV_ROUND_UP(args->bpp, 8); - int ret; - - switch (cpp) { - case 1: /* DRM_FORMAT_C8 */ - case 2: /* DRM_FORMAT_RGB565 */ - case 4: /* DRM_FORMAT_XRGB8888 */ - break; - default: - /* - * Dumb buffers don't allow anything else. 
- * This is tested via IGT's dumb_buffers - */ - return -EINVAL; - } - - args->pitch = args->width * cpp; - args->size = ALIGN(args->pitch * args->height, PAGE_SIZE); - - ret = vmw_gem_object_create_with_handle(dev_priv, file_priv, - args->size, &args->handle, - &vbo); - /* drop reference from allocate - handle holds it now */ - drm_gem_object_put(&vbo->tbo.base); - return ret; -} - /** * vmw_bo_swap_notify - swapout notify callback. * @@ -853,3 +838,43 @@ void vmw_bo_placement_set_default_accelerated(struct vmw_bo *bo) vmw_bo_placement_set(bo, domain, domain); } + +void vmw_bo_add_detached_resource(struct vmw_bo *vbo, struct vmw_resource *res) +{ + xa_store(&vbo->detached_resources, (unsigned long)res, res, GFP_KERNEL); +} + +void vmw_bo_del_detached_resource(struct vmw_bo *vbo, struct vmw_resource *res) +{ + xa_erase(&vbo->detached_resources, (unsigned long)res); +} + +struct vmw_surface *vmw_bo_surface(struct vmw_bo *vbo) +{ + unsigned long index; + struct vmw_resource *res = NULL; + struct vmw_surface *surf = NULL; + struct rb_node *rb_itr = vbo->res_tree.rb_node; + + if (vbo->is_dumb && vbo->dumb_surface) { + res = &vbo->dumb_surface->res; + goto out; + } + + xa_for_each(&vbo->detached_resources, index, res) { + if (res->func->res_type == vmw_res_surface) + goto out; + } + + for (rb_itr = rb_first(&vbo->res_tree); rb_itr; + rb_itr = rb_next(rb_itr)) { + res = rb_entry(rb_itr, struct vmw_resource, mob_node); + if (res->func->res_type == vmw_res_surface) + goto out; + } + +out: + if (res) + surf = vmw_res_to_srf(res); + return surf; +} diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h index f349642e6190..62b4342d5f7c 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h @@ -1,7 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 OR MIT */ /************************************************************************** * - * Copyright 2023 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2023-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -35,11 +36,13 @@ #include #include +#include struct vmw_bo_dirty; struct vmw_fence_obj; struct vmw_private; struct vmw_resource; +struct vmw_surface; enum vmw_bo_domain { VMW_BO_DOMAIN_SYS = BIT(0), @@ -85,11 +88,15 @@ struct vmw_bo { struct rb_root res_tree; u32 res_prios[TTM_MAX_BO_PRIORITY]; + struct xarray detached_resources; atomic_t cpu_writers; /* Not ref-counted. 
Protected by binding_mutex */ struct vmw_resource *dx_query_ctx; struct vmw_bo_dirty *dirty; + + bool is_dumb; + struct vmw_surface *dumb_surface; }; void vmw_bo_placement_set(struct vmw_bo *bo, u32 domain, u32 busy_domain); @@ -124,15 +131,21 @@ void vmw_bo_fence_single(struct ttm_buffer_object *bo, struct vmw_fence_obj *fence); void *vmw_bo_map_and_cache(struct vmw_bo *vbo); +void *vmw_bo_map_and_cache_size(struct vmw_bo *vbo, size_t size); void vmw_bo_unmap(struct vmw_bo *vbo); void vmw_bo_move_notify(struct ttm_buffer_object *bo, struct ttm_resource *mem); void vmw_bo_swap_notify(struct ttm_buffer_object *bo); +void vmw_bo_add_detached_resource(struct vmw_bo *vbo, struct vmw_resource *res); +void vmw_bo_del_detached_resource(struct vmw_bo *vbo, struct vmw_resource *res); +struct vmw_surface *vmw_bo_surface(struct vmw_bo *vbo); + int vmw_user_bo_lookup(struct drm_file *filp, u32 handle, struct vmw_bo **out); + /** * vmw_bo_adjust_prio - Adjust the buffer object eviction priority * according to attached resources diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index a1ce41e1c468..32f50e595809 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1,7 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 OR MIT */ /************************************************************************** * - * Copyright 2009-2023 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2009-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -762,6 +763,26 @@ extern int vmw_gmr_bind(struct vmw_private *dev_priv, int gmr_id); extern void vmw_gmr_unbind(struct vmw_private *dev_priv, int gmr_id); +/** + * User handles + */ +struct vmw_user_object { + struct vmw_surface *surface; + struct vmw_bo *buffer; +}; + +int vmw_user_object_lookup(struct vmw_private *dev_priv, struct drm_file *filp, + u32 handle, struct vmw_user_object *uo); +struct vmw_user_object *vmw_user_object_ref(struct vmw_user_object *uo); +void vmw_user_object_unref(struct vmw_user_object *uo); +bool vmw_user_object_is_null(struct vmw_user_object *uo); +struct vmw_surface *vmw_user_object_surface(struct vmw_user_object *uo); +struct vmw_bo *vmw_user_object_buffer(struct vmw_user_object *uo); +void *vmw_user_object_map(struct vmw_user_object *uo); +void *vmw_user_object_map_size(struct vmw_user_object *uo, size_t size); +void vmw_user_object_unmap(struct vmw_user_object *uo); +bool vmw_user_object_is_mapped(struct vmw_user_object *uo); + /** * Resource utilities - vmwgfx_resource.c */ @@ -776,11 +797,6 @@ extern int vmw_resource_validate(struct vmw_resource *res, bool intr, extern int vmw_resource_reserve(struct vmw_resource *res, bool interruptible, bool no_backup); extern bool vmw_resource_needs_backup(const struct vmw_resource *res); -extern int vmw_user_lookup_handle(struct vmw_private *dev_priv, - struct drm_file *filp, - uint32_t handle, - struct vmw_surface **out_surf, - struct vmw_bo **out_buf); extern int vmw_user_resource_lookup_handle( struct vmw_private *dev_priv, struct ttm_object_file *tfile, @@ -1057,9 +1073,6 @@ int vmw_kms_suspend(struct drm_device *dev); int vmw_kms_resume(struct drm_device *dev); void vmw_kms_lost_device(struct drm_device *dev); -int vmw_dumb_create(struct drm_file *file_priv, - struct drm_device *dev, - struct drm_mode_create_dumb *args); extern 
int vmw_resource_pin(struct vmw_resource *res, bool interruptible); extern void vmw_resource_unpin(struct vmw_resource *res); extern enum vmw_res_type vmw_res_type(const struct vmw_resource *res); @@ -1176,6 +1189,15 @@ extern int vmw_gb_surface_reference_ext_ioctl(struct drm_device *dev, int vmw_gb_surface_define(struct vmw_private *dev_priv, const struct vmw_surface_metadata *req, struct vmw_surface **srf_out); +struct vmw_surface *vmw_lookup_surface_for_buffer(struct vmw_private *vmw, + struct vmw_bo *bo, + u32 handle); +u32 vmw_lookup_surface_handle_for_buffer(struct vmw_private *vmw, + struct vmw_bo *bo, + u32 handle); +int vmw_dumb_create(struct drm_file *file_priv, + struct drm_device *dev, + struct drm_mode_create_dumb *args); /* * Shader management - vmwgfx_shader.c diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 00c4ff684130..288ed0bb75cb 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR MIT /************************************************************************** * - * Copyright 2009-2023 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2009-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -193,13 +194,16 @@ static u32 vmw_du_cursor_mob_size(u32 w, u32 h) */ static u32 *vmw_du_cursor_plane_acquire_image(struct vmw_plane_state *vps) { - if (vps->surf) { - if (vps->surf_mapped) - return vmw_bo_map_and_cache(vps->surf->res.guest_memory_bo); - return vps->surf->snooper.image; - } else if (vps->bo) - return vmw_bo_map_and_cache(vps->bo); - return NULL; + struct vmw_surface *surf; + + if (vmw_user_object_is_null(&vps->uo)) + return NULL; + + surf = vmw_user_object_surface(&vps->uo); + if (surf && !vmw_user_object_is_mapped(&vps->uo)) + return surf->snooper.image; + + return vmw_user_object_map(&vps->uo); } static bool vmw_du_cursor_plane_has_changed(struct vmw_plane_state *old_vps, @@ -536,21 +540,15 @@ void vmw_du_primary_plane_destroy(struct drm_plane *plane) * vmw_du_plane_unpin_surf - unpins resource associated with a framebuffer surface * * @vps: plane state associated with the display surface - * @unreference: true if we also want to unreference the display. 
*/ -void vmw_du_plane_unpin_surf(struct vmw_plane_state *vps, - bool unreference) +void vmw_du_plane_unpin_surf(struct vmw_plane_state *vps) { - if (vps->surf) { - if (vps->pinned) { - vmw_resource_unpin(&vps->surf->res); - vps->pinned--; - } + struct vmw_surface *surf = vmw_user_object_surface(&vps->uo); - if (unreference) { - if (vps->pinned) - DRM_ERROR("Surface still pinned\n"); - vmw_surface_unreference(&vps->surf); + if (surf) { + if (vps->pinned) { + vmw_resource_unpin(&surf->res); + vps->pinned--; } } } @@ -572,7 +570,7 @@ vmw_du_plane_cleanup_fb(struct drm_plane *plane, { struct vmw_plane_state *vps = vmw_plane_state_to_vps(old_state); - vmw_du_plane_unpin_surf(vps, false); + vmw_du_plane_unpin_surf(vps); } @@ -661,25 +659,14 @@ vmw_du_cursor_plane_cleanup_fb(struct drm_plane *plane, struct vmw_cursor_plane *vcp = vmw_plane_to_vcp(plane); struct vmw_plane_state *vps = vmw_plane_state_to_vps(old_state); - if (vps->surf_mapped) { - vmw_bo_unmap(vps->surf->res.guest_memory_bo); - vps->surf_mapped = false; - } + if (!vmw_user_object_is_null(&vps->uo)) + vmw_user_object_unmap(&vps->uo); vmw_du_cursor_plane_unmap_cm(vps); vmw_du_put_cursor_mob(vcp, vps); - vmw_du_plane_unpin_surf(vps, false); - - if (vps->surf) { - vmw_surface_unreference(&vps->surf); - vps->surf = NULL; - } - - if (vps->bo) { - vmw_bo_unreference(&vps->bo); - vps->bo = NULL; - } + vmw_du_plane_unpin_surf(vps); + vmw_user_object_unref(&vps->uo); } @@ -698,64 +685,48 @@ vmw_du_cursor_plane_prepare_fb(struct drm_plane *plane, struct drm_framebuffer *fb = new_state->fb; struct vmw_cursor_plane *vcp = vmw_plane_to_vcp(plane); struct vmw_plane_state *vps = vmw_plane_state_to_vps(new_state); + struct vmw_bo *bo = NULL; int ret = 0; - if (vps->surf) { - if (vps->surf_mapped) { - vmw_bo_unmap(vps->surf->res.guest_memory_bo); - vps->surf_mapped = false; - } - vmw_surface_unreference(&vps->surf); - vps->surf = NULL; - } - - if (vps->bo) { - vmw_bo_unreference(&vps->bo); - vps->bo = NULL; + if (!vmw_user_object_is_null(&vps->uo)) { + vmw_user_object_unmap(&vps->uo); + vmw_user_object_unref(&vps->uo); } if (fb) { if (vmw_framebuffer_to_vfb(fb)->bo) { - vps->bo = vmw_framebuffer_to_vfbd(fb)->buffer; - vmw_bo_reference(vps->bo); + vps->uo.buffer = vmw_framebuffer_to_vfbd(fb)->buffer; + vps->uo.surface = NULL; } else { - vps->surf = vmw_framebuffer_to_vfbs(fb)->surface; - vmw_surface_reference(vps->surf); + memcpy(&vps->uo, &vmw_framebuffer_to_vfbs(fb)->uo, sizeof(vps->uo)); } + vmw_user_object_ref(&vps->uo); } - if (!vps->surf && vps->bo) { - const u32 size = new_state->crtc_w * new_state->crtc_h * sizeof(u32); + bo = vmw_user_object_buffer(&vps->uo); + if (bo) { + struct ttm_operation_ctx ctx = {false, false}; - /* - * Not using vmw_bo_map_and_cache() helper here as we need to - * reserve the ttm_buffer_object first which - * vmw_bo_map_and_cache() omits. 
- */ - ret = ttm_bo_reserve(&vps->bo->tbo, true, false, NULL); - - if (unlikely(ret != 0)) + ret = ttm_bo_reserve(&bo->tbo, true, false, NULL); + if (ret != 0) return -ENOMEM; - ret = ttm_bo_kmap(&vps->bo->tbo, 0, PFN_UP(size), &vps->bo->map); - - ttm_bo_unreserve(&vps->bo->tbo); - - if (unlikely(ret != 0)) + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); + if (ret != 0) return -ENOMEM; - } else if (vps->surf && !vps->bo && vps->surf->res.guest_memory_bo) { - WARN_ON(vps->surf->snooper.image); - ret = ttm_bo_reserve(&vps->surf->res.guest_memory_bo->tbo, true, false, - NULL); - if (unlikely(ret != 0)) - return -ENOMEM; - vmw_bo_map_and_cache(vps->surf->res.guest_memory_bo); - ttm_bo_unreserve(&vps->surf->res.guest_memory_bo->tbo); - vps->surf_mapped = true; + vmw_bo_pin_reserved(bo, true); + if (vmw_framebuffer_to_vfb(fb)->bo) { + const u32 size = new_state->crtc_w * new_state->crtc_h * sizeof(u32); + + (void)vmw_bo_map_and_cache_size(bo, size); + } else { + vmw_bo_map_and_cache(bo); + } + ttm_bo_unreserve(&bo->tbo); } - if (vps->surf || vps->bo) { + if (!vmw_user_object_is_null(&vps->uo)) { vmw_du_get_cursor_mob(vcp, vps); vmw_du_cursor_plane_map_cm(vps); } @@ -777,14 +748,17 @@ vmw_du_cursor_plane_atomic_update(struct drm_plane *plane, struct vmw_display_unit *du = vmw_crtc_to_du(crtc); struct vmw_plane_state *vps = vmw_plane_state_to_vps(new_state); struct vmw_plane_state *old_vps = vmw_plane_state_to_vps(old_state); + struct vmw_bo *old_bo = NULL; + struct vmw_bo *new_bo = NULL; s32 hotspot_x, hotspot_y; + int ret; hotspot_x = du->hotspot_x + new_state->hotspot_x; hotspot_y = du->hotspot_y + new_state->hotspot_y; - du->cursor_surface = vps->surf; + du->cursor_surface = vmw_user_object_surface(&vps->uo); - if (!vps->surf && !vps->bo) { + if (vmw_user_object_is_null(&vps->uo)) { vmw_cursor_update_position(dev_priv, false, 0, 0); return; } @@ -792,10 +766,26 @@ vmw_du_cursor_plane_atomic_update(struct drm_plane *plane, vps->cursor.hotspot_x = hotspot_x; vps->cursor.hotspot_y = hotspot_y; - if (vps->surf) { + if (du->cursor_surface) du->cursor_age = du->cursor_surface->snooper.age; + + if (!vmw_user_object_is_null(&old_vps->uo)) { + old_bo = vmw_user_object_buffer(&old_vps->uo); + ret = ttm_bo_reserve(&old_bo->tbo, false, false, NULL); + if (ret != 0) + return; } + if (!vmw_user_object_is_null(&vps->uo)) { + new_bo = vmw_user_object_buffer(&vps->uo); + if (old_bo != new_bo) { + ret = ttm_bo_reserve(&new_bo->tbo, false, false, NULL); + if (ret != 0) + return; + } else { + new_bo = NULL; + } + } if (!vmw_du_cursor_plane_has_changed(old_vps, vps)) { /* * If it hasn't changed, avoid making the device do extra @@ -813,6 +803,11 @@ vmw_du_cursor_plane_atomic_update(struct drm_plane *plane, hotspot_x, hotspot_y); } + if (old_bo) + ttm_bo_unreserve(&old_bo->tbo); + if (new_bo) + ttm_bo_unreserve(&new_bo->tbo); + du->cursor_x = new_state->crtc_x + du->set_gui_x; du->cursor_y = new_state->crtc_y + du->set_gui_y; @@ -913,7 +908,7 @@ int vmw_du_cursor_plane_atomic_check(struct drm_plane *plane, } if (!vmw_framebuffer_to_vfb(fb)->bo) { - surface = vmw_framebuffer_to_vfbs(fb)->surface; + surface = vmw_user_object_surface(&vmw_framebuffer_to_vfbs(fb)->uo); WARN_ON(!surface); @@ -1074,12 +1069,7 @@ vmw_du_plane_duplicate_state(struct drm_plane *plane) memset(&vps->cursor, 0, sizeof(vps->cursor)); /* Each ref counted resource needs to be acquired again */ - if (vps->surf) - (void) vmw_surface_reference(vps->surf); - - if (vps->bo) - (void) vmw_bo_reference(vps->bo); - + 
vmw_user_object_ref(&vps->uo); state = &vps->base; __drm_atomic_helper_plane_duplicate_state(plane, state); @@ -1128,11 +1118,7 @@ vmw_du_plane_destroy_state(struct drm_plane *plane, struct vmw_plane_state *vps = vmw_plane_state_to_vps(state); /* Should have been freed by cleanup_fb */ - if (vps->surf) - vmw_surface_unreference(&vps->surf); - - if (vps->bo) - vmw_bo_unreference(&vps->bo); + vmw_user_object_unref(&vps->uo); drm_atomic_helper_plane_destroy_state(plane, state); } @@ -1227,7 +1213,7 @@ static void vmw_framebuffer_surface_destroy(struct drm_framebuffer *framebuffer) vmw_framebuffer_to_vfbs(framebuffer); drm_framebuffer_cleanup(framebuffer); - vmw_surface_unreference(&vfbs->surface); + vmw_user_object_unref(&vfbs->uo); kfree(vfbs); } @@ -1272,29 +1258,41 @@ int vmw_kms_readback(struct vmw_private *dev_priv, return -ENOSYS; } +static int vmw_framebuffer_surface_create_handle(struct drm_framebuffer *fb, + struct drm_file *file_priv, + unsigned int *handle) +{ + struct vmw_framebuffer_surface *vfbs = vmw_framebuffer_to_vfbs(fb); + struct vmw_bo *bo = vmw_user_object_buffer(&vfbs->uo); + + return drm_gem_handle_create(file_priv, &bo->tbo.base, handle); +} static const struct drm_framebuffer_funcs vmw_framebuffer_surface_funcs = { + .create_handle = vmw_framebuffer_surface_create_handle, .destroy = vmw_framebuffer_surface_destroy, .dirty = drm_atomic_helper_dirtyfb, }; static int vmw_kms_new_framebuffer_surface(struct vmw_private *dev_priv, - struct vmw_surface *surface, + struct vmw_user_object *uo, struct vmw_framebuffer **out, const struct drm_mode_fb_cmd2 - *mode_cmd, - bool is_bo_proxy) + *mode_cmd) { struct drm_device *dev = &dev_priv->drm; struct vmw_framebuffer_surface *vfbs; enum SVGA3dSurfaceFormat format; + struct vmw_surface *surface; int ret; /* 3D is only supported on HWv8 and newer hosts */ if (dev_priv->active_display_unit == vmw_du_legacy) return -ENOSYS; + surface = vmw_user_object_surface(uo); + /* * Sanity checks. */ @@ -1357,8 +1355,8 @@ static int vmw_kms_new_framebuffer_surface(struct vmw_private *dev_priv, } drm_helper_mode_fill_fb_struct(dev, &vfbs->base.base, mode_cmd); - vfbs->surface = vmw_surface_reference(surface); - vfbs->is_bo_proxy = is_bo_proxy; + memcpy(&vfbs->uo, uo, sizeof(vfbs->uo)); + vmw_user_object_ref(&vfbs->uo); *out = &vfbs->base; @@ -1370,7 +1368,7 @@ static int vmw_kms_new_framebuffer_surface(struct vmw_private *dev_priv, return 0; out_err2: - vmw_surface_unreference(&surface); + vmw_user_object_unref(&vfbs->uo); kfree(vfbs); out_err1: return ret; @@ -1386,7 +1384,6 @@ static int vmw_framebuffer_bo_create_handle(struct drm_framebuffer *fb, { struct vmw_framebuffer_bo *vfbd = vmw_framebuffer_to_vfbd(fb); - return drm_gem_handle_create(file_priv, &vfbd->buffer->tbo.base, handle); } @@ -1407,86 +1404,6 @@ static const struct drm_framebuffer_funcs vmw_framebuffer_bo_funcs = { .dirty = drm_atomic_helper_dirtyfb, }; -/** - * vmw_create_bo_proxy - create a proxy surface for the buffer object - * - * @dev: DRM device - * @mode_cmd: parameters for the new surface - * @bo_mob: MOB backing the buffer object - * @srf_out: newly created surface - * - * When the content FB is a buffer object, we create a surface as a proxy to the - * same buffer. This way we can do a surface copy rather than a surface DMA. 
- * This is a more efficient approach - * - * RETURNS: - * 0 on success, error code otherwise - */ -static int vmw_create_bo_proxy(struct drm_device *dev, - const struct drm_mode_fb_cmd2 *mode_cmd, - struct vmw_bo *bo_mob, - struct vmw_surface **srf_out) -{ - struct vmw_surface_metadata metadata = {0}; - uint32_t format; - struct vmw_resource *res; - unsigned int bytes_pp; - int ret; - - switch (mode_cmd->pixel_format) { - case DRM_FORMAT_ARGB8888: - case DRM_FORMAT_XRGB8888: - format = SVGA3D_X8R8G8B8; - bytes_pp = 4; - break; - - case DRM_FORMAT_RGB565: - case DRM_FORMAT_XRGB1555: - format = SVGA3D_R5G6B5; - bytes_pp = 2; - break; - - case 8: - format = SVGA3D_P8; - bytes_pp = 1; - break; - - default: - DRM_ERROR("Invalid framebuffer format %p4cc\n", - &mode_cmd->pixel_format); - return -EINVAL; - } - - metadata.format = format; - metadata.mip_levels[0] = 1; - metadata.num_sizes = 1; - metadata.base_size.width = mode_cmd->pitches[0] / bytes_pp; - metadata.base_size.height = mode_cmd->height; - metadata.base_size.depth = 1; - metadata.scanout = true; - - ret = vmw_gb_surface_define(vmw_priv(dev), &metadata, srf_out); - if (ret) { - DRM_ERROR("Failed to allocate proxy content buffer\n"); - return ret; - } - - res = &(*srf_out)->res; - - /* Reserve and switch the backing mob. */ - mutex_lock(&res->dev_priv->cmdbuf_mutex); - (void) vmw_resource_reserve(res, false, true); - vmw_user_bo_unref(&res->guest_memory_bo); - res->guest_memory_bo = vmw_user_bo_ref(bo_mob); - res->guest_memory_offset = 0; - vmw_resource_unreserve(res, false, false, false, NULL, 0); - mutex_unlock(&res->dev_priv->cmdbuf_mutex); - - return 0; -} - - - static int vmw_kms_new_framebuffer_bo(struct vmw_private *dev_priv, struct vmw_bo *bo, struct vmw_framebuffer **out, @@ -1565,55 +1482,24 @@ vmw_kms_srf_ok(struct vmw_private *dev_priv, uint32_t width, uint32_t height) * vmw_kms_new_framebuffer - Create a new framebuffer. * * @dev_priv: Pointer to device private struct. - * @bo: Pointer to buffer object to wrap the kms framebuffer around. - * Either @bo or @surface must be NULL. - * @surface: Pointer to a surface to wrap the kms framebuffer around. - * Either @bo or @surface must be NULL. - * @only_2d: No presents will occur to this buffer object based framebuffer. - * This helps the code to do some important optimizations. + * @uo: Pointer to user object to wrap the kms framebuffer around. + * Either the buffer or surface inside the user object must be NULL. * @mode_cmd: Frame-buffer metadata. */ struct vmw_framebuffer * vmw_kms_new_framebuffer(struct vmw_private *dev_priv, - struct vmw_bo *bo, - struct vmw_surface *surface, - bool only_2d, + struct vmw_user_object *uo, const struct drm_mode_fb_cmd2 *mode_cmd) { struct vmw_framebuffer *vfb = NULL; - bool is_bo_proxy = false; int ret; - /* - * We cannot use the SurfaceDMA command in an non-accelerated VM, - * therefore, wrap the buffer object in a surface so we can use the - * SurfaceCopy command. 
- */ - if (vmw_kms_srf_ok(dev_priv, mode_cmd->width, mode_cmd->height) && - bo && only_2d && - mode_cmd->width > 64 && /* Don't create a proxy for cursor */ - dev_priv->active_display_unit == vmw_du_screen_target) { - ret = vmw_create_bo_proxy(&dev_priv->drm, mode_cmd, - bo, &surface); - if (ret) - return ERR_PTR(ret); - - is_bo_proxy = true; - } - /* Create the new framebuffer depending one what we have */ - if (surface) { - ret = vmw_kms_new_framebuffer_surface(dev_priv, surface, &vfb, - mode_cmd, - is_bo_proxy); - /* - * vmw_create_bo_proxy() adds a reference that is no longer - * needed - */ - if (is_bo_proxy) - vmw_surface_unreference(&surface); - } else if (bo) { - ret = vmw_kms_new_framebuffer_bo(dev_priv, bo, &vfb, + if (vmw_user_object_surface(uo)) { + ret = vmw_kms_new_framebuffer_surface(dev_priv, uo, &vfb, + mode_cmd); + } else if (uo->buffer) { + ret = vmw_kms_new_framebuffer_bo(dev_priv, uo->buffer, &vfb, mode_cmd); } else { BUG(); @@ -1635,14 +1521,12 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, { struct vmw_private *dev_priv = vmw_priv(dev); struct vmw_framebuffer *vfb = NULL; - struct vmw_surface *surface = NULL; - struct vmw_bo *bo = NULL; + struct vmw_user_object uo = {0}; int ret; /* returns either a bo or surface */ - ret = vmw_user_lookup_handle(dev_priv, file_priv, - mode_cmd->handles[0], - &surface, &bo); + ret = vmw_user_object_lookup(dev_priv, file_priv, mode_cmd->handles[0], + &uo); if (ret) { DRM_ERROR("Invalid buffer object handle %u (0x%x).\n", mode_cmd->handles[0], mode_cmd->handles[0]); @@ -1650,7 +1534,7 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, } - if (!bo && + if (vmw_user_object_surface(&uo) && !vmw_kms_srf_ok(dev_priv, mode_cmd->width, mode_cmd->height)) { DRM_ERROR("Surface size cannot exceed %dx%d\n", dev_priv->texture_max_width, @@ -1659,20 +1543,15 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, } - vfb = vmw_kms_new_framebuffer(dev_priv, bo, surface, - !(dev_priv->capabilities & SVGA_CAP_3D), - mode_cmd); + vfb = vmw_kms_new_framebuffer(dev_priv, &uo, mode_cmd); if (IS_ERR(vfb)) { ret = PTR_ERR(vfb); goto err_out; } err_out: - /* vmw_user_lookup_handle takes one ref so does new_fb */ - if (bo) - vmw_user_bo_unref(&bo); - if (surface) - vmw_surface_unreference(&surface); + /* vmw_user_object_lookup takes one ref so does new_fb */ + vmw_user_object_unref(&uo); if (ret) { DRM_ERROR("failed to create vmw_framebuffer: %i\n", ret); @@ -2584,72 +2463,6 @@ void vmw_kms_helper_validation_finish(struct vmw_private *dev_priv, vmw_fence_obj_unreference(&fence); } -/** - * vmw_kms_update_proxy - Helper function to update a proxy surface from - * its backing MOB. - * - * @res: Pointer to the surface resource - * @clips: Clip rects in framebuffer (surface) space. - * @num_clips: Number of clips in @clips. - * @increment: Integer with which to increment the clip counter when looping. - * Used to skip a predetermined number of clip rects. - * - * This function makes sure the proxy surface is updated from its backing MOB - * using the region given by @clips. The surface resource @res and its backing - * MOB needs to be reserved and validated on call. 
- */ -int vmw_kms_update_proxy(struct vmw_resource *res, - const struct drm_clip_rect *clips, - unsigned num_clips, - int increment) -{ - struct vmw_private *dev_priv = res->dev_priv; - struct drm_vmw_size *size = &vmw_res_to_srf(res)->metadata.base_size; - struct { - SVGA3dCmdHeader header; - SVGA3dCmdUpdateGBImage body; - } *cmd; - SVGA3dBox *box; - size_t copy_size = 0; - int i; - - if (!clips) - return 0; - - cmd = VMW_CMD_RESERVE(dev_priv, sizeof(*cmd) * num_clips); - if (!cmd) - return -ENOMEM; - - for (i = 0; i < num_clips; ++i, clips += increment, ++cmd) { - box = &cmd->body.box; - - cmd->header.id = SVGA_3D_CMD_UPDATE_GB_IMAGE; - cmd->header.size = sizeof(cmd->body); - cmd->body.image.sid = res->id; - cmd->body.image.face = 0; - cmd->body.image.mipmap = 0; - - if (clips->x1 > size->width || clips->x2 > size->width || - clips->y1 > size->height || clips->y2 > size->height) { - DRM_ERROR("Invalid clips outsize of framebuffer.\n"); - return -EINVAL; - } - - box->x = clips->x1; - box->y = clips->y1; - box->z = 0; - box->w = clips->x2 - clips->x1; - box->h = clips->y2 - clips->y1; - box->d = 1; - - copy_size += sizeof(*cmd); - } - - vmw_cmd_commit(dev_priv, copy_size); - - return 0; -} - /** * vmw_kms_create_implicit_placement_property - Set up the implicit placement * property. @@ -2784,8 +2597,9 @@ int vmw_du_helper_plane_update(struct vmw_du_update_plane *update) } else { struct vmw_framebuffer_surface *vfbs = container_of(update->vfb, typeof(*vfbs), base); + struct vmw_surface *surf = vmw_user_object_surface(&vfbs->uo); - ret = vmw_validation_add_resource(&val_ctx, &vfbs->surface->res, + ret = vmw_validation_add_resource(&val_ctx, &surf->res, 0, VMW_RES_DIRTY_NONE, NULL, NULL); } @@ -2941,3 +2755,93 @@ int vmw_connector_get_modes(struct drm_connector *connector) return num_modes; } + +struct vmw_user_object *vmw_user_object_ref(struct vmw_user_object *uo) +{ + if (uo->buffer) + vmw_user_bo_ref(uo->buffer); + else if (uo->surface) + vmw_surface_reference(uo->surface); + return uo; +} + +void vmw_user_object_unref(struct vmw_user_object *uo) +{ + if (uo->buffer) + vmw_user_bo_unref(&uo->buffer); + else if (uo->surface) + vmw_surface_unreference(&uo->surface); +} + +struct vmw_bo * +vmw_user_object_buffer(struct vmw_user_object *uo) +{ + if (uo->buffer) + return uo->buffer; + else if (uo->surface) + return uo->surface->res.guest_memory_bo; + return NULL; +} + +struct vmw_surface * +vmw_user_object_surface(struct vmw_user_object *uo) +{ + if (uo->buffer) + return uo->buffer->dumb_surface; + return uo->surface; +} + +void *vmw_user_object_map(struct vmw_user_object *uo) +{ + struct vmw_bo *bo = vmw_user_object_buffer(uo); + + WARN_ON(!bo); + return vmw_bo_map_and_cache(bo); +} + +void *vmw_user_object_map_size(struct vmw_user_object *uo, size_t size) +{ + struct vmw_bo *bo = vmw_user_object_buffer(uo); + + WARN_ON(!bo); + return vmw_bo_map_and_cache_size(bo, size); +} + +void vmw_user_object_unmap(struct vmw_user_object *uo) +{ + struct vmw_bo *bo = vmw_user_object_buffer(uo); + int ret; + + WARN_ON(!bo); + + /* Fence the mob creation so we are guarateed to have the mob */ + ret = ttm_bo_reserve(&bo->tbo, false, false, NULL); + if (ret != 0) + return; + + vmw_bo_unmap(bo); + vmw_bo_pin_reserved(bo, false); + + ttm_bo_unreserve(&bo->tbo); +} + +bool vmw_user_object_is_mapped(struct vmw_user_object *uo) +{ + struct vmw_bo *bo; + + if (!uo || vmw_user_object_is_null(uo)) + return false; + + bo = vmw_user_object_buffer(uo); + + if (WARN_ON(!bo)) + return false; + + WARN_ON(bo->map.bo && 
!bo->map.virtual); + return bo->map.virtual; +} + +bool vmw_user_object_is_null(struct vmw_user_object *uo) +{ + return !uo->buffer && !uo->surface; +} diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h index bf24f2f0dcfc..6141fadf81ef 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h @@ -1,7 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 OR MIT */ /************************************************************************** * - * Copyright 2009-2023 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2009-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -221,11 +222,9 @@ struct vmw_framebuffer { struct vmw_framebuffer_surface { struct vmw_framebuffer base; - struct vmw_surface *surface; - bool is_bo_proxy; /* true if this is proxy surface for DMA buf */ + struct vmw_user_object uo; }; - struct vmw_framebuffer_bo { struct vmw_framebuffer base; struct vmw_bo *buffer; @@ -277,8 +276,7 @@ struct vmw_cursor_plane_state { */ struct vmw_plane_state { struct drm_plane_state base; - struct vmw_surface *surf; - struct vmw_bo *bo; + struct vmw_user_object uo; int content_fb_type; unsigned long bo_size; @@ -457,9 +455,7 @@ int vmw_kms_readback(struct vmw_private *dev_priv, uint32_t num_clips); struct vmw_framebuffer * vmw_kms_new_framebuffer(struct vmw_private *dev_priv, - struct vmw_bo *bo, - struct vmw_surface *surface, - bool only_2d, + struct vmw_user_object *uo, const struct drm_mode_fb_cmd2 *mode_cmd); void vmw_guess_mode_timing(struct drm_display_mode *mode); void vmw_kms_update_implicit_fb(struct vmw_private *dev_priv); @@ -486,8 +482,7 @@ void vmw_du_plane_reset(struct drm_plane *plane); struct drm_plane_state *vmw_du_plane_duplicate_state(struct drm_plane *plane); void vmw_du_plane_destroy_state(struct drm_plane *plane, struct drm_plane_state *state); -void vmw_du_plane_unpin_surf(struct vmw_plane_state *vps, - bool unreference); +void vmw_du_plane_unpin_surf(struct vmw_plane_state *vps); int vmw_du_crtc_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *state); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c index 5befc2719a49..39949e0a493f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR MIT /************************************************************************** * - * Copyright 2009-2023 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2009-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -147,8 +148,9 @@ static int vmw_ldu_fb_pin(struct vmw_framebuffer *vfb) struct vmw_bo *buf; int ret; - buf = vfb->bo ? vmw_framebuffer_to_vfbd(&vfb->base)->buffer : - vmw_framebuffer_to_vfbs(&vfb->base)->surface->res.guest_memory_bo; + buf = vfb->bo ? + vmw_framebuffer_to_vfbd(&vfb->base)->buffer : + vmw_user_object_buffer(&vmw_framebuffer_to_vfbs(&vfb->base)->uo); if (!buf) return 0; @@ -169,8 +171,10 @@ static int vmw_ldu_fb_unpin(struct vmw_framebuffer *vfb) struct vmw_private *dev_priv = vmw_priv(vfb->base.dev); struct vmw_bo *buf; - buf = vfb->bo ? 
vmw_framebuffer_to_vfbd(&vfb->base)->buffer : - vmw_framebuffer_to_vfbs(&vfb->base)->surface->res.guest_memory_bo; + buf = vfb->bo ? + vmw_framebuffer_to_vfbd(&vfb->base)->buffer : + vmw_user_object_buffer(&vmw_framebuffer_to_vfbs(&vfb->base)->uo); + if (WARN_ON(!buf)) return 0; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c index c99cad444991..598b90ac7590 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR MIT /************************************************************************** * - * Copyright 2013 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2013-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -31,6 +32,7 @@ */ #include "vmwgfx_drv.h" +#include "vmwgfx_bo.h" #include "ttm_object.h" #include @@ -88,13 +90,35 @@ int vmw_prime_handle_to_fd(struct drm_device *dev, uint32_t handle, uint32_t flags, int *prime_fd) { + struct vmw_private *vmw = vmw_priv(dev); struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; + struct vmw_bo *vbo; int ret; + int surf_handle; - if (handle > VMWGFX_NUM_MOB) + if (handle > VMWGFX_NUM_MOB) { ret = ttm_prime_handle_to_fd(tfile, handle, flags, prime_fd); - else - ret = drm_gem_prime_handle_to_fd(dev, file_priv, handle, flags, prime_fd); + } else { + ret = vmw_user_bo_lookup(file_priv, handle, &vbo); + if (ret) + return ret; + if (vbo && vbo->is_dumb) { + ret = drm_gem_prime_handle_to_fd(dev, file_priv, handle, + flags, prime_fd); + } else { + surf_handle = vmw_lookup_surface_handle_for_buffer(vmw, + vbo, + handle); + if (surf_handle > 0) + ret = ttm_prime_handle_to_fd(tfile, surf_handle, + flags, prime_fd); + else + ret = drm_gem_prime_handle_to_fd(dev, file_priv, + handle, flags, + prime_fd); + } + vmw_user_bo_unref(&vbo); + } return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c index 848dba09981b..a73af8a355fb 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR MIT /************************************************************************** * - * Copyright 2009-2023 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2009-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -58,6 +59,7 @@ void vmw_resource_mob_attach(struct vmw_resource *res) rb_link_node(&res->mob_node, parent, new); rb_insert_color(&res->mob_node, &gbo->res_tree); + vmw_bo_del_detached_resource(gbo, res); vmw_bo_prio_add(gbo, res->used_prio); } @@ -287,28 +289,35 @@ int vmw_user_resource_lookup_handle(struct vmw_private *dev_priv, * * The pointer this pointed at by out_surf and out_buf needs to be null. 
*/ -int vmw_user_lookup_handle(struct vmw_private *dev_priv, +int vmw_user_object_lookup(struct vmw_private *dev_priv, struct drm_file *filp, - uint32_t handle, - struct vmw_surface **out_surf, - struct vmw_bo **out_buf) + u32 handle, + struct vmw_user_object *uo) { struct ttm_object_file *tfile = vmw_fpriv(filp)->tfile; struct vmw_resource *res; int ret; - BUG_ON(*out_surf || *out_buf); + WARN_ON(uo->surface || uo->buffer); ret = vmw_user_resource_lookup_handle(dev_priv, tfile, handle, user_surface_converter, &res); if (!ret) { - *out_surf = vmw_res_to_srf(res); + uo->surface = vmw_res_to_srf(res); return 0; } - *out_surf = NULL; - ret = vmw_user_bo_lookup(filp, handle, out_buf); + uo->surface = NULL; + ret = vmw_user_bo_lookup(filp, handle, &uo->buffer); + if (!ret && !uo->buffer->is_dumb) { + uo->surface = vmw_lookup_surface_for_buffer(dev_priv, + uo->buffer, + handle); + if (uo->surface) + vmw_user_bo_unref(&uo->buffer); + } + return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c index df0039a8ef29..0f4bfd98480a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR MIT /************************************************************************** * - * Copyright 2011-2023 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2011-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -240,7 +241,7 @@ static void vmw_sou_crtc_mode_set_nofb(struct drm_crtc *crtc) struct vmw_connector_state *vmw_conn_state; int x, y; - sou->buffer = vps->bo; + sou->buffer = vmw_user_object_buffer(&vps->uo); conn_state = sou->base.connector.state; vmw_conn_state = vmw_connector_state_to_vcs(conn_state); @@ -376,10 +377,11 @@ vmw_sou_primary_plane_cleanup_fb(struct drm_plane *plane, struct vmw_plane_state *vps = vmw_plane_state_to_vps(old_state); struct drm_crtc *crtc = plane->state->crtc ? plane->state->crtc : old_state->crtc; + struct vmw_bo *bo = vmw_user_object_buffer(&vps->uo); - if (vps->bo) - vmw_bo_unpin(vmw_priv(crtc->dev), vps->bo, false); - vmw_bo_unreference(&vps->bo); + if (bo) + vmw_bo_unpin(vmw_priv(crtc->dev), bo, false); + vmw_user_object_unref(&vps->uo); vps->bo_size = 0; vmw_du_plane_cleanup_fb(plane, old_state); @@ -411,9 +413,10 @@ vmw_sou_primary_plane_prepare_fb(struct drm_plane *plane, .bo_type = ttm_bo_type_device, .pin = true }; + struct vmw_bo *bo = NULL; if (!new_fb) { - vmw_bo_unreference(&vps->bo); + vmw_user_object_unref(&vps->uo); vps->bo_size = 0; return 0; @@ -422,17 +425,17 @@ vmw_sou_primary_plane_prepare_fb(struct drm_plane *plane, bo_params.size = new_state->crtc_w * new_state->crtc_h * 4; dev_priv = vmw_priv(crtc->dev); - if (vps->bo) { + bo = vmw_user_object_buffer(&vps->uo); + if (bo) { if (vps->bo_size == bo_params.size) { /* * Note that this might temporarily up the pin-count * to 2, until cleanup_fb() is called. */ - return vmw_bo_pin_in_vram(dev_priv, vps->bo, - true); + return vmw_bo_pin_in_vram(dev_priv, bo, true); } - vmw_bo_unreference(&vps->bo); + vmw_user_object_unref(&vps->uo); vps->bo_size = 0; } @@ -442,7 +445,7 @@ vmw_sou_primary_plane_prepare_fb(struct drm_plane *plane, * resume the overlays, this is preferred to failing to alloc. 
*/ vmw_overlay_pause_all(dev_priv); - ret = vmw_bo_create(dev_priv, &bo_params, &vps->bo); + ret = vmw_gem_object_create(dev_priv, &bo_params, &vps->uo.buffer); vmw_overlay_resume_all(dev_priv); if (ret) return ret; @@ -453,7 +456,7 @@ vmw_sou_primary_plane_prepare_fb(struct drm_plane *plane, * TTM already thinks the buffer is pinned, but make sure the * pin_count is upped. */ - return vmw_bo_pin_in_vram(dev_priv, vps->bo, true); + return vmw_bo_pin_in_vram(dev_priv, vps->uo.buffer, true); } static uint32_t vmw_sou_bo_fifo_size(struct vmw_du_update_plane *update, @@ -580,6 +583,7 @@ static uint32_t vmw_sou_surface_pre_clip(struct vmw_du_update_plane *update, { struct vmw_kms_sou_dirty_cmd *blit = cmd; struct vmw_framebuffer_surface *vfbs; + struct vmw_surface *surf = NULL; vfbs = container_of(update->vfb, typeof(*vfbs), base); @@ -587,7 +591,8 @@ static uint32_t vmw_sou_surface_pre_clip(struct vmw_du_update_plane *update, blit->header.size = sizeof(blit->body) + sizeof(SVGASignedRect) * num_hits; - blit->body.srcImage.sid = vfbs->surface->res.id; + surf = vmw_user_object_surface(&vfbs->uo); + blit->body.srcImage.sid = surf->res.id; blit->body.destScreenId = update->du->unit; /* Update the source and destination bounding box later in post_clip */ @@ -1104,7 +1109,7 @@ int vmw_kms_sou_do_surface_dirty(struct vmw_private *dev_priv, int ret; if (!srf) - srf = &vfbs->surface->res; + srf = &vmw_user_object_surface(&vfbs->uo)->res; ret = vmw_validation_add_resource(&val_ctx, srf, 0, VMW_RES_DIRTY_NONE, NULL, NULL); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index a04e0736318d..5106413c14b7 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR MIT /****************************************************************************** * - * COPYRIGHT (C) 2014-2023 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2014-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -29,6 +30,7 @@ #include "vmwgfx_kms.h" #include "vmwgfx_vkms.h" #include "vmw_surface_cache.h" +#include #include #include @@ -735,7 +737,7 @@ int vmw_kms_stdu_surface_dirty(struct vmw_private *dev_priv, int ret; if (!srf) - srf = &vfbs->surface->res; + srf = &vmw_user_object_surface(&vfbs->uo)->res; ret = vmw_validation_add_resource(&val_ctx, srf, 0, VMW_RES_DIRTY_NONE, NULL, NULL); @@ -746,12 +748,6 @@ int vmw_kms_stdu_surface_dirty(struct vmw_private *dev_priv, if (ret) goto out_unref; - if (vfbs->is_bo_proxy) { - ret = vmw_kms_update_proxy(srf, clips, num_clips, inc); - if (ret) - goto out_finish; - } - sdirty.base.fifo_commit = vmw_kms_stdu_surface_fifo_commit; sdirty.base.clip = vmw_kms_stdu_surface_clip; sdirty.base.fifo_reserve_size = sizeof(struct vmw_stdu_surface_copy) + @@ -765,7 +761,7 @@ int vmw_kms_stdu_surface_dirty(struct vmw_private *dev_priv, ret = vmw_kms_helper_dirty(dev_priv, framebuffer, clips, vclips, dest_x, dest_y, num_clips, inc, &sdirty.base); -out_finish: + vmw_kms_helper_validation_finish(dev_priv, NULL, &val_ctx, out_fence, NULL); @@ -918,9 +914,8 @@ vmw_stdu_primary_plane_cleanup_fb(struct drm_plane *plane, { struct vmw_plane_state *vps = vmw_plane_state_to_vps(old_state); - if (vps->surf) + if (vmw_user_object_surface(&vps->uo)) WARN_ON(!vps->pinned); - vmw_du_plane_cleanup_fb(plane, old_state); vps->content_fb_type = SAME_AS_DISPLAY; @@ -928,7 +923,6 @@ vmw_stdu_primary_plane_cleanup_fb(struct drm_plane *plane, } - /** * vmw_stdu_primary_plane_prepare_fb - Readies the display surface * @@ -952,13 +946,15 @@ vmw_stdu_primary_plane_prepare_fb(struct drm_plane *plane, enum stdu_content_type new_content_type; struct vmw_framebuffer_surface *new_vfbs; uint32_t hdisplay = new_state->crtc_w, vdisplay = new_state->crtc_h; + struct drm_plane_state *old_state = plane->state; + struct drm_rect rect; int ret; /* No FB to prepare */ if (!new_fb) { - if (vps->surf) { + if (vmw_user_object_surface(&vps->uo)) { WARN_ON(vps->pinned != 0); - vmw_surface_unreference(&vps->surf); + vmw_user_object_unref(&vps->uo); } return 0; @@ -968,8 +964,8 @@ vmw_stdu_primary_plane_prepare_fb(struct drm_plane *plane, new_vfbs = (vfb->bo) ? 
NULL : vmw_framebuffer_to_vfbs(new_fb); if (new_vfbs && - new_vfbs->surface->metadata.base_size.width == hdisplay && - new_vfbs->surface->metadata.base_size.height == vdisplay) + vmw_user_object_surface(&new_vfbs->uo)->metadata.base_size.width == hdisplay && + vmw_user_object_surface(&new_vfbs->uo)->metadata.base_size.height == vdisplay) new_content_type = SAME_AS_DISPLAY; else if (vfb->bo) new_content_type = SEPARATE_BO; @@ -1007,29 +1003,29 @@ vmw_stdu_primary_plane_prepare_fb(struct drm_plane *plane, metadata.num_sizes = 1; metadata.scanout = true; } else { - metadata = new_vfbs->surface->metadata; + metadata = vmw_user_object_surface(&new_vfbs->uo)->metadata; } metadata.base_size.width = hdisplay; metadata.base_size.height = vdisplay; metadata.base_size.depth = 1; - if (vps->surf) { + if (vmw_user_object_surface(&vps->uo)) { struct drm_vmw_size cur_base_size = - vps->surf->metadata.base_size; + vmw_user_object_surface(&vps->uo)->metadata.base_size; if (cur_base_size.width != metadata.base_size.width || cur_base_size.height != metadata.base_size.height || - vps->surf->metadata.format != metadata.format) { + vmw_user_object_surface(&vps->uo)->metadata.format != metadata.format) { WARN_ON(vps->pinned != 0); - vmw_surface_unreference(&vps->surf); + vmw_user_object_unref(&vps->uo); } } - if (!vps->surf) { + if (!vmw_user_object_surface(&vps->uo)) { ret = vmw_gb_surface_define(dev_priv, &metadata, - &vps->surf); + &vps->uo.surface); if (ret != 0) { DRM_ERROR("Couldn't allocate STDU surface.\n"); return ret; @@ -1042,18 +1038,19 @@ vmw_stdu_primary_plane_prepare_fb(struct drm_plane *plane, * The only time we add a reference in prepare_fb is if the * state object doesn't have a reference to begin with */ - if (vps->surf) { + if (vmw_user_object_surface(&vps->uo)) { WARN_ON(vps->pinned != 0); - vmw_surface_unreference(&vps->surf); + vmw_user_object_unref(&vps->uo); } - vps->surf = vmw_surface_reference(new_vfbs->surface); + memcpy(&vps->uo, &new_vfbs->uo, sizeof(vps->uo)); + vmw_user_object_ref(&vps->uo); } - if (vps->surf) { + if (vmw_user_object_surface(&vps->uo)) { /* Pin new surface before flipping */ - ret = vmw_resource_pin(&vps->surf->res, false); + ret = vmw_resource_pin(&vmw_user_object_surface(&vps->uo)->res, false); if (ret) goto out_srf_unref; @@ -1062,6 +1059,34 @@ vmw_stdu_primary_plane_prepare_fb(struct drm_plane *plane, vps->content_fb_type = new_content_type; + /* + * The drm fb code will do blit's via the vmap interface, which doesn't + * trigger vmw_bo page dirty tracking due to being kernel side (and thus + * doesn't require mmap'ing) so we have to update the surface's dirty + * regions by hand but we want to be careful to not overwrite the + * resource if it has been written to by the gpu (res_dirty). + */ + if (vps->uo.buffer && vps->uo.buffer->is_dumb) { + struct vmw_surface *surf = vmw_user_object_surface(&vps->uo); + struct vmw_resource *res = &surf->res; + + if (!res->res_dirty && drm_atomic_helper_damage_merged(old_state, + new_state, + &rect)) { + /* + * At some point it might be useful to actually translate + * (rect.x1, rect.y1) => start, and (rect.x2, rect.y2) => end, + * but currently the fb code will just report the entire fb + * dirty so in practice it doesn't matter. 
+ */ + pgoff_t start = res->guest_memory_offset >> PAGE_SHIFT; + pgoff_t end = __KERNEL_DIV_ROUND_UP(res->guest_memory_offset + + res->guest_memory_size, + PAGE_SIZE); + vmw_resource_dirty_update(res, start, end); + } + } + /* * This should only happen if the buffer object is too large to create a * proxy surface for. @@ -1072,7 +1097,7 @@ vmw_stdu_primary_plane_prepare_fb(struct drm_plane *plane, return 0; out_srf_unref: - vmw_surface_unreference(&vps->surf); + vmw_user_object_unref(&vps->uo); return ret; } @@ -1214,14 +1239,8 @@ static uint32_t vmw_stdu_surface_fifo_size_same_display(struct vmw_du_update_plane *update, uint32_t num_hits) { - struct vmw_framebuffer_surface *vfbs; uint32_t size = 0; - vfbs = container_of(update->vfb, typeof(*vfbs), base); - - if (vfbs->is_bo_proxy) - size += sizeof(struct vmw_stdu_update_gb_image) * num_hits; - size += sizeof(struct vmw_stdu_update); return size; @@ -1230,61 +1249,14 @@ vmw_stdu_surface_fifo_size_same_display(struct vmw_du_update_plane *update, static uint32_t vmw_stdu_surface_fifo_size(struct vmw_du_update_plane *update, uint32_t num_hits) { - struct vmw_framebuffer_surface *vfbs; uint32_t size = 0; - vfbs = container_of(update->vfb, typeof(*vfbs), base); - - if (vfbs->is_bo_proxy) - size += sizeof(struct vmw_stdu_update_gb_image) * num_hits; - size += sizeof(struct vmw_stdu_surface_copy) + sizeof(SVGA3dCopyBox) * num_hits + sizeof(struct vmw_stdu_update); return size; } -static uint32_t -vmw_stdu_surface_update_proxy(struct vmw_du_update_plane *update, void *cmd) -{ - struct vmw_framebuffer_surface *vfbs; - struct drm_plane_state *state = update->plane->state; - struct drm_plane_state *old_state = update->old_state; - struct vmw_stdu_update_gb_image *cmd_update = cmd; - struct drm_atomic_helper_damage_iter iter; - struct drm_rect clip; - uint32_t copy_size = 0; - - vfbs = container_of(update->vfb, typeof(*vfbs), base); - - /* - * proxy surface is special where a buffer object type fb is wrapped - * in a surface and need an update gb image command to sync with device. 
- */ - drm_atomic_helper_damage_iter_init(&iter, old_state, state); - drm_atomic_for_each_plane_damage(&iter, &clip) { - SVGA3dBox *box = &cmd_update->body.box; - - cmd_update->header.id = SVGA_3D_CMD_UPDATE_GB_IMAGE; - cmd_update->header.size = sizeof(cmd_update->body); - cmd_update->body.image.sid = vfbs->surface->res.id; - cmd_update->body.image.face = 0; - cmd_update->body.image.mipmap = 0; - - box->x = clip.x1; - box->y = clip.y1; - box->z = 0; - box->w = drm_rect_width(&clip); - box->h = drm_rect_height(&clip); - box->d = 1; - - copy_size += sizeof(*cmd_update); - cmd_update++; - } - - return copy_size; -} - static uint32_t vmw_stdu_surface_populate_copy(struct vmw_du_update_plane *update, void *cmd, uint32_t num_hits) @@ -1299,7 +1271,7 @@ vmw_stdu_surface_populate_copy(struct vmw_du_update_plane *update, void *cmd, cmd_copy->header.id = SVGA_3D_CMD_SURFACE_COPY; cmd_copy->header.size = sizeof(cmd_copy->body) + sizeof(SVGA3dCopyBox) * num_hits; - cmd_copy->body.src.sid = vfbs->surface->res.id; + cmd_copy->body.src.sid = vmw_user_object_surface(&vfbs->uo)->res.id; cmd_copy->body.dest.sid = stdu->display_srf->res.id; return sizeof(*cmd_copy); @@ -1370,10 +1342,7 @@ static int vmw_stdu_plane_update_surface(struct vmw_private *dev_priv, srf_update.mutex = &dev_priv->cmdbuf_mutex; srf_update.intr = true; - if (vfbs->is_bo_proxy) - srf_update.post_prepare = vmw_stdu_surface_update_proxy; - - if (vfbs->surface->res.id != stdu->display_srf->res.id) { + if (vmw_user_object_surface(&vfbs->uo)->res.id != stdu->display_srf->res.id) { srf_update.calc_fifo_size = vmw_stdu_surface_fifo_size; srf_update.pre_clip = vmw_stdu_surface_populate_copy; srf_update.clip = vmw_stdu_surface_populate_clip; @@ -1417,7 +1386,7 @@ vmw_stdu_primary_plane_atomic_update(struct drm_plane *plane, stdu = vmw_crtc_to_stdu(crtc); dev_priv = vmw_priv(crtc->dev); - stdu->display_srf = vps->surf; + stdu->display_srf = vmw_user_object_surface(&vps->uo); stdu->content_fb_type = vps->content_fb_type; stdu->cpp = vps->cpp; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index e7a744dfcecf..8ae6a761c900 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR MIT /************************************************************************** * - * Copyright 2009-2023 VMware, Inc., Palo Alto, CA., USA + * Copyright (c) 2009-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -36,9 +37,6 @@ #include #define SVGA3D_FLAGS_64(upper32, lower32) (((uint64_t)upper32 << 32) | lower32) -#define SVGA3D_FLAGS_UPPER_32(svga3d_flags) (svga3d_flags >> 32) -#define SVGA3D_FLAGS_LOWER_32(svga3d_flags) \ - (svga3d_flags & ((uint64_t)U32_MAX)) /** * struct vmw_user_surface - User-space visible surface resource @@ -686,6 +684,14 @@ static void vmw_user_surface_base_release(struct ttm_base_object **p_base) struct vmw_resource *res = &user_srf->srf.res; *p_base = NULL; + + /* + * Dumb buffers own the resource and they'll unref the + * resource themselves + */ + if (res && res->guest_memory_bo && res->guest_memory_bo->is_dumb) + return; + vmw_resource_unreference(&res); } @@ -812,7 +818,8 @@ int vmw_surface_define_ioctl(struct drm_device *dev, void *data, } } res->guest_memory_size = cur_bo_offset; - if (metadata->scanout && + if (!file_priv->atomic && + metadata->scanout && metadata->num_sizes == 1 && metadata->sizes[0].width == VMW_CURSOR_SNOOP_WIDTH && metadata->sizes[0].height == VMW_CURSOR_SNOOP_HEIGHT && @@ -864,6 +871,7 @@ int vmw_surface_define_ioctl(struct drm_device *dev, void *data, vmw_resource_unreference(&res); goto out_unlock; } + vmw_bo_add_detached_resource(res->guest_memory_bo, res); } tmp = vmw_resource_reference(&srf->res); @@ -892,6 +900,113 @@ int vmw_surface_define_ioctl(struct drm_device *dev, void *data, return ret; } +static struct vmw_user_surface * +vmw_lookup_user_surface_for_buffer(struct vmw_private *vmw, struct vmw_bo *bo, + u32 handle) +{ + struct vmw_user_surface *user_srf = NULL; + struct vmw_surface *surf; + struct ttm_base_object *base; + + surf = vmw_bo_surface(bo); + if (surf) { + rcu_read_lock(); + user_srf = container_of(surf, struct vmw_user_surface, srf); + base = &user_srf->prime.base; + if (base && !kref_get_unless_zero(&base->refcount)) { + drm_dbg_driver(&vmw->drm, + "%s: referencing a stale surface handle %d\n", + __func__, handle); + base = NULL; + user_srf = NULL; + } + rcu_read_unlock(); + } + + return user_srf; +} + +struct vmw_surface *vmw_lookup_surface_for_buffer(struct vmw_private *vmw, + struct vmw_bo *bo, + u32 handle) +{ + struct vmw_user_surface *user_srf = + vmw_lookup_user_surface_for_buffer(vmw, bo, handle); + struct vmw_surface *surf = NULL; + struct ttm_base_object *base; + + if (user_srf) { + surf = vmw_surface_reference(&user_srf->srf); + base = &user_srf->prime.base; + ttm_base_object_unref(&base); + } + return surf; +} + +u32 vmw_lookup_surface_handle_for_buffer(struct vmw_private *vmw, + struct vmw_bo *bo, + u32 handle) +{ + struct vmw_user_surface *user_srf = + vmw_lookup_user_surface_for_buffer(vmw, bo, handle); + int surf_handle = 0; + struct ttm_base_object *base; + + if (user_srf) { + base = &user_srf->prime.base; + surf_handle = (u32)base->handle; + ttm_base_object_unref(&base); + } + return surf_handle; +} + +static int vmw_buffer_prime_to_surface_base(struct vmw_private *dev_priv, + struct drm_file *file_priv, + u32 fd, u32 *handle, + struct ttm_base_object **base_p) +{ + struct ttm_base_object *base; + struct vmw_bo *bo; + struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; + struct vmw_user_surface *user_srf; + int ret; + + ret = drm_gem_prime_fd_to_handle(&dev_priv->drm, file_priv, fd, handle); + if (ret) { + drm_warn(&dev_priv->drm, + "Wasn't able to find user buffer for fd = %u.\n", fd); + return ret; + } + + ret = vmw_user_bo_lookup(file_priv, 
*handle, &bo); + if (ret) { + drm_warn(&dev_priv->drm, + "Wasn't able to lookup user buffer for handle = %u.\n", *handle); + return ret; + } + + user_srf = vmw_lookup_user_surface_for_buffer(dev_priv, bo, *handle); + if (WARN_ON(!user_srf)) { + drm_warn(&dev_priv->drm, + "User surface fd %d (handle %d) is null.\n", fd, *handle); + ret = -EINVAL; + goto out; + } + + base = &user_srf->prime.base; + ret = ttm_ref_object_add(tfile, base, NULL, false); + if (ret) { + drm_warn(&dev_priv->drm, + "Couldn't add an object ref for the buffer (%d).\n", *handle); + goto out; + } + + *base_p = base; +out: + vmw_user_bo_unref(&bo); + + return ret; +} static int vmw_surface_handle_reference(struct vmw_private *dev_priv, @@ -901,15 +1016,19 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv, struct ttm_base_object **base_p) { struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; - struct vmw_user_surface *user_srf; + struct vmw_user_surface *user_srf = NULL; uint32_t handle; struct ttm_base_object *base; int ret; if (handle_type == DRM_VMW_HANDLE_PRIME) { ret = ttm_prime_fd_to_handle(tfile, u_handle, &handle); - if (unlikely(ret != 0)) - return ret; + if (ret) + return vmw_buffer_prime_to_surface_base(dev_priv, + file_priv, + u_handle, + &handle, + base_p); } else { handle = u_handle; } @@ -1503,7 +1622,12 @@ vmw_gb_surface_define_internal(struct drm_device *dev, ret = vmw_user_bo_lookup(file_priv, req->base.buffer_handle, &res->guest_memory_bo); if (ret == 0) { - if (res->guest_memory_bo->tbo.base.size < res->guest_memory_size) { + if (res->guest_memory_bo->is_dumb) { + VMW_DEBUG_USER("Can't backup surface with a dumb buffer.\n"); + vmw_user_bo_unref(&res->guest_memory_bo); + ret = -EINVAL; + goto out_unlock; + } else if (res->guest_memory_bo->tbo.base.size < res->guest_memory_size) { VMW_DEBUG_USER("Surface backup buffer too small.\n"); vmw_user_bo_unref(&res->guest_memory_bo); ret = -EINVAL; @@ -1560,6 +1684,7 @@ vmw_gb_surface_define_internal(struct drm_device *dev, rep->handle = user_srf->prime.base.handle; rep->backup_size = res->guest_memory_size; if (res->guest_memory_bo) { + vmw_bo_add_detached_resource(res->guest_memory_bo, res); rep->buffer_map_handle = drm_vma_node_offset_addr(&res->guest_memory_bo->tbo.base.vma_node); rep->buffer_size = res->guest_memory_bo->tbo.base.size; @@ -2100,3 +2225,140 @@ int vmw_gb_surface_define(struct vmw_private *dev_priv, out_unlock: return ret; } + +static SVGA3dSurfaceFormat vmw_format_bpp_to_svga(struct vmw_private *vmw, + int bpp) +{ + switch (bpp) { + case 8: /* DRM_FORMAT_C8 */ + return SVGA3D_P8; + case 16: /* DRM_FORMAT_RGB565 */ + return SVGA3D_R5G6B5; + case 32: /* DRM_FORMAT_XRGB8888 */ + if (has_sm4_context(vmw)) + return SVGA3D_B8G8R8X8_UNORM; + return SVGA3D_X8R8G8B8; + default: + drm_warn(&vmw->drm, "Unsupported format bpp: %d\n", bpp); + return SVGA3D_X8R8G8B8; + } +} + +/** + * vmw_dumb_create - Create a dumb kms buffer + * + * @file_priv: Pointer to a struct drm_file identifying the caller. + * @dev: Pointer to the drm device. + * @args: Pointer to a struct drm_mode_create_dumb structure + * Return: Zero on success, negative error code on failure. + * + * This is a driver callback for the core drm create_dumb functionality. + * Note that this is very similar to the vmw_bo_alloc ioctl, except + * that the arguments have a different format. 
+ */ +int vmw_dumb_create(struct drm_file *file_priv, + struct drm_device *dev, + struct drm_mode_create_dumb *args) +{ + struct vmw_private *dev_priv = vmw_priv(dev); + struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; + struct vmw_bo *vbo = NULL; + struct vmw_resource *res = NULL; + union drm_vmw_gb_surface_create_ext_arg arg = { 0 }; + struct drm_vmw_gb_surface_create_ext_req *req = &arg.req; + int ret; + struct drm_vmw_size drm_size = { + .width = args->width, + .height = args->height, + .depth = 1, + }; + SVGA3dSurfaceFormat format = vmw_format_bpp_to_svga(dev_priv, args->bpp); + const struct SVGA3dSurfaceDesc *desc = vmw_surface_get_desc(format); + SVGA3dSurfaceAllFlags flags = SVGA3D_SURFACE_HINT_TEXTURE | + SVGA3D_SURFACE_HINT_RENDERTARGET | + SVGA3D_SURFACE_SCREENTARGET | + SVGA3D_SURFACE_BIND_SHADER_RESOURCE | + SVGA3D_SURFACE_BIND_RENDER_TARGET; + + /* + * Without mob support we're just going to use raw memory buffer + * because we wouldn't be able to support full surface coherency + * without mobs + */ + if (!dev_priv->has_mob) { + int cpp = DIV_ROUND_UP(args->bpp, 8); + + switch (cpp) { + case 1: /* DRM_FORMAT_C8 */ + case 2: /* DRM_FORMAT_RGB565 */ + case 4: /* DRM_FORMAT_XRGB8888 */ + break; + default: + /* + * Dumb buffers don't allow anything else. + * This is tested via IGT's dumb_buffers + */ + return -EINVAL; + } + + args->pitch = args->width * cpp; + args->size = ALIGN(args->pitch * args->height, PAGE_SIZE); + + ret = vmw_gem_object_create_with_handle(dev_priv, file_priv, + args->size, &args->handle, + &vbo); + /* drop reference from allocate - handle holds it now */ + drm_gem_object_put(&vbo->tbo.base); + return ret; + } + + req->version = drm_vmw_gb_surface_v1; + req->multisample_pattern = SVGA3D_MS_PATTERN_NONE; + req->quality_level = SVGA3D_MS_QUALITY_NONE; + req->buffer_byte_stride = 0; + req->must_be_zero = 0; + req->base.svga3d_flags = SVGA3D_FLAGS_LOWER_32(flags); + req->svga3d_flags_upper_32_bits = SVGA3D_FLAGS_UPPER_32(flags); + req->base.format = (uint32_t)format; + req->base.drm_surface_flags = drm_vmw_surface_flag_scanout; + req->base.drm_surface_flags |= drm_vmw_surface_flag_shareable; + req->base.drm_surface_flags |= drm_vmw_surface_flag_create_buffer; + req->base.drm_surface_flags |= drm_vmw_surface_flag_coherent; + req->base.base_size.width = args->width; + req->base.base_size.height = args->height; + req->base.base_size.depth = 1; + req->base.array_size = 0; + req->base.mip_levels = 1; + req->base.multisample_count = 0; + req->base.buffer_handle = SVGA3D_INVALID_ID; + req->base.autogen_filter = SVGA3D_TEX_FILTER_NONE; + ret = vmw_gb_surface_define_ext_ioctl(dev, &arg, file_priv); + if (ret) { + drm_warn(dev, "Unable to create a dumb buffer\n"); + return ret; + } + + args->handle = arg.rep.buffer_handle; + args->size = arg.rep.buffer_size; + args->pitch = vmw_surface_calculate_pitch(desc, &drm_size); + + ret = vmw_user_resource_lookup_handle(dev_priv, tfile, arg.rep.handle, + user_surface_converter, + &res); + if (ret) { + drm_err(dev, "Created resource handle doesn't exist!\n"); + goto err; + } + + vbo = res->guest_memory_bo; + vbo->is_dumb = true; + vbo->dumb_surface = vmw_res_to_srf(res); + +err: + if (res) + vmw_resource_unreference(&res); + if (ret) + ttm_ref_object_base_unref(tfile, arg.rep.handle); + + return ret; +} From 12bed149a3460bb6efbca697b4a46e11c819db86 Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Mon, 22 Jul 2024 14:41:16 -0400 Subject: [PATCH 0063/1052] drm/vmwgfx: Add basic support for external buffers Make vmwgfx go 
through the dma-buf interface to map/unmap imported buffers. The driver used to try to directly manipulate external buffers, assuming that everything that was coming to it had to live in cpu accessible memory. While technically true because what's in the vms is controlled by us, it's semantically completely broken. Fix importing of external buffers by forwarding all memory access requests to the importer. Tested by the vmw_prime basic_vgem test. Signed-off-by: Zack Rusin Reviewed-by: Maaz Mombasawala Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240722184313.181318-5-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_gem.c | 62 +++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c index 07185c108218..b9857f37ca1a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 OR MIT */ /* - * Copyright 2021-2023 VMware, Inc. + * Copyright (c) 2021-2024 Broadcom. All Rights Reserved. The term + * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -78,6 +79,59 @@ static struct sg_table *vmw_gem_object_get_sg_table(struct drm_gem_object *obj) return drm_prime_pages_to_sg(obj->dev, vmw_tt->dma_ttm.pages, vmw_tt->dma_ttm.num_pages); } +static int vmw_gem_vmap(struct drm_gem_object *obj, struct iosys_map *map) +{ + struct ttm_buffer_object *bo = drm_gem_ttm_of_gem(obj); + int ret; + + if (obj->import_attach) { + ret = dma_buf_vmap(obj->import_attach->dmabuf, map); + if (!ret) { + if (drm_WARN_ON(obj->dev, map->is_iomem)) { + dma_buf_vunmap(obj->import_attach->dmabuf, map); + return -EIO; + } + } + } else { + ret = ttm_bo_vmap(bo, map); + } + + return ret; +} + +static void vmw_gem_vunmap(struct drm_gem_object *obj, struct iosys_map *map) +{ + if (obj->import_attach) + dma_buf_vunmap(obj->import_attach->dmabuf, map); + else + drm_gem_ttm_vunmap(obj, map); +} + +static int vmw_gem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) +{ + int ret; + + if (obj->import_attach) { + /* + * Reset both vm_ops and vm_private_data, so we don't end up with + * vm_ops pointing to our implementation if the dma-buf backend + * doesn't set those fields. + */ + vma->vm_private_data = NULL; + vma->vm_ops = NULL; + + ret = dma_buf_mmap(obj->dma_buf, vma, 0); + + /* Drop the reference drm_gem_mmap_obj() acquired.*/ + if (!ret) + drm_gem_object_put(obj); + + return ret; + } + + return drm_gem_ttm_mmap(obj, vma); +} + static const struct vm_operations_struct vmw_vm_ops = { .pfn_mkwrite = vmw_bo_vm_mkwrite, .page_mkwrite = vmw_bo_vm_mkwrite, @@ -94,9 +148,9 @@ static const struct drm_gem_object_funcs vmw_gem_object_funcs = { .pin = vmw_gem_object_pin, .unpin = vmw_gem_object_unpin, .get_sg_table = vmw_gem_object_get_sg_table, - .vmap = drm_gem_ttm_vmap, - .vunmap = drm_gem_ttm_vunmap, - .mmap = drm_gem_ttm_mmap, + .vmap = vmw_gem_vmap, + .vunmap = vmw_gem_vunmap, + .mmap = vmw_gem_mmap, .vm_ops = &vmw_vm_ops, }; From cb372a505a994cb39aa75acfb8b3bcf94787cf94 Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Fri, 19 Jul 2024 11:36:27 -0500 Subject: [PATCH 0064/1052] drm/vmwgfx: Fix overlay when using Screen Targets This code was never updated to support Screen Targets. 
Fixes a bug where Xv playback displays a green screen instead of the
actual video contents when 3D acceleration is disabled in the guest.

Fixes: c8261a961ece ("vmwgfx: Major KMS refactoring / cleanup in preparation of screen targets")
Reported-by: Doug Brown
Closes: https://lore.kernel.org/all/bd9cb3c7-90e8-435d-bc28-0e38fee58977@schmorgal.com
Signed-off-by: Ian Forbes
Tested-by: Doug Brown
Signed-off-by: Zack Rusin
Link: https://patchwork.freedesktop.org/patch/msgid/20240719163627.20888-1-ian.forbes@broadcom.com
---
 drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c
index c45b4724e414..e20f64b67b26 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c
@@ -92,7 +92,7 @@ static int vmw_overlay_send_put(struct vmw_private *dev_priv,
 {
 	struct vmw_escape_video_flush *flush;
 	size_t fifo_size;
-	bool have_so = (dev_priv->active_display_unit == vmw_du_screen_object);
+	bool have_so = (dev_priv->active_display_unit != vmw_du_legacy);
 	int i, num_items;
 	SVGAGuestPtr ptr;

From f333a3c7e8323499aa65038e77fe8f3199d4e283 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 15 Jul 2024 16:07:07 +0930
Subject: [PATCH 0065/1052] btrfs: tree-checker: validate dref root and objectid

[CORRUPTION]
There is a bug report that btrfs flips RO due to a corruption in the
extent tree; the involved dumps look like this:

  item 188 key (402811572224 168 4096) itemoff 14598 itemsize 79
    extent refs 3 gen 3678544 flags 1
    ref#0: extent data backref root 13835058055282163977 objectid 281473384125923 offset 81432576 count 1
    ref#1: shared data backref parent 1947073626112 count 1
    ref#2: shared data backref parent 1156030103552 count 1

  BTRFS critical (device vdc1: state EA): unable to find ref byte nr 402811572224 parent 0 root 265 owner 28703026 offset 81432576 slot 189
  BTRFS error (device vdc1: state EA): failed to run delayed ref for logical 402811572224 num_bytes 4096 type 178 action 2 ref_mod 1: -2

[CAUSE]
The corrupted entry is ref#0 of item 188. The root number
13835058055282163977 is beyond the upper limit for root items (the
current limit is 1 << 48), and the objectid also looks suspicious.
Only the offset and count are correct.

[ENHANCEMENT]
Although it is still unknown why so many bytes got corrupted randomly,
we can still enhance the tree-checker for data backrefs by:

- Validating the root value
  For now only 3 types of roots can have a data backref:
  * subvolume trees
  * data reloc trees
  * root tree (only for v1 space cache)

- Validating the objectid value
  The objectid should be a valid inode number.

Hopefully we can catch such problems in the future with the new
checkers.
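As a rough illustration of the two rules just listed (not part of the kernel
change itself), the standalone userspace sketch below re-implements the checks
and feeds them the values from the dump in [CORRUPTION]. The BTRFS_* constants
and the simplified is_fstree() are assumptions modelled on the btrfs headers,
not code taken from this patch:

/* Standalone sketch; constants and is_fstree() are hand-copied approximations. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BTRFS_ROOT_TREE_OBJECTID	1ULL
#define BTRFS_FS_TREE_OBJECTID		5ULL
#define BTRFS_DATA_RELOC_TREE_OBJECTID	((uint64_t)-9)
#define BTRFS_FIRST_FREE_OBJECTID	256ULL
#define BTRFS_LAST_FREE_OBJECTID	((uint64_t)-256)

/* Subvolume ids live in the low 48 bits; the top 16 bits are the qgroup level. */
static bool is_fstree(uint64_t rootid)
{
	return rootid == BTRFS_FS_TREE_OBJECTID ||
	       (rootid >= BTRFS_FIRST_FREE_OBJECTID && (rootid >> 48) == 0);
}

static bool is_valid_dref_root(uint64_t rootid)
{
	return is_fstree(rootid) ||
	       rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
	       rootid == BTRFS_ROOT_TREE_OBJECTID;
}

static bool is_valid_dref_objectid(uint64_t objectid)
{
	return objectid >= BTRFS_FIRST_FREE_OBJECTID &&
	       objectid <= BTRFS_LAST_FREE_OBJECTID;
}

int main(void)
{
	/* Values from ref#0 of item 188 in the dump above. */
	uint64_t root = 13835058055282163977ULL;
	uint64_t objectid = 281473384125923ULL;

	/* Rejected: bits above bit 47 are set and it is not the root or data reloc tree. */
	printf("root     %llu valid: %d\n",
	       (unsigned long long)root, is_valid_dref_root(root));
	/* The plain inode-number range check alone does not reject this value. */
	printf("objectid %llu valid: %d\n",
	       (unsigned long long)objectid, is_valid_dref_objectid(objectid));
	return 0;
}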
Reported-by: Kai Krakow Link: https://lore.kernel.org/linux-btrfs/CAMthOuPjg5RDT-G_LXeBBUUtzt3cq=JywF+D1_h+JYxe=WKp-Q@mail.gmail.com/#t Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 47 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 6388786fd8b5..18bb0c930db4 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1289,6 +1289,19 @@ static void extent_err(const struct extent_buffer *eb, int slot, va_end(args); } +static bool is_valid_dref_root(u64 rootid) +{ + /* + * The following tree root objectids are allowed to have a data backref: + * - subvolume trees + * - data reloc tree + * - tree root + * For v1 space cache + */ + return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || + rootid == BTRFS_ROOT_TREE_OBJECTID; +} + static int check_extent_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot, struct btrfs_key *prev_key) @@ -1441,6 +1454,8 @@ static int check_extent_item(struct extent_buffer *leaf, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u64 seq; + u64 dref_root; + u64 dref_objectid; u64 dref_offset; u64 inline_offset; u8 inline_type; @@ -1484,11 +1499,26 @@ static int check_extent_item(struct extent_buffer *leaf, */ case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); + dref_root = btrfs_extent_data_ref_root(leaf, dref); + dref_objectid = btrfs_extent_data_ref_objectid(leaf, dref); dref_offset = btrfs_extent_data_ref_offset(leaf, dref); seq = hash_extent_data_ref( btrfs_extent_data_ref_root(leaf, dref), btrfs_extent_data_ref_objectid(leaf, dref), btrfs_extent_data_ref_offset(leaf, dref)); + if (unlikely(!is_valid_dref_root(dref_root))) { + extent_err(leaf, slot, + "invalid data ref root value %llu", + dref_root); + return -EUCLEAN; + } + if (unlikely(dref_objectid < BTRFS_FIRST_FREE_OBJECTID || + dref_objectid > BTRFS_LAST_FREE_OBJECTID)) { + extent_err(leaf, slot, + "invalid data ref objectid value %llu", + dref_root); + return -EUCLEAN; + } if (unlikely(!IS_ALIGNED(dref_offset, fs_info->sectorsize))) { extent_err(leaf, slot, @@ -1627,6 +1657,8 @@ static int check_extent_data_ref(struct extent_buffer *leaf, return -EUCLEAN; } for (; ptr < end; ptr += sizeof(*dref)) { + u64 root; + u64 objectid; u64 offset; /* @@ -1634,7 +1666,22 @@ static int check_extent_data_ref(struct extent_buffer *leaf, * overflow from the leaf due to hash collisions. 
*/ dref = (struct btrfs_extent_data_ref *)ptr; + root = btrfs_extent_data_ref_root(leaf, dref); + objectid = btrfs_extent_data_ref_objectid(leaf, dref); offset = btrfs_extent_data_ref_offset(leaf, dref); + if (unlikely(!is_valid_dref_root(root))) { + extent_err(leaf, slot, + "invalid extent data backref root value %llu", + root); + return -EUCLEAN; + } + if (unlikely(objectid < BTRFS_FIRST_FREE_OBJECTID || + objectid > BTRFS_LAST_FREE_OBJECTID)) { + extent_err(leaf, slot, + "invalid extent data backref objectid value %llu", + root); + return -EUCLEAN; + } if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid extent data backref offset, have %llu expect aligned to %u", From de9f46cb0044a9b9f825d7695ae235863461dc00 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 11 Jul 2024 13:33:23 +0100 Subject: [PATCH 0066/1052] btrfs: fix corrupt read due to bad offset of a compressed extent map If we attempt to insert a compressed extent map that has a range that overlaps another extent map we have in the inode's extent map tree, we can end up with an incorrect offset after adjusting the new extent map at merge_extent_mapping() because we don't update the extent map's offset. For example consider the following scenario: 1) We have a file extent item for a compressed extent covering the file range [108K, 144K) and currently there's no corresponding extent map in the inode's extent map tree; 2) The inode's size is 141K; 3) We have an encoded write (compressed) into the file range [120K, 128K), which overlaps the existing file extent item. The encoded write creates a matching extent map, adds it to the inode's extent map tree and creates an ordered extent for it. Note that the corresponding file extent item is added to the subvolume tree only when the ordered extent completes (when executing btrfs_finish_one_ordered()); 4) We have a write into the file range [160K, 164K). This writes increases the i_size of the file, and there's a hole between the current i_size (141K) and the start offset of this write, and since the old i_size is in the middle of the block [140K, 144K), we have to write zeroes to the range [141K, 144K) (3072 bytes) and therefore dirty that page. We then call btrfs_set_extent_delalloc() with a start offset of 140K. We then end up at btrfs_find_new_delalloc_bytes() which will call btrfs_get_extent() for the range [140K, 144K); 5) The btrfs_get_extent() doesn't find any extent map in the inode's extent map tree covering the range [140K, 144K), so it searches the subvolume tree for any file extent items covering that range. There it finds the file extent item for the range [108K, 144K), creates a compressed extent map for that range and then calls btrfs_add_extent_mapping() with that extent map and passes the range [140K, 144K) via the "start" and "len" parameters; 6) The call to add_extent_mapping() done by btrfs_add_extent_mapping() fails with -EEXIST because there's an extent map, created at step 2 for the [120K, 128K) range, that covers that overlaps with the range of the given extent map ([108K, 144K)). Then it does a lookup for extent map from step 2 add calls merge_extent_mapping() to adjust the input extent map ([108K, 144K)). That adjust the extent map to a start offset of 128K and a length of 16K (starting just after the extent map from step 2), but it does not update the offset field of the extent map, leaving it with a value of zero instead of updating to a value of 20K (128K - 108K = 20K). 
As a result any read for the range [128K, 144K) can return incorrect data since we read from a wrong section of the extent (unless both the correct and incorrect ranges happen to have the same data). So fix this by changing merge_extent_mapping() to update the extent map's offset even if it's compressed. Also add a test case to the self tests. This didn't happen before the patchset that does big changes in the extent map structure (which includes the commit in the Fixes tag below) because we kept track of the original start offset in the extent map (member "orig_start") so we could always calculate the correct offset by subtracting that offset from the start offset. A test case for fstests that triggered this problem using send/receive with compressed writes will be added soon. Fixes: 3d2ac9922465 ("btrfs: introduce new members for extent_map") Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 2 +- fs/btrfs/tests/extent-map-tests.c | 99 +++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6961cc73fe3f..b180d631545b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -664,7 +664,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, start_diff = start - em->start; em->start = start; em->len = end - start; - if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE && !extent_map_is_compressed(em)) + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) em->offset += start_diff; return add_extent_mapping(inode, em, 0); } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index ebec4ab361b8..56e61ac1cc64 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -900,6 +900,102 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) return ret; } +/* + * Test a regression for compressed extent map adjustment when we attempt to + * add an extent map that is partially overlapped by another existing extent + * map. The resulting extent map offset was left unchanged despite having + * incremented its start offset. + */ +static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) +{ + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + int ret; + int ret2; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + /* Compressed extent for the file range [120K, 128K). */ + em->start = SZ_1K * 120; + em->len = SZ_8K; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_8K; + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("couldn't add extent map for range [120K, 128K)"); + goto out; + } + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* + * Compressed extent for the file range [108K, 144K), which overlaps + * with the [120K, 128K) we previously inserted. + */ + em->start = SZ_1K * 108; + em->len = SZ_1K * 36; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_1K * 36; + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + + /* + * Try to add the extent map but with a search range of [140K, 144K), + * this should succeed and adjust the extent map to the range + * [128K, 144K), with a length of 16K and an offset of 20K. 
+ * + * This simulates a scenario where in the subvolume tree of an inode we + * have a compressed file extent item for the range [108K, 144K) and we + * have an overlapping compressed extent map for the range [120K, 128K), + * which was created by an encoded write, but its ordered extent was not + * yet completed, so the subvolume tree doesn't have yet the file extent + * item for that range - we only have the extent map in the inode's + * extent map tree. + */ + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("couldn't add extent map for range [108K, 144K)"); + goto out; + } + + if (em->start != SZ_128K) { + test_err("unexpected extent map start %llu (should be 128K)", em->start); + ret = -EINVAL; + goto out; + } + if (em->len != SZ_16K) { + test_err("unexpected extent map length %llu (should be 16K)", em->len); + ret = -EINVAL; + goto out; + } + if (em->offset != SZ_1K * 20) { + test_err("unexpected extent map offset %llu (should be 20K)", em->offset); + ret = -EINVAL; + goto out; + } +out: + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + + return ret; +} + struct rmap_test_vector { u64 raid_type; u64 physical_start; @@ -1076,6 +1172,9 @@ int btrfs_test_extent_map(void) if (ret) goto out; ret = test_case_7(fs_info, BTRFS_I(inode)); + if (ret) + goto out; + ret = test_case_8(fs_info, BTRFS_I(inode)); if (ret) goto out; From 98ba1d931f611e8f8f519c0405fa0a1a76554bfa Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Wed, 24 Jul 2024 15:21:06 -0700 Subject: [PATCH 0067/1052] bnxt_en: Fix RSS logic in __bnxt_reserve_rings() In __bnxt_reserve_rings(), the existing code unconditionally sets the default RSS indirection table to default if netif_is_rxfh_configured() returns false. This used to be correct before we added RSS contexts support. For example, if the user is changing the number of ethtool channels, we will enter this path to reserve the new number of rings. We will then set the RSS indirection table to default to cover the new number of rings if netif_is_rxfh_configured() is false. Now, with RSS contexts support, if the user has added or deleted RSS contexts, we may now enter this path to reserve the new number of VNICs. However, netif_is_rxfh_configured() will not return the correct state if we are still in the middle of set_rxfh(). So the existing code may set the indirection table of the default RSS context to default by mistake. Fix it to check if the reservation of the RX rings is changing. Only check netif_is_rxfh_configured() if it is changing. RX rings will not change in the middle of set_rxfh() and this will fix the issue. 
Fixes: b3d0083caf9a ("bnxt_en: Support RSS contexts in ethtool .{get|set}_rxfh()") Reported-and-tested-by: Jakub Kicinski Link: https://lore.kernel.org/20240625010210.2002310-1-kuba@kernel.org Reviewed-by: Andy Gospodarek Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://patch.msgid.link/20240724222106.147744-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index ffa74c26ee53..23f74c6c88b9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7649,8 +7649,8 @@ static int bnxt_get_avail_msix(struct bnxt *bp, int num); static int __bnxt_reserve_rings(struct bnxt *bp) { struct bnxt_hw_rings hwr = {0}; + int rx_rings, old_rx_rings, rc; int cp = bp->cp_nr_rings; - int rx_rings, rc; int ulp_msix = 0; bool sh = false; int tx_cp; @@ -7684,6 +7684,7 @@ static int __bnxt_reserve_rings(struct bnxt *bp) hwr.grp = bp->rx_nr_rings; hwr.rss_ctx = bnxt_get_total_rss_ctxs(bp, &hwr); hwr.stat = bnxt_get_func_stat_ctxs(bp); + old_rx_rings = bp->hw_resc.resv_rx_rings; rc = bnxt_hwrm_reserve_rings(bp, &hwr); if (rc) @@ -7738,7 +7739,8 @@ static int __bnxt_reserve_rings(struct bnxt *bp) if (!bnxt_rings_ok(bp, &hwr)) return -ENOMEM; - if (!netif_is_rxfh_configured(bp->dev)) + if (old_rx_rings != bp->hw_resc.resv_rx_rings && + !netif_is_rxfh_configured(bp->dev)) bnxt_set_dflt_rss_indir_tbl(bp, NULL); if (!bnxt_ulp_registered(bp->edev) && BNXT_NEW_RM(bp)) { From a40c7a24f97edda025f53cfe8f0bc6a6e3c12fa6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jul 2024 16:42:48 -0700 Subject: [PATCH 0068/1052] netlink: specs: correct the spec of ethtool The spec for Ethtool is a bit inaccurate. We don't currently support dump. Context is only accepted as input and not echoed to output (which is a separate bug). Fixes: a353318ebf24 ("tools: ynl: populate most of the ethtool spec") Acked-by: Paolo Abeni Reviewed-by: Joe Damato Link: https://patch.msgid.link/20240724234249.2621109-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 495e35fcfb21..ebbd8dd96b5c 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1753,15 +1753,14 @@ operations: request: attributes: - header + - context reply: attributes: - header - - context - hfunc - indir - hkey - input_xfrm - dump: *rss-get-op - name: plca-get-cfg doc: Get PLCA params. From f96aae91b0d260f682e630e092ef70a05a718a43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jul 2024 16:42:49 -0700 Subject: [PATCH 0069/1052] ethtool: rss: echo the context number back The response to a GET request in Netlink should fully identify the queried object. RSS_GET accepts context id as an input, so it must echo that attribute back to the response. After (assuming context 1 has been created): $ ./cli.py --spec netlink/specs/ethtool.yaml \ --do rss-get \ --json '{"header": {"dev-index": 2}, "context": 1}' {'context': 1, 'header': {'dev-index': 2, 'dev-name': 'eth0'}, [...] 
Fixes: 7112a04664bf ("ethtool: add netlink based get rss support") Acked-by: Paolo Abeni Reviewed-by: Joe Damato Link: https://patch.msgid.link/20240724234249.2621109-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 1 + Documentation/networking/ethtool-netlink.rst | 1 + net/ethtool/rss.c | 8 +++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index ebbd8dd96b5c..ea21fe135b97 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1757,6 +1757,7 @@ operations: reply: attributes: - header + - context - hfunc - indir - hkey diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 3ab423b80e91..d5f246aceb9f 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1875,6 +1875,7 @@ Kernel response contents: ===================================== ====== ========================== ``ETHTOOL_A_RSS_HEADER`` nested reply header + ``ETHTOOL_A_RSS_CONTEXT`` u32 context number ``ETHTOOL_A_RSS_HFUNC`` u32 RSS hash func ``ETHTOOL_A_RSS_INDIR`` binary Indir table bytes ``ETHTOOL_A_RSS_HKEY`` binary Hash key bytes diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 71679137eff2..5c4c4505ab9a 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -111,7 +111,8 @@ rss_reply_size(const struct ethnl_req_info *req_base, const struct rss_reply_data *data = RSS_REPDATA(reply_base); int len; - len = nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ + len = nla_total_size(sizeof(u32)) + /* _RSS_CONTEXT */ + nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ nla_total_size(sizeof(u32)) + /* _RSS_INPUT_XFRM */ nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ nla_total_size(data->hkey_size); /* _RSS_HKEY */ @@ -124,6 +125,11 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rss_reply_data *data = RSS_REPDATA(reply_base); + struct rss_req_info *request = RSS_REQINFO(req_base); + + if (request->rss_context && + nla_put_u32(skb, ETHTOOL_A_RSS_CONTEXT, request->rss_context)) + return -EMSGSIZE; if ((data->hfunc && nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || From 75c3e8a26a35d4f3eee299b3cc7e465f166f4e2d Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Mon, 24 Jun 2024 15:59:51 -0500 Subject: [PATCH 0070/1052] drm/vmwgfx: Trigger a modeset when the screen moves When multi-monitor is cycled the X,Y position of the Screen Target will likely change but the resolution will not. We need to trigger a modeset when this occurs in order to recreate the Screen Target with the correct X,Y position. Fixes a bug where multiple displays are shown in a single scrollable host window rather than in 2+ windows on separate host displays. 
Fixes: 426826933109 ("drm/vmwgfx: Filter modes which exceed graphics memory") Signed-off-by: Ian Forbes Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20240624205951.23343-1-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 29 +++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index 5106413c14b7..5453f7cf0e2d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -873,6 +873,32 @@ vmw_stdu_connector_mode_valid(struct drm_connector *connector, return MODE_OK; } +/* + * Trigger a modeset if the X,Y position of the Screen Target changes. + * This is needed when multi-mon is cycled. The original Screen Target will have + * the same mode but its relative X,Y position in the topology will change. + */ +static int vmw_stdu_connector_atomic_check(struct drm_connector *conn, + struct drm_atomic_state *state) +{ + struct drm_connector_state *conn_state; + struct vmw_screen_target_display_unit *du; + struct drm_crtc_state *new_crtc_state; + + conn_state = drm_atomic_get_connector_state(state, conn); + du = vmw_connector_to_stdu(conn); + + if (!conn_state->crtc) + return 0; + + new_crtc_state = drm_atomic_get_new_crtc_state(state, conn_state->crtc); + if (du->base.gui_x != du->base.set_gui_x || + du->base.gui_y != du->base.set_gui_y) + new_crtc_state->mode_changed = true; + + return 0; +} + static const struct drm_connector_funcs vmw_stdu_connector_funcs = { .dpms = vmw_du_connector_dpms, .detect = vmw_du_connector_detect, @@ -887,7 +913,8 @@ static const struct drm_connector_funcs vmw_stdu_connector_funcs = { static const struct drm_connector_helper_funcs vmw_stdu_connector_helper_funcs = { .get_modes = vmw_connector_get_modes, - .mode_valid = vmw_stdu_connector_mode_valid + .mode_valid = vmw_stdu_connector_mode_valid, + .atomic_check = vmw_stdu_connector_atomic_check, }; From 0823dc64586ba5ea13a7d200a5d33e4c5fa45950 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 1 Jul 2024 11:31:59 +0800 Subject: [PATCH 0071/1052] vhost-vdpa: switch to use vmf_insert_pfn() in the fault handler remap_pfn_page() should not be called in the fault handler as it may change the vma->flags which may trigger lockdep warning since the vma write lock is not held. Actually there's no need to modify the vma->flags as it has been set in the mmap(). So this patch switches to use vmf_insert_pfn() instead. Reported-by: Dragos Tatulea Tested-by: Dragos Tatulea Fixes: ddd89d0a059d ("vhost_vdpa: support doorbell mapping via mmap") Cc: stable@vger.kernel.org Signed-off-by: Jason Wang Message-Id: <20240701033159.18133-1-jasowang@redhat.com> Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Michal Kubiak --- drivers/vhost/vdpa.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 63a53680a85c..6b9c12acf438 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1483,13 +1483,7 @@ static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf) notify = ops->get_vq_notification(vdpa, index); - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (remap_pfn_range(vma, vmf->address & PAGE_MASK, - PFN_DOWN(notify.addr), PAGE_SIZE, - vma->vm_page_prot)) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; + return vmf_insert_pfn(vma, vmf->address & PAGE_MASK, PFN_DOWN(notify.addr)); } static const struct vm_operations_struct vhost_vdpa_vm_ops = { From 08f3a5c38087d1569e982a121aad1e6acbf145ce Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Thu, 25 Jul 2024 10:29:42 +0800 Subject: [PATCH 0072/1052] net: usb: sr9700: fix uninitialized variable use in sr_mdio_read It could lead to error happen because the variable res is not updated if the call to sr_share_read_word returns an error. In this particular case error code was returned and res stayed uninitialized. Same issue also applies to sr_read_reg. This can be avoided by checking the return value of sr_share_read_word and sr_read_reg, and propagating the error if the read operation failed. Found by code review. Cc: stable@vger.kernel.org Fixes: c9b37458e956 ("USB2NET : SR9700 : One chip USB 1.1 USB2NET SR9700Device Driver Support") Signed-off-by: Ma Ke Reviewed-by: Shigeru Yoshida Reviewed-by: Hariprasad Kelam Signed-off-by: David S. Miller --- drivers/net/usb/sr9700.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c index 0a662e42ed96..cb7d2f798fb4 100644 --- a/drivers/net/usb/sr9700.c +++ b/drivers/net/usb/sr9700.c @@ -179,6 +179,7 @@ static int sr_mdio_read(struct net_device *netdev, int phy_id, int loc) struct usbnet *dev = netdev_priv(netdev); __le16 res; int rc = 0; + int err; if (phy_id) { netdev_dbg(netdev, "Only internal phy supported\n"); @@ -189,11 +190,17 @@ static int sr_mdio_read(struct net_device *netdev, int phy_id, int loc) if (loc == MII_BMSR) { u8 value; - sr_read_reg(dev, SR_NSR, &value); + err = sr_read_reg(dev, SR_NSR, &value); + if (err < 0) + return err; + if (value & NSR_LINKST) rc = 1; } - sr_share_read_word(dev, 1, loc, &res); + err = sr_share_read_word(dev, 1, loc, &res); + if (err < 0) + return err; + if (rc == 1) res = le16_to_cpu(res) | BMSR_LSTATUS; else From 2191a54f63225b548fd8346be3611c3219a24738 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Jul 2024 09:27:45 +0000 Subject: [PATCH 0073/1052] sched: act_ct: take care of padding in struct zones_ht_key Blamed commit increased lookup key size from 2 bytes to 16 bytes, because zones_ht_key got a struct net pointer. Make sure rhashtable_lookup() is not using the padding bytes which are not initialized. 
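As a stand-alone illustration of the problem class (hypothetical struct, not the kernel code as such): an 8-byte pointer followed by a u16 leaves 6 bytes of compiler padding at the end of the struct, and a key_len of sizeof(struct ...) makes rhashtable hash and compare those never-initialized bytes, while bounding key_len with a zero-size trailing marker excludes them:

  struct ht_key {
          void *net;   /* 8 bytes                             */
          u16 zone;    /* 2 bytes + 6 bytes of tail padding   */
          u8 pad[];    /* offsetof(struct ht_key, pad) == 10  */
  };

  /* .key_len = sizeof(struct ht_key)         -> 16, padding included */
  /* .key_len = offsetof(struct ht_key, pad)  -> 10, padding excluded */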
BUG: KMSAN: uninit-value in rht_ptr_rcu include/linux/rhashtable.h:376 [inline] BUG: KMSAN: uninit-value in __rhashtable_lookup include/linux/rhashtable.h:607 [inline] BUG: KMSAN: uninit-value in rhashtable_lookup include/linux/rhashtable.h:646 [inline] BUG: KMSAN: uninit-value in rhashtable_lookup_fast include/linux/rhashtable.h:672 [inline] BUG: KMSAN: uninit-value in tcf_ct_flow_table_get+0x611/0x2260 net/sched/act_ct.c:329 rht_ptr_rcu include/linux/rhashtable.h:376 [inline] __rhashtable_lookup include/linux/rhashtable.h:607 [inline] rhashtable_lookup include/linux/rhashtable.h:646 [inline] rhashtable_lookup_fast include/linux/rhashtable.h:672 [inline] tcf_ct_flow_table_get+0x611/0x2260 net/sched/act_ct.c:329 tcf_ct_init+0xa67/0x2890 net/sched/act_ct.c:1408 tcf_action_init_1+0x6cc/0xb30 net/sched/act_api.c:1425 tcf_action_init+0x458/0xf00 net/sched/act_api.c:1488 tcf_action_add net/sched/act_api.c:2061 [inline] tc_ctl_action+0x4be/0x19d0 net/sched/act_api.c:2118 rtnetlink_rcv_msg+0x12fc/0x1410 net/core/rtnetlink.c:6647 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2550 rtnetlink_rcv+0x34/0x40 net/core/rtnetlink.c:6665 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0xf52/0x1260 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x10da/0x11e0 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 ____sys_sendmsg+0x877/0xb60 net/socket.c:2597 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2651 __sys_sendmsg net/socket.c:2680 [inline] __do_sys_sendmsg net/socket.c:2689 [inline] __se_sys_sendmsg net/socket.c:2687 [inline] __x64_sys_sendmsg+0x307/0x4a0 net/socket.c:2687 x64_sys_call+0x2dd6/0x3c10 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Local variable key created at: tcf_ct_flow_table_get+0x4a/0x2260 net/sched/act_ct.c:324 tcf_ct_init+0xa67/0x2890 net/sched/act_ct.c:1408 Fixes: 88c67aeb1407 ("sched: act_ct: add netns into the key of tcf_ct_flow_table") Reported-by: syzbot+1b5e4e187cc586d05ea0@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Xin Long Reviewed-by: Simon Horman Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sched/act_ct.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 113b907da0f7..3ba8e7e739b5 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -44,6 +44,8 @@ static DEFINE_MUTEX(zones_mutex); struct zones_ht_key { struct net *net; u16 zone; + /* Note : pad[] must be the last field. */ + u8 pad[]; }; struct tcf_ct_flow_table { @@ -60,7 +62,7 @@ struct tcf_ct_flow_table { static const struct rhashtable_params zones_params = { .head_offset = offsetof(struct tcf_ct_flow_table, node), .key_offset = offsetof(struct tcf_ct_flow_table, key), - .key_len = sizeof_field(struct tcf_ct_flow_table, key), + .key_len = offsetof(struct zones_ht_key, pad), .automatic_shrinking = true, }; From 8f4fa0876231c426f880a2bff25ac49fac67d805 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 25 Jul 2024 18:48:36 +0200 Subject: [PATCH 0074/1052] wifi: mac80211: use monitor sdata with driver only if desired In commit 0d9c2beed116 ("wifi: mac80211: fix monitor channel with chanctx emulation") I changed mac80211 to always have an internal monitor_sdata to have something to have the chanctx bound to. 
However, if the driver didn't also have the WANT_MONITOR flag this would cause mac80211 to allocate it without telling the driver (which was intentional) but also use it for later APIs to the driver without it ever having known about it which was _not_ intentional. Check through the code and only use the monitor_sdata in the relevant places (TX, MU-MIMO follow settings, TX power, and interface iteration) when the WANT_MONITOR flag is set. Cc: stable@vger.kernel.org Fixes: 0d9c2beed116 ("wifi: mac80211: fix monitor channel with chanctx emulation") Reported-by: ZeroBeat Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219086 Tested-by: Lorenzo Bianconi Link: https://patch.msgid.link/20240725184836.25d334157a8e.I02574086da2c5cf0e18264ce5807db6f14ffd9c0@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 7 +++++-- net/mac80211/tx.c | 5 +++-- net/mac80211/util.c | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 85cb71de370f..b02b84ce2130 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -114,7 +114,7 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, /* apply all changes now - no failures allowed */ - if (monitor_sdata) + if (monitor_sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) ieee80211_set_mu_mimo_follow(monitor_sdata, params); if (params->flags) { @@ -3053,6 +3053,9 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return -EOPNOTSUPP; + sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (!sdata) @@ -3115,7 +3118,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, if (has_monitor) { sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); - if (sdata) { + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { sdata->deflink.user_power_level = local->user_power_level; if (txp_type != sdata->vif.bss_conf.txpower_type) update_txp_type = true; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 72a9ba8bc5fd..edba4a31844f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1768,7 +1768,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, break; } sdata = rcu_dereference(local->monitor_sdata); - if (sdata) { + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { vif = &sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; @@ -3957,7 +3957,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, break; } tx.sdata = rcu_dereference(local->monitor_sdata); - if (tx.sdata) { + if (tx.sdata && + ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { vif = &tx.sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ced19ce7c51a..c7ad9bc5973a 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -776,7 +776,7 @@ static void __iterate_interfaces(struct ieee80211_local *local, sdata = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)); - if (sdata && + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) iterator(data, sdata->vif.addr, &sdata->vif); From baeaabf970b9a90999f62ae27edf63f6cb86c023 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: 
Wed, 24 Jul 2024 18:23:27 +0530 Subject: [PATCH 0075/1052] wifi: cfg80211: fix reporting failed MLO links status with cfg80211_connect_done Individual MLO links connection status is not copied to EVENT_CONNECT_RESULT data while processing the connect response information in cfg80211_connect_done(). Due to this failed links are wrongly indicated with success status in EVENT_CONNECT_RESULT. To fix this, copy the individual MLO links status to the EVENT_CONNECT_RESULT data. Fixes: 53ad07e9823b ("wifi: cfg80211: support reporting failed links") Signed-off-by: Veerendranath Jakkam Reviewed-by: Carlos Llamas Link: https://patch.msgid.link/20240724125327.3495874-1-quic_vjakkam@quicinc.com [commit message editorial changes] Signed-off-by: Johannes Berg --- net/wireless/sme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index e419aa8c4a5a..d9d7bf8bb5c1 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1045,6 +1045,7 @@ void cfg80211_connect_done(struct net_device *dev, cfg80211_hold_bss( bss_from_pub(params->links[link].bss)); ev->cr.links[link].bss = params->links[link].bss; + ev->cr.links[link].status = params->links[link].status; if (params->links[link].addr) { ev->cr.links[link].addr = next; From 6873cc4416078202882691b424fcca5b5fb1a94d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 24 Jul 2024 13:29:12 +0200 Subject: [PATCH 0076/1052] wifi: cfg80211: correct S1G beacon length calculation The minimum header length calculation (equivalent to the start of the elements) for the S1G long beacon erroneously required only up to the start of u.s1g_beacon rather than the start of u.s1g_beacon.variable. Fix that, and also shuffle the branches around a bit to not assign useless values that are overwritten later. 
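Schematically (field layout abridged, not the exact struct definitions), the element data of an S1G beacon starts at u.s1g_beacon.variable, so a minimum header length taken at offsetof(struct ieee80211_ext, u.s1g_beacon) stops before the fixed S1G beacon fields and lets too-short frames pass the length check:

  struct ieee80211_ext {
          __le16 frame_control;
          __le16 duration;
          union {
                  struct {
                          /* fixed S1G beacon fields ... */
                          u8 variable[];  /* elements start here */
                  } s1g_beacon;
                  /* ... */
          } u;
  };

  min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon.variable);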
Reported-by: syzbot+0f3afa93b91202f21939@syzkaller.appspotmail.com Fixes: 9eaffe5078ca ("cfg80211: convert S1G beacon to scan results") Link: https://patch.msgid.link/20240724132912.9662972db7c1.I8779675b5bbda4994cc66f876b6b87a2361c3c0b@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d99319d82205..64eeed82d43d 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3178,8 +3178,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, struct ieee80211_mgmt *mgmt, size_t len, gfp_t gfp) { - size_t min_hdr_len = offsetof(struct ieee80211_mgmt, - u.probe_resp.variable); + size_t min_hdr_len; struct ieee80211_ext *ext = NULL; enum cfg80211_bss_frame_type ftype; u16 beacon_interval; @@ -3202,10 +3201,16 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *) mgmt; - min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon); if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); + else + min_hdr_len = offsetof(struct ieee80211_ext, + u.s1g_beacon.variable); + } else { + /* same for beacons */ + min_hdr_len = offsetof(struct ieee80211_mgmt, + u.probe_resp.variable); } if (WARN_ON(len < min_hdr_len)) From 189d7aae8f5a100b0db8b302debbd445475d01e6 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Mon, 22 Jul 2024 11:33:32 +0800 Subject: [PATCH 0077/1052] wifi: ath12k: fix reusing outside iterator in ath12k_wow_vif_set_wakeups() Smatch throws below warning: drivers/net/wireless/ath/ath12k/wow.c:434 ath12k_wow_vif_set_wakeups() warn: reusing outside iterator: 'i' drivers/net/wireless/ath/ath12k/wow.c 411 default: 412 break; 413 } 414 415 for (i = 0; i < wowlan->n_patterns; i++) { ^^^^^^^^^^^^^^^^^^^^^^ Here we loop until ->n_patterns 416 const struct cfg80211_pkt_pattern *eth_pattern = &patterns[i]; 417 struct ath12k_pkt_pattern new_pattern = {}; 418 419 if (WARN_ON(eth_pattern->pattern_len > WOW_MAX_PATTERN_SIZE)) 420 return -EINVAL; 421 422 if (ar->ab->wow.wmi_conf_rx_decap_mode == 423 ATH12K_HW_TXRX_NATIVE_WIFI) { 424 ath12k_wow_convert_8023_to_80211(ar, eth_pattern, 425 &new_pattern); 426 427 if (WARN_ON(new_pattern.pattern_len > WOW_MAX_PATTERN_SIZE)) 428 return -EINVAL; 429 } else { 430 memcpy(new_pattern.pattern, eth_pattern->pattern, 431 eth_pattern->pattern_len); 432 433 /* convert bitmask to bytemask */ --> 434 for (i = 0; i < eth_pattern->pattern_len; i++) 435 if (eth_pattern->mask[i / 8] & BIT(i % 8)) 436 new_pattern.bytemask[i] = 0xff; This loop re-uses i and the loop ends with i == eth_pattern->pattern_len. This looks like a bug. Change to use a new iterator 'j' for the inner loop to fix it. 
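Reduced to a minimal stand-alone sketch (not the driver code), the pattern the warning points at is:

  for (i = 0; i < n_patterns; i++) {
          /* ... */
          for (i = 0; i < pattern_len; i++)  /* BUG: clobbers the outer 'i' */
                  bytemask[i] = 0xff;
  }
  /* fix: give the inner loop its own iterator 'j' */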
Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0-03427-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.15378.4 Fixes: 4a3c212eee0e ("wifi: ath12k: add basic WoW functionalities") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/d4975b95-9c43-45af-a0ab-80253f18c7f2@stanley.mountain/ Signed-off-by: Baochen Qiang Acked-by: Jeff Johnson Link: https://patch.msgid.link/20240722033332.6273-1-quic_bqiang@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath12k/wow.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/wow.c b/drivers/net/wireless/ath/ath12k/wow.c index bead19db2c9a..9b8684abbe40 100644 --- a/drivers/net/wireless/ath/ath12k/wow.c +++ b/drivers/net/wireless/ath/ath12k/wow.c @@ -361,7 +361,7 @@ static int ath12k_wow_vif_set_wakeups(struct ath12k_vif *arvif, struct ath12k *ar = arvif->ar; unsigned long wow_mask = 0; int pattern_id = 0; - int ret, i; + int ret, i, j; /* Setup requested WOW features */ switch (arvif->vdev_type) { @@ -431,9 +431,9 @@ static int ath12k_wow_vif_set_wakeups(struct ath12k_vif *arvif, eth_pattern->pattern_len); /* convert bitmask to bytemask */ - for (i = 0; i < eth_pattern->pattern_len; i++) - if (eth_pattern->mask[i / 8] & BIT(i % 8)) - new_pattern.bytemask[i] = 0xff; + for (j = 0; j < eth_pattern->pattern_len; j++) + if (eth_pattern->mask[j / 8] & BIT(j % 8)) + new_pattern.bytemask[j] = 0xff; new_pattern.pattern_len = eth_pattern->pattern_len; new_pattern.pkt_offset = eth_pattern->pkt_offset; From 6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Thu, 18 Jul 2024 16:46:33 -0700 Subject: [PATCH 0078/1052] wifi: mt76: mt7921: fix null pointer access in mt792x_mac_link_bss_remove Fix null pointer access in mt792x_mac_link_bss_remove. To prevent null pointer access, we should assign the vif to bss_conf in mt7921_add_interface. This ensures that subsequent operations on the BSS can properly reference the correct vif. [ T843] Call Trace: [ T843] [ T843] ? __die+0x1e/0x60 [ T843] ? page_fault_oops+0x157/0x450 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? search_bpf_extables+0x5a/0x80 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? exc_page_fault+0x2bb/0x670 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? lock_timer_base+0x71/0x90 [ T843] ? asm_exc_page_fault+0x26/0x30 [ T843] ? mt792x_mac_link_bss_remove+0x24/0x110 [mt792x_lib] [ T843] ? mt792x_remove_interface+0x6e/0x90 [mt792x_lib] [ T843] ? ieee80211_do_stop+0x507/0x7e0 [mac80211] [ T843] ? ieee80211_stop+0x53/0x190 [mac80211] [ T843] ? __dev_close_many+0xa5/0x120 [ T843] ? __dev_change_flags+0x18c/0x220 [ T843] ? dev_change_flags+0x21/0x60 [ T843] ? do_setlink+0xdf9/0x11d0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? security_sock_rcv_skb+0x33/0x50 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? __nla_validate_parse+0x61/0xd10 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? genl_done+0x53/0x80 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? netlink_dump+0x357/0x410 [ T843] ? __rtnl_newlink+0x5d6/0x980 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? genl_family_rcv_msg_dumpit+0xdf/0xf0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? __kmalloc_cache_noprof+0x44/0x210 [ T843] ? rtnl_newlink+0x42/0x60 [ T843] ? rtnetlink_rcv_msg+0x152/0x3f0 [ T843] ? mptcp_pm_nl_dump_addr+0x180/0x180 [ T843] ? 
rtnl_calcit.isra.0+0x130/0x130 [ T843] ? netlink_rcv_skb+0x56/0x100 [ T843] ? netlink_unicast+0x199/0x290 [ T843] ? netlink_sendmsg+0x21d/0x490 [ T843] ? __sock_sendmsg+0x78/0x80 [ T843] ? ____sys_sendmsg+0x23f/0x2e0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? copy_msghdr_from_user+0x68/0xa0 [ T843] ? ___sys_sendmsg+0x81/0xd0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? crng_fast_key_erasure+0xbc/0xf0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? get_random_bytes_user+0x126/0x140 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? __fdget+0xb1/0xe0 [ T843] ? __sys_sendmsg+0x56/0xa0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? do_syscall_64+0x5f/0x170 [ T843] ? entry_SYSCALL_64_after_hwframe+0x55/0x5d [ T843] Fixes: 1541d63c5fe2 ("wifi: mt76: mt7925: add mt7925_mac_link_bss_remove to remove per-link BSS") Reported-by: Bert Karwatzki Closes: https://lore.kernel.org/linux-wireless/2fee61f8c903d02a900ca3188c3742c7effd102e.camel@web.de/#b Signed-off-by: Sean Wang Tested-by: Bert Karwatzki Link: https://patch.msgid.link/20240718234633.12737-1-sean.wang@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/mediatek/mt76/mt7921/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 2e6268cb06c0..1bab93d049df 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; mvif->phy = phy; + mvif->bss_conf.vif = mvif; mvif->bss_conf.mt76.band_idx = 0; mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % MT76_CONNAC_MAX_WMM_SETS; From a47f3320bb4ba6714abe8dddb36399367b491358 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 9 Jul 2024 09:31:32 +0200 Subject: [PATCH 0079/1052] wifi: ath12k: fix soft lockup on suspend The ext interrupts are enabled when the firmware has been started, but this may never happen, for example, if the board configuration file is missing. When the system is later suspended, the driver unconditionally tries to disable interrupts, which results in an irq disable imbalance and causes the driver to spin indefinitely in napi_synchronize(). Make sure that the interrupts have been enabled before attempting to disable them. 
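The shape of the fix (sketch; the enable side is paraphrased from the existing driver, only the disable side is changed here):

  /* enable path - only reached once the firmware is up */
  set_bit(ATH12K_FLAG_EXT_IRQ_ENABLED, &ab->dev_flags);

  /* disable path (e.g. suspend) - now a no-op if interrupts were never enabled */
  if (!test_and_clear_bit(ATH12K_FLAG_EXT_IRQ_ENABLED, &ab->dev_flags))
          return;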
Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices") Cc: stable@vger.kernel.org # 6.3 Signed-off-by: Johan Hovold Acked-by: Jeff Johnson Link: https://patch.msgid.link/20240709073132.9168-1-johan+linaro@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath12k/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/pci.c b/drivers/net/wireless/ath/ath12k/pci.c index 876c029f58f6..9e0b9e329bda 100644 --- a/drivers/net/wireless/ath/ath12k/pci.c +++ b/drivers/net/wireless/ath/ath12k/pci.c @@ -473,7 +473,8 @@ static void __ath12k_pci_ext_irq_disable(struct ath12k_base *ab) { int i; - clear_bit(ATH12K_FLAG_EXT_IRQ_ENABLED, &ab->dev_flags); + if (!test_and_clear_bit(ATH12K_FLAG_EXT_IRQ_ENABLED, &ab->dev_flags)) + return; for (i = 0; i < ATH12K_EXT_IRQ_GRP_NUM_MAX; i++) { struct ath12k_ext_irq_grp *irq_grp = &ab->ext_irq_grp[i]; From 225990c487c1023e7b3aa89beb6a68011fbc0461 Mon Sep 17 00:00:00 2001 From: Mark Mentovai Date: Thu, 25 Jul 2024 16:41:44 -0400 Subject: [PATCH 0080/1052] net: phy: realtek: add support for RTL8366S Gigabit PHY The PHY built in to the Realtek RTL8366S switch controller was previously supported by genphy_driver. This PHY does not implement MMD operations. Since commit 9b01c885be36 ("net: phy: c22: migrate to genphy_c45_write_eee_adv()"), MMD register reads have been made during phy_probe to determine EEE support. For genphy_driver, these reads are transformed into 802.3 annex 22D clause 45-over-clause 22 mmd_phy_indirect operations that perform MII register writes to MII_MMD_CTRL and MII_MMD_DATA. This overwrites those two MII registers, which on this PHY are reserved and have another function, rendering the PHY unusable while so configured. Proper support for this PHY is restored by providing a phy_driver that declares MMD operations as unsupported by using the helper functions provided for that purpose, while remaining otherwise identical to genphy_driver. Fixes: 9b01c885be36 ("net: phy: c22: migrate to genphy_c45_write_eee_adv()") Reported-by: Russell Senior Closes: https://github.com/openwrt/openwrt/issues/15981 Link: https://github.com/openwrt/openwrt/issues/15739 Signed-off-by: Mark Mentovai Reviewed-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/phy/realtek.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index bed839237fb5..87865918dab6 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -1465,6 +1465,13 @@ static struct phy_driver realtek_drvs[] = { .handle_interrupt = genphy_handle_interrupt_no_ack, .suspend = genphy_suspend, .resume = genphy_resume, + }, { + PHY_ID_MATCH_EXACT(0x001cc960), + .name = "RTL8366S Gigabit Ethernet", + .suspend = genphy_suspend, + .resume = genphy_resume, + .read_mmd = genphy_read_mmd_unsupported, + .write_mmd = genphy_write_mmd_unsupported, }, }; From e60dc98122110594d0290845160f12916192fc6d Mon Sep 17 00:00:00 2001 From: songxiebing Date: Fri, 26 Jul 2024 18:07:26 +0800 Subject: [PATCH 0081/1052] ALSA: hda: conexant: Fix headset auto detect fail in the polling mode The previous fix (7aeb25908648) only handles the unsol_event reporting during interrupts and does not include the polling mode used to set jackroll_ms, so now we are replacing it with snd_hda_jack_detect_enable_callback. 
Fixes: 7aeb25908648 ("ALSA: hda/conexant: Fix headset auto detect fail in cx8070 and SN6140") Co-developed-by: bo liu Signed-off-by: bo liu Signed-off-by: songxiebing Link: https://patch.msgid.link/20240726100726.50824-1-soxiebing@163.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 54 ++++++---------------------------- 1 file changed, 9 insertions(+), 45 deletions(-) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 17389a3801bd..4472923ba694 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -21,12 +21,6 @@ #include "hda_jack.h" #include "hda_generic.h" -enum { - CX_HEADSET_NOPRESENT = 0, - CX_HEADSET_PARTPRESENT, - CX_HEADSET_ALLPRESENT, -}; - struct conexant_spec { struct hda_gen_spec gen; @@ -48,7 +42,6 @@ struct conexant_spec { unsigned int gpio_led; unsigned int gpio_mute_led_mask; unsigned int gpio_mic_led_mask; - unsigned int headset_present_flag; bool is_cx8070_sn6140; }; @@ -250,48 +243,19 @@ static void cx_process_headset_plugin(struct hda_codec *codec) } } -static void cx_update_headset_mic_vref(struct hda_codec *codec, unsigned int res) +static void cx_update_headset_mic_vref(struct hda_codec *codec, struct hda_jack_callback *event) { - unsigned int phone_present, mic_persent, phone_tag, mic_tag; - struct conexant_spec *spec = codec->spec; + unsigned int mic_present; /* In cx8070 and sn6140, the node 16 can only be config to headphone or disabled, * the node 19 can only be config to microphone or disabled. * Check hp&mic tag to process headset pulgin&plugout. */ - phone_tag = snd_hda_codec_read(codec, 0x16, 0, AC_VERB_GET_UNSOLICITED_RESPONSE, 0x0); - mic_tag = snd_hda_codec_read(codec, 0x19, 0, AC_VERB_GET_UNSOLICITED_RESPONSE, 0x0); - if ((phone_tag & (res >> AC_UNSOL_RES_TAG_SHIFT)) || - (mic_tag & (res >> AC_UNSOL_RES_TAG_SHIFT))) { - phone_present = snd_hda_codec_read(codec, 0x16, 0, AC_VERB_GET_PIN_SENSE, 0x0); - if (!(phone_present & AC_PINSENSE_PRESENCE)) {/* headphone plugout */ - spec->headset_present_flag = CX_HEADSET_NOPRESENT; - snd_hda_codec_write(codec, 0x19, 0, AC_VERB_SET_PIN_WIDGET_CONTROL, 0x20); - return; - } - if (spec->headset_present_flag == CX_HEADSET_NOPRESENT) { - spec->headset_present_flag = CX_HEADSET_PARTPRESENT; - } else if (spec->headset_present_flag == CX_HEADSET_PARTPRESENT) { - mic_persent = snd_hda_codec_read(codec, 0x19, 0, - AC_VERB_GET_PIN_SENSE, 0x0); - /* headset is present */ - if ((phone_present & AC_PINSENSE_PRESENCE) && - (mic_persent & AC_PINSENSE_PRESENCE)) { - cx_process_headset_plugin(codec); - spec->headset_present_flag = CX_HEADSET_ALLPRESENT; - } - } - } -} - -static void cx_jack_unsol_event(struct hda_codec *codec, unsigned int res) -{ - struct conexant_spec *spec = codec->spec; - - if (spec->is_cx8070_sn6140) - cx_update_headset_mic_vref(codec, res); - - snd_hda_jack_unsol_event(codec, res); + mic_present = snd_hda_codec_read(codec, 0x19, 0, AC_VERB_GET_PIN_SENSE, 0x0); + if (!(mic_present & AC_PINSENSE_PRESENCE)) /* mic plugout */ + snd_hda_codec_write(codec, 0x19, 0, AC_VERB_SET_PIN_WIDGET_CONTROL, 0x20); + else + cx_process_headset_plugin(codec); } static int cx_auto_suspend(struct hda_codec *codec) @@ -305,7 +269,7 @@ static const struct hda_codec_ops cx_auto_patch_ops = { .build_pcms = snd_hda_gen_build_pcms, .init = cx_auto_init, .free = cx_auto_free, - .unsol_event = cx_jack_unsol_event, + .unsol_event = snd_hda_jack_unsol_event, .suspend = cx_auto_suspend, .check_power_status = snd_hda_gen_check_power_status, }; @@ -1163,7 +1127,7 
@@ static int patch_conexant_auto(struct hda_codec *codec) case 0x14f11f86: case 0x14f11f87: spec->is_cx8070_sn6140 = true; - spec->headset_present_flag = CX_HEADSET_NOPRESENT; + snd_hda_jack_detect_enable_callback(codec, 0x19, cx_update_headset_mic_vref); break; } From 6cd23b26b348fa52c88e1adf9c0e48d68e13f95e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 26 Jul 2024 16:26:19 +0200 Subject: [PATCH 0082/1052] ALSA: hda/generic: Add a helper to mute speakers at suspend/shutdown Some devices indicate click noises at suspend or shutdown when the speakers are unmuted. This patch adds a helper, snd_hda_gen_shutup_speakers(), to work around it. The new function is supposed to be called at suspend or shutdown by the codec driver, and it mutes the speakers. The mute status isn't cached, hence the original mute state will be restored at resume again. Link: https://patch.msgid.link/20240726142625.2460-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_generic.c | 63 +++++++++++++++++++++++++++++++++++++ sound/pci/hda/hda_generic.h | 1 + 2 files changed, 64 insertions(+) diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index f64d9dc197a3..9cff87dfbecb 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -4955,6 +4955,69 @@ void snd_hda_gen_stream_pm(struct hda_codec *codec, hda_nid_t nid, bool on) } EXPORT_SYMBOL_GPL(snd_hda_gen_stream_pm); +/* forcibly mute the speaker output without caching; return true if updated */ +static bool force_mute_output_path(struct hda_codec *codec, hda_nid_t nid) +{ + if (!nid) + return false; + if (!nid_has_mute(codec, nid, HDA_OUTPUT)) + return false; /* no mute, skip */ + if (snd_hda_codec_amp_read(codec, nid, 0, HDA_OUTPUT, 0) & + snd_hda_codec_amp_read(codec, nid, 1, HDA_OUTPUT, 0) & + HDA_AMP_MUTE) + return false; /* both channels already muted, skip */ + + /* direct amp update without caching */ + snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_AMP_GAIN_MUTE, + AC_AMP_SET_OUTPUT | AC_AMP_SET_LEFT | + AC_AMP_SET_RIGHT | HDA_AMP_MUTE); + return true; +} + +/** + * snd_hda_gen_shutup_speakers - Forcibly mute the speaker outputs + * @codec: the HDA codec + * + * Forcibly mute the speaker outputs, to be called at suspend or shutdown. + * + * The mute state done by this function isn't cached, hence the original state + * will be restored at resume. + * + * Return true if the mute state has been changed. 
+ */ +bool snd_hda_gen_shutup_speakers(struct hda_codec *codec) +{ + struct hda_gen_spec *spec = codec->spec; + const int *paths; + const struct nid_path *path; + int i, p, num_paths; + bool updated = false; + + /* if already powered off, do nothing */ + if (!snd_hdac_is_power_on(&codec->core)) + return false; + + if (spec->autocfg.line_out_type == AUTO_PIN_SPEAKER_OUT) { + paths = spec->out_paths; + num_paths = spec->autocfg.line_outs; + } else { + paths = spec->speaker_paths; + num_paths = spec->autocfg.speaker_outs; + } + + for (i = 0; i < num_paths; i++) { + path = snd_hda_get_path_from_idx(codec, paths[i]); + if (!path) + continue; + for (p = 0; p < path->depth; p++) + if (force_mute_output_path(codec, path->path[p])) + updated = true; + } + + return updated; +} +EXPORT_SYMBOL_GPL(snd_hda_gen_shutup_speakers); + /** * snd_hda_gen_parse_auto_config - Parse the given BIOS configuration and * set up the hda_gen_spec diff --git a/sound/pci/hda/hda_generic.h b/sound/pci/hda/hda_generic.h index 8f5ecf740c49..08544601b4ce 100644 --- a/sound/pci/hda/hda_generic.h +++ b/sound/pci/hda/hda_generic.h @@ -353,5 +353,6 @@ int snd_hda_gen_add_mute_led_cdev(struct hda_codec *codec, int snd_hda_gen_add_micmute_led_cdev(struct hda_codec *codec, int (*callback)(struct led_classdev *, enum led_brightness)); +bool snd_hda_gen_shutup_speakers(struct hda_codec *codec); #endif /* __SOUND_HDA_GENERIC_H */ From 4f61c8fe35202702426cfc0003e15116a01ba885 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 26 Jul 2024 16:26:20 +0200 Subject: [PATCH 0083/1052] ALSA: hda/conexant: Mute speakers at suspend / shutdown Use the new helper to mute speakers at suspend / shutdown for avoiding click noises. Link: https://bugzilla.suse.com/show_bug.cgi?id=1228269 Link: https://patch.msgid.link/20240726142625.2460-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 4472923ba694..f030669243f9 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -205,6 +205,8 @@ static void cx_auto_shutdown(struct hda_codec *codec) { struct conexant_spec *spec = codec->spec; + snd_hda_gen_shutup_speakers(codec); + /* Turn the problematic codec into D3 to avoid spurious noises from the internal speaker during (and after) reboot */ cx_auto_turn_eapd(codec, spec->num_eapds, spec->eapds, false); From 952b13c215234855d75ef4b5bb0138075e73677c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 26 Jul 2024 16:34:54 +0200 Subject: [PATCH 0084/1052] ALSA: seq: ump: Optimize conversions from SysEx to UMP The current conversion from the legacy SysEx event to UMP SysEx packet in the sequencer core has a couple of issues: * The first packet trims the SysEx start byte (0xf0), hence it contains only 5 bytes instead of 6. This isn't wrong, per specification, but it's strange not to fill 6 bytes. * When the SysEx end marker (0xf7) is placed at the first byte of the next packet, it'll end up with an empty data just with the END status. It can be rather folded into the previous packet with the END status. This patch tries to address those issues. The first packet may have 6 bytes even with the SysEx start, and an empty packet with the SysEx end marker is omitted. 
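As a worked illustration of the two changes (derived from the description above, not from running the code), take a SysEx message made of 0xf0, eleven data bytes d1..d11 and the closing 0xf7:

  before: START d1..d5 | CONTINUE d6..d11 | END (empty packet)
  after:  START d1..d6 | END d7..d11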
Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Cc: Link: https://patch.msgid.link/20240726143455.3254-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 41 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index e90b27a135e6..d9dacfbe4a9a 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -1192,44 +1192,53 @@ static int cvt_sysex_to_ump(struct snd_seq_client *dest, { struct snd_seq_ump_event ev_cvt; unsigned char status; - u8 buf[6], *xbuf; + u8 buf[8], *xbuf; int offset = 0; int len, err; + bool finished = false; if (!snd_seq_ev_is_variable(event)) return 0; setup_ump_event(&ev_cvt, event); - for (;;) { + while (!finished) { len = snd_seq_expand_var_event_at(event, sizeof(buf), buf, offset); if (len <= 0) break; - if (WARN_ON(len > 6)) + if (WARN_ON(len > sizeof(buf))) break; - offset += len; + xbuf = buf; + status = UMP_SYSEX_STATUS_CONTINUE; + /* truncate the sysex start-marker */ if (*xbuf == UMP_MIDI1_MSG_SYSEX_START) { status = UMP_SYSEX_STATUS_START; - xbuf++; len--; - if (len > 0 && xbuf[len - 1] == UMP_MIDI1_MSG_SYSEX_END) { - status = UMP_SYSEX_STATUS_SINGLE; - len--; - } - } else { - if (xbuf[len - 1] == UMP_MIDI1_MSG_SYSEX_END) { - status = UMP_SYSEX_STATUS_END; - len--; - } else { - status = UMP_SYSEX_STATUS_CONTINUE; - } + offset++; + xbuf++; } + + /* if the last of this packet or the 1st byte of the next packet + * is the end-marker, finish the transfer with this packet + */ + if (len > 0 && len < 8 && + xbuf[len - 1] == UMP_MIDI1_MSG_SYSEX_END) { + if (status == UMP_SYSEX_STATUS_START) + status = UMP_SYSEX_STATUS_SINGLE; + else + status = UMP_SYSEX_STATUS_END; + len--; + finished = true; + } + + len = min(len, 6); fill_sysex7_ump(dest_port, ev_cvt.ump, status, xbuf, len); err = __snd_seq_deliver_single_event(dest, dest_port, (struct snd_seq_event *)&ev_cvt, atomic, hop); if (err < 0) return err; + offset += len; } return 0; } From e22a3a9d4134d7e6351a2998771522e74bcc58da Mon Sep 17 00:00:00 2001 From: Kiran K Date: Wed, 3 Jul 2024 14:22:42 +0530 Subject: [PATCH 0085/1052] Bluetooth: btintel: Fail setup on error Do not attempt to send any hci command to controller if *setup* function fails. Fixes: af395330abed ("Bluetooth: btintel: Add Intel devcoredump support") Signed-off-by: Kiran K Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index e7a612504ab1..2ebc970e6573 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -3085,6 +3085,9 @@ static int btintel_setup_combined(struct hci_dev *hdev) btintel_set_dsm_reset_method(hdev, &ver_tlv); err = btintel_bootloader_setup_tlv(hdev, &ver_tlv); + if (err) + goto exit_error; + btintel_register_devcoredump_support(hdev); btintel_print_fseq_info(hdev); break; From d09009bc80d9d0d812b988888c40cd86e52eaf1e Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 16 Jul 2024 15:49:47 +0800 Subject: [PATCH 0086/1052] Bluetooth: btmtk: Fix kernel crash when entering btmtk_usb_suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If MediaTek's Bluetooth setup is unsuccessful, a NULL pointer issue occur when the system is suspended and the anchored kill function is called. 
To avoid this, add protection to prevent executing the anchored kill function if the setup is unsuccessful. [ 6.922106] Hardware name: Acer Tomato (rev2) board (DT) [ 6.922114] Workqueue: pm pm_runtime_work [ 6.922132] pstate: 804000c9 (Nzcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 6.922147] pc : usb_kill_anchored_urbs+0x6c/0x1e0 [ 6.922164] lr : usb_kill_anchored_urbs+0x48/0x1e0 [ 6.922181] sp : ffff800080903b60 [ 6.922187] x29: ffff800080903b60 x28: ffff2c7b85c32b80 x27: ffff2c7bbb370930 [ 6.922211] x26: 00000000000f4240 x25: 00000000ffffffff x24: ffffd49ece2dcb48 [ 6.922255] x20: ffffffffffffffd8 x19: 0000000000000000 x18: 0000000000000006 [ 6.922276] x17: 6531656337386238 x16: 3632373862333863 x15: ffff800080903480 [ 6.922297] x14: 0000000000000000 x13: 303278302f303178 x12: ffffd49ecf090e30 [ 6.922318] x11: 0000000000000001 x10: 0000000000000001 x9 : ffffd49ecd2c5bb4 [ 6.922339] x8 : c0000000ffffdfff x7 : ffffd49ecefe0db8 x6 : 00000000000affa8 [ 6.922360] x5 : ffff2c7bbb35dd48 x4 : 0000000000000000 x3 : 0000000000000000 [ 6.922379] x2 : 0000000000000000 x1 : 0000000000000003 x0 : ffffffffffffffd8 [ 6.922400] Call trace: [ 6.922405] usb_kill_anchored_urbs+0x6c/0x1e0 [ 6.922422] btmtk_usb_suspend+0x20/0x38 [btmtk 5f200a97badbdfda4266773fee49acfc8e0224d5] [ 6.922444] btusb_suspend+0xd0/0x210 [btusb 0bfbf19a87ff406c83b87268b87ce1e80e9a829b] [ 6.922469] usb_suspend_both+0x90/0x288 [ 6.922487] usb_runtime_suspend+0x3c/0xa8 [ 6.922507] __rpm_callback+0x50/0x1f0 [ 6.922523] rpm_callback+0x70/0x88 [ 6.922538] rpm_suspend+0xe4/0x5a0 [ 6.922553] pm_runtime_work+0xd4/0xe0 [ 6.922569] process_one_work+0x18c/0x440 [ 6.922588] worker_thread+0x314/0x428 [ 6.922606] kthread+0x128/0x138 [ 6.922621] ret_from_fork+0x10/0x20 [ 6.922644] Code: f100a274 54000520 d503201f d100a260 (b8370000) [ 6.922654] ---[ end trace 0000000000000000 ]--- Fixes: ceac1cb0259d ("Bluetooth: btusb: mediatek: add ISO data transmission functions") Signed-off-by: Chris Lu Reported-by: Nícolas F. R. A. Prado #KernelCI Tested-by: Nícolas F. R. A. Prado Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index b7c348687a77..191bc6925120 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -1262,7 +1262,8 @@ int btmtk_usb_suspend(struct hci_dev *hdev) struct btmtk_data *btmtk_data = hci_get_priv(hdev); /* Stop urb anchor for iso data transmission */ - usb_kill_anchored_urbs(&btmtk_data->isopkt_anchor); + if (test_bit(BTMTK_ISOPKT_RUNNING, &btmtk_data->flags)) + usb_kill_anchored_urbs(&btmtk_data->isopkt_anchor); return 0; } From 96b82af36efaa1787946e021aa3dc5410c05beeb Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 15 Jul 2024 10:40:03 -0400 Subject: [PATCH 0087/1052] Bluetooth: hci_sync: Fix suspending with wrong filter policy When suspending the scan filter policy cannot be 0x00 (no acceptlist) since that means the host has to process every advertisement report waking up the system, so this attempts to check if hdev is marked as suspended and if the resulting filter policy would be 0x00 (no acceptlist) then skip passive scanning if thre no devices in the acceptlist otherwise reset the filter policy to 0x01 so the acceptlist is used since the devices programmed there can still wakeup be system. 
Fixes: 182ee45da083 ("Bluetooth: hci_sync: Rework hci_suspend_notifier") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index cd2ed16da8a4..a31d39a821f4 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2976,6 +2976,27 @@ static int hci_passive_scan_sync(struct hci_dev *hdev) */ filter_policy = hci_update_accept_list_sync(hdev); + /* If suspended and filter_policy set to 0x00 (no acceptlist) then + * passive scanning cannot be started since that would require the host + * to be woken up to process the reports. + */ + if (hdev->suspended && !filter_policy) { + /* Check if accept list is empty then there is no need to scan + * while suspended. + */ + if (list_empty(&hdev->le_accept_list)) + return 0; + + /* If there are devices is the accept_list that means some + * devices could not be programmed which in non-suspended case + * means filter_policy needs to be set to 0x00 so the host needs + * to filter, but since this is treating suspended case we + * can ignore device needing host to filter to allow devices in + * the acceptlist to be able to wakeup the system. + */ + filter_policy = 0x01; + } + /* When the controller is using random resolvable addresses and * with that having LE privacy enabled, then controllers with * Extended Scanner Filter Policies support can now enable support From f0c83a23fcbb424fdff5b38fbcdda3c04003a210 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Fri, 19 Jul 2024 11:30:19 +0800 Subject: [PATCH 0088/1052] Bluetooth: btmtk: Fix btmtk.c undefined reference build error MediaTek moved some usb interface related function to btmtk.c which may cause build failed if BT USB Kconfig wasn't enabled. Fix undefined reference by adding config check. 
btmtk.c:(.text+0x89c): undefined reference to `usb_alloc_urb'
btmtk.c:(.text+0x8e3): undefined reference to `usb_free_urb'
btmtk.c:(.text+0x956): undefined reference to `usb_free_urb'
btmtk.c:(.text+0xa0e): undefined reference to `usb_anchor_urb'
btmtk.c:(.text+0xb43): undefined reference to `usb_autopm_get_interface'
btmtk.c:(.text+0xb7e): undefined reference to `usb_autopm_put_interface'
btmtk.c:(.text+0xf70): undefined reference to `usb_disable_autosuspend'
btmtk.c:(.text+0x133a): undefined reference to `usb_control_msg'

Fixes: d019930b0049 ("Bluetooth: btmtk: move btusb_mtk_hci_wmt_sync to btmtk.c")
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202407091928.AH0aGZnx-lkp@intel.com/
Signed-off-by: Chris Lu
Signed-off-by: Luiz Augusto von Dentz
---
 drivers/bluetooth/btmtk.c | 2 ++
 drivers/bluetooth/btmtk.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c
index 191bc6925120..2b7c80043aa2 100644
--- a/drivers/bluetooth/btmtk.c
+++ b/drivers/bluetooth/btmtk.c
@@ -437,6 +437,7 @@ int btmtk_process_coredump(struct hci_dev *hdev, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(btmtk_process_coredump);
 
+#if IS_ENABLED(CONFIG_BT_HCIBTUSB_MTK)
 static void btmtk_usb_wmt_recv(struct urb *urb)
 {
 	struct hci_dev *hdev = urb->context;
@@ -1488,6 +1489,7 @@ int btmtk_usb_shutdown(struct hci_dev *hdev)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(btmtk_usb_shutdown);
+#endif
 
 MODULE_AUTHOR("Sean Wang ");
 MODULE_AUTHOR("Mark Chen ");
diff --git a/drivers/bluetooth/btmtk.h b/drivers/bluetooth/btmtk.h
index 5df7c3296624..6fc69cd8636b 100644
--- a/drivers/bluetooth/btmtk.h
+++ b/drivers/bluetooth/btmtk.h
@@ -202,6 +202,7 @@ int btmtk_process_coredump(struct hci_dev *hdev, struct sk_buff *skb);
 void btmtk_fw_get_filename(char *buf, size_t size, u32 dev_id, u32 fw_ver, u32 fw_flavor);
 
+#if IS_ENABLED(CONFIG_BT_HCIBTUSB_MTK)
 int btmtk_usb_subsys_reset(struct hci_dev *hdev, u32 dev_id);
 
 int btmtk_usb_recv_acl(struct hci_dev *hdev, struct sk_buff *skb);
@@ -216,6 +217,7 @@ int btmtk_usb_suspend(struct hci_dev *hdev);
 int btmtk_usb_setup(struct hci_dev *hdev);
 
 int btmtk_usb_shutdown(struct hci_dev *hdev);
+#endif
 
 #else
 
 static inline int btmtk_set_bdaddr(struct hci_dev *hdev,

From 61f7a8f975456d7be21100ee0936389142b95a81 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Mon, 22 Jul 2024 11:27:05 +0200
Subject: [PATCH 0089/1052] Bluetooth: btmtk: Fix btmtk.c undefined reference build error harder

The previous fix was incomplete as the link failure still persists with
CONFIG_USB=m when the sdio or serial wrappers for btmtk.c are built-in:

btmtk.c:(.text+0x468): undefined reference to `usb_alloc_urb'
btmtk.c:(.text+0x488): undefined reference to `usb_free_urb'
btmtk.c:(.text+0x500): undefined reference to `usb_anchor_urb'
btmtk.c:(.text+0x50a): undefined reference to `usb_submit_urb'
btmtk.c:(.text+0x92c): undefined reference to `usb_control_msg'
btmtk.c:(.text+0xa92): undefined reference to `usb_unanchor_urb'
btmtk.c:(.text+0x11e4): undefined reference to `usb_set_interface'
btmtk.c:(.text+0x120a): undefined reference to `usb_kill_anchored_urbs'

Disallow this configuration.
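
To see why the added dependency blocks the broken combination, here is a
rough stand-alone C model of Kconfig's tristate arithmetic; the function
names are invented for this sketch, boolean symbols are treated as the
{n, y} subset, and "depends on" is modelled as capping the symbol's value:

  #include <stdio.h>

  /* Kconfig tristate values: n < m < y. */
  enum tristate { N = 0, M = 1, Y = 2 };

  static enum tristate ts_not(enum tristate a) { return (enum tristate)(Y - a); }
  static enum tristate ts_or(enum tristate a, enum tristate b) { return a > b ? a : b; }
  static enum tristate ts_min(enum tristate a, enum tristate b) { return a < b ? a : b; }

  int main(void)
  {
      enum tristate usb = M;            /* CONFIG_USB=m */
      enum tristate hcibtusb_mtk = Y;   /* CONFIG_BT_HCIBTUSB_MTK=y */
      enum tristate wanted_mtksdio = Y; /* user asks for a built-in wrapper */

      /* "depends on USB || !BT_HCIBTUSB_MTK" caps the reachable value. */
      enum tristate dep = ts_or(usb, ts_not(hcibtusb_mtk));
      enum tristate effective = ts_min(wanted_mtksdio, dep);

      /* Prints 1 (=m): the wrapper can no longer be built-in against USB=m. */
      printf("BT_MTKSDIO effective value: %d\n", effective);
      return 0;
  }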
Fixes: f0c83a23fcbb ("Bluetooth: btmtk: Fix btmtk.c undefined reference build error") Signed-off-by: Arnd Bergmann Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index 90a94a111e67..769fa288179d 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -413,6 +413,7 @@ config BT_ATH3K config BT_MTKSDIO tristate "MediaTek HCI SDIO driver" depends on MMC + depends on USB || !BT_HCIBTUSB_MTK select BT_MTK help MediaTek Bluetooth HCI SDIO driver. @@ -425,6 +426,7 @@ config BT_MTKSDIO config BT_MTKUART tristate "MediaTek HCI UART driver" depends on SERIAL_DEV_BUS + depends on USB || !BT_HCIBTUSB_MTK select BT_MTK help MediaTek Bluetooth HCI UART driver. From 7a8c6fb21a7c913ddb99785b14914dab2f934fbd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Jul 2024 11:27:06 +0200 Subject: [PATCH 0090/1052] Bluetooth: btmtk: remove #ifdef around declarations The caller of these functions in btusb.c is guarded with an if(IS_ENABLED()) style check, so dead code is left out, but the declarations are still needed at compile time: drivers/bluetooth/btusb.c: In function 'btusb_mtk_reset': drivers/bluetooth/btusb.c:2705:15: error: implicit declaration of function 'btmtk_usb_subsys_reset' [-Wimplicit-function-declaration] 2705 | err = btmtk_usb_subsys_reset(hdev, btmtk_data->dev_id); | ^~~~~~~~~~~~~~~~~~~~~~ drivers/bluetooth/btusb.c: In function 'btusb_send_frame_mtk': drivers/bluetooth/btusb.c:2720:23: error: implicit declaration of function 'alloc_mtk_intr_urb' [-Wimplicit-function-declaration] 2720 | urb = alloc_mtk_intr_urb(hdev, skb, btusb_tx_complete); | ^~~~~~~~~~~~~~~~~~ drivers/bluetooth/btusb.c:2720:21: error: assignment to 'struct urb *' from 'int' makes pointer from integer without a cast [-Wint-conversion] 2720 | urb = alloc_mtk_intr_urb(hdev, skb, btusb_tx_complete); | ^ Fixes: f0c83a23fcbb ("Bluetooth: btmtk: Fix btmtk.c undefined reference build error") Signed-off-by: Arnd Bergmann Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/bluetooth/btmtk.h b/drivers/bluetooth/btmtk.h index 6fc69cd8636b..5df7c3296624 100644 --- a/drivers/bluetooth/btmtk.h +++ b/drivers/bluetooth/btmtk.h @@ -202,7 +202,6 @@ int btmtk_process_coredump(struct hci_dev *hdev, struct sk_buff *skb); void btmtk_fw_get_filename(char *buf, size_t size, u32 dev_id, u32 fw_ver, u32 fw_flavor); -#if IS_ENABLED(CONFIG_BT_HCIBTUSB_MTK) int btmtk_usb_subsys_reset(struct hci_dev *hdev, u32 dev_id); int btmtk_usb_recv_acl(struct hci_dev *hdev, struct sk_buff *skb); @@ -217,7 +216,6 @@ int btmtk_usb_suspend(struct hci_dev *hdev); int btmtk_usb_setup(struct hci_dev *hdev); int btmtk_usb_shutdown(struct hci_dev *hdev); -#endif #else static inline int btmtk_set_bdaddr(struct hci_dev *hdev, From df3d6a3e01fd82cb74b6bb309f7be71e728a3448 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 25 Jul 2024 18:28:08 -0400 Subject: [PATCH 0091/1052] Bluetooth: hci_event: Fix setting DISCOVERY_FINDING for passive scanning DISCOVERY_FINDING shall only be set for active scanning as passive scanning is not meant to generate MGMT Device Found events causing discovering state to go out of sync since userspace would believe it is discovering when in fact it is just passive scanning. 
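
A minimal stand-alone sketch of the corrected behaviour, illustrative
only; the names are invented and simplified from the hci_event code:

  #include <stdio.h>

  enum { LE_SCAN_PASSIVE, LE_SCAN_ACTIVE };
  enum { DISCOVERY_STOPPED, DISCOVERY_FINDING };

  /* Only an active scan, which reports results to MGMT, moves the
   * discovery state to FINDING; passive scanning leaves it untouched. */
  static int discovery_state_on_scan_enable(int le_scan_type, int current_state)
  {
      if (le_scan_type == LE_SCAN_ACTIVE)
          return DISCOVERY_FINDING;

      return current_state;
  }

  int main(void)
  {
      /* Passive scan while stopped: prints 0, state stays STOPPED. */
      printf("%d\n", discovery_state_on_scan_enable(LE_SCAN_PASSIVE, DISCOVERY_STOPPED));
      return 0;
  }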
Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=219088 Fixes: 2e2515c1ba38 ("Bluetooth: hci_event: Set DISCOVERY_FINDING on SCAN_ENABLED") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 7 ------- net/bluetooth/hci_event.c | 5 +++-- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 8a4ebd93adfc..06da8ac13dca 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -119,13 +119,6 @@ void hci_discovery_set_state(struct hci_dev *hdev, int state) case DISCOVERY_STARTING: break; case DISCOVERY_FINDING: - /* If discovery was not started then it was initiated by the - * MGMT interface so no MGMT event shall be generated either - */ - if (old_state != DISCOVERY_STARTING) { - hdev->discovery.state = old_state; - return; - } mgmt_discovering(hdev, 1); break; case DISCOVERY_RESOLVING: diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index dce8035ca799..d0c118c47f6c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1721,9 +1721,10 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable) switch (enable) { case LE_SCAN_ENABLE: hci_dev_set_flag(hdev, HCI_LE_SCAN); - if (hdev->le_scan_type == LE_SCAN_ACTIVE) + if (hdev->le_scan_type == LE_SCAN_ACTIVE) { clear_pending_adv_report(hdev); - hci_discovery_set_state(hdev, DISCOVERY_FINDING); + hci_discovery_set_state(hdev, DISCOVERY_FINDING); + } break; case LE_SCAN_DISABLE: From 0005ca2076ad222de34de519c47b0eb80f458877 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 24 Jul 2024 12:05:09 -0700 Subject: [PATCH 0092/1052] KVM: x86: Eliminate log spam from limited APIC timer periods SAP's vSMP MemoryONE continuously requests a local APIC timer period less than 500 us, resulting in the following kernel log spam: kvm: vcpu 15: requested 70240 ns lapic timer period limited to 500000 ns kvm: vcpu 19: requested 52848 ns lapic timer period limited to 500000 ns kvm: vcpu 15: requested 70256 ns lapic timer period limited to 500000 ns kvm: vcpu 9: requested 70256 ns lapic timer period limited to 500000 ns kvm: vcpu 9: requested 70208 ns lapic timer period limited to 500000 ns kvm: vcpu 9: requested 387520 ns lapic timer period limited to 500000 ns kvm: vcpu 9: requested 70160 ns lapic timer period limited to 500000 ns kvm: vcpu 66: requested 205744 ns lapic timer period limited to 500000 ns kvm: vcpu 9: requested 70224 ns lapic timer period limited to 500000 ns kvm: vcpu 9: requested 70256 ns lapic timer period limited to 500000 ns limit_periodic_timer_frequency: 7569 callbacks suppressed ... To eliminate this spam, change the pr_info_ratelimited() in limit_periodic_timer_frequency() to pr_info_once(). 
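
For reference, a small user-space sketch of the difference: a print-once
helper latches after the first message, which is what pr_info_once()
provides in the kernel; the helper below is invented for this example:

  #include <stdio.h>

  /* Rough user-space equivalent of pr_info_once(): the message is
   * emitted on the first call only, no matter how often the limit is
   * hit afterwards. */
  static void warn_limited_period_once(int vcpu_id, long long requested_ns,
                                       long long min_ns)
  {
      static int printed;

      if (!printed) {
          printed = 1;
          printf("vcpu %d: requested %lld ns lapic timer period limited to %lld ns\n",
                 vcpu_id, requested_ns, min_ns);
      }
  }

  int main(void)
  {
      for (int i = 0; i < 5; i++)
          warn_limited_period_once(15, 70240, 500000); /* prints exactly once */
      return 0;
  }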
Reported-by: James Houghton Signed-off-by: Jim Mattson Message-ID: <20240724190640.2449291-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a7172ba59ad2..4915acdbfcd8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1743,7 +1743,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) s64 min_period = min_timer_period_us * 1000LL; if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( + pr_info_once( "vcpu %i: requested %lld ns " "lapic timer period limited to %lld ns\n", apic->vcpu->vcpu_id, From c2adcf051be01d3da1e138c178a680514299461b Mon Sep 17 00:00:00 2001 From: Chang Yu Date: Tue, 23 Jul 2024 20:40:06 -0700 Subject: [PATCH 0093/1052] KVM: Documentation: Fix title underline too short warning Fix "WARNING: Title underline too short" by extending title line to the proper length. Signed-off-by: Chang Yu Message-ID: Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 8e5dad80b337..ec1cd8aa1d56 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6365,7 +6365,7 @@ a single guest_memfd file, but the bound ranges must not overlap). See KVM_SET_USER_MEMORY_REGION2 for additional details. 4.143 KVM_PRE_FAULT_MEMORY ------------------------- +--------------------------- :Capability: KVM_CAP_PRE_FAULT_MEMORY :Architectures: none From 4c17736689ccfc44ec7dcc472577f25c34cf8724 Mon Sep 17 00:00:00 2001 From: Casey Chen Date: Mon, 22 Jul 2024 15:15:48 -0600 Subject: [PATCH 0094/1052] perf tool: fix dereferencing NULL al->maps With 0dd5041c9a0e ("perf addr_location: Add init/exit/copy functions"), when cpumode is 3 (macro PERF_RECORD_MISC_HYPERVISOR), thread__find_map() could return with al->maps being NULL. The path below could add a callchain_cursor_node with NULL ms.maps. add_callchain_ip() thread__find_symbol(.., &al) thread__find_map(.., &al) // al->maps becomes NULL ms.maps = maps__get(al.maps) callchain_cursor_append(..., &ms, ...) node->ms.maps = maps__get(ms->maps) Then the path below would dereference NULL maps and get segfault. fill_callchain_info() maps__machine(node->ms.maps); Fix it by checking if maps is NULL in fill_callchain_info(). Fixes: 0dd5041c9a0e ("perf addr_location: Add init/exit/copy functions") Signed-off-by: Casey Chen Reviewed-by: Ian Rogers Reviewed-by: Arnaldo Carvalho de Melo Acked-by: Namhyung Kim Cc: yzhong@purestorage.com Link: https://lore.kernel.org/r/20240722211548.61455-1-cachen@purestorage.com Signed-off-by: Namhyung Kim --- tools/perf/util/callchain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 1730b852a947..6d075648d2cc 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1141,7 +1141,7 @@ int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *samp int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node, bool hide_unresolved) { - struct machine *machine = maps__machine(node->ms.maps); + struct machine *machine = node->ms.maps ? 
maps__machine(node->ms.maps) : NULL; maps__put(al->maps); al->maps = maps__get(node->ms.maps); From 440cf77625e300e683ca0edc39fbc4b6f3175feb Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:06 +0100 Subject: [PATCH 0095/1052] perf: build: Setup PKG_CONFIG_LIBDIR for cross compilation On recent Linux distros like Ubuntu Noble and Debian Bookworm, the 'pkg-config-aarch64-linux-gnu' package is missing. As a result, the aarch64-linux-gnu-pkg-config command is not available, which causes build failures. When a build passes the environment variables PKG_CONFIG_LIBDIR or PKG_CONFIG_PATH, like a user uses make command or a build system (like Yocto, Buildroot, etc) prepares the variables and passes to the Perf's Makefile, the commit keeps these variables for package configuration. Otherwise, this commit sets the PKG_CONFIG_LIBDIR variable to use the Multiarch libs for the cross compilation. Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-2-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/build/feature/Makefile | 25 ++++++++++++++++++++++++- tools/perf/Makefile.perf | 27 ++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index ed54cef450f5..dff65d03d30d 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -82,7 +82,30 @@ FILES= \ FILES := $(addprefix $(OUTPUT),$(FILES)) -PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config +# Some distros provide the command $(CROSS_COMPILE)pkg-config for +# searching packges installed with Multiarch. Use it for cross +# compilation if it is existed. +ifneq (, $(shell which $(CROSS_COMPILE)pkg-config)) + PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config +else + PKG_CONFIG ?= pkg-config + + # PKG_CONFIG_PATH or PKG_CONFIG_LIBDIR, alongside PKG_CONFIG_SYSROOT_DIR + # for modified system root, are required for the cross compilation. + # If these PKG_CONFIG environment variables are not set, Multiarch library + # paths are used instead. + ifdef CROSS_COMPILE + ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),) + CROSS_ARCH = $(shell $(CC) -dumpmachine) + PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/share/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/share/pkgconfig/ + export PKG_CONFIG_LIBDIR + endif + endif +endif all: $(FILES) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 175e4c7898f0..f8148db5fc38 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -193,7 +193,32 @@ HOSTLD ?= ld HOSTAR ?= ar CLANG ?= clang -PKG_CONFIG = $(CROSS_COMPILE)pkg-config +# Some distros provide the command $(CROSS_COMPILE)pkg-config for +# searching packges installed with Multiarch. Use it for cross +# compilation if it is existed. +ifneq (, $(shell which $(CROSS_COMPILE)pkg-config)) + PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config +else + PKG_CONFIG ?= pkg-config + + # PKG_CONFIG_PATH or PKG_CONFIG_LIBDIR, alongside PKG_CONFIG_SYSROOT_DIR + # for modified system root, is required for the cross compilation. + # If these PKG_CONFIG environment variables are not set, Multiarch library + # paths are used instead. 
+ ifdef CROSS_COMPILE + ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),) + CROSS_ARCH = $(shell $(CC) -dumpmachine) + PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/share/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/share/pkgconfig/ + export PKG_CONFIG_LIBDIR + $(warning Missing PKG_CONFIG_LIBDIR, PKG_CONFIG_PATH and PKG_CONFIG_SYSROOT_DIR for cross compilation,) + $(warning set PKG_CONFIG_LIBDIR for using Multiarch libs.) + endif + endif +endif RM = rm -f LN = ln -f From cffe29d3b54aa0437bc35440ea64866bbfc418a3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:07 +0100 Subject: [PATCH 0096/1052] perf: build: Set Python configuration for cross compilation Python configuration has dedicated folders for different architectures. For example, Python 3.11 has two folders as shown below, one for Arm64 and another for x86_64: /usr/lib/python3.11/config-3.11-aarch64-linux-gnu/ /usr/lib/python3.11/config-3.11-x86_64-linux-gnu/ This commit updates the Python configuration path based on the compiler's machine type, guiding the compiler to find the correct path for Python libraries. It also renames the generated .so file name to match the machine name. Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: James Clark Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-3-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/perf/Makefile.config | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index c896babf7a74..524a68032bdb 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -296,6 +296,11 @@ endif ifdef PYTHON_CONFIG PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) $(PYTHON_CONFIG_LDFLAGS) 2>/dev/null) + # Update the python flags for cross compilation + ifdef CROSS_COMPILE + PYTHON_NATIVE := $(shell echo $(PYTHON_EMBED_LDOPTS) | sed 's/\(-L.*\/\)\(.*-linux-gnu\).*/\2/') + PYTHON_EMBED_LDOPTS := $(subst $(PYTHON_NATIVE),$(shell $(CC) -dumpmachine),$(PYTHON_EMBED_LDOPTS)) + endif PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS)) PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --includes 2>/dev/null) @@ -897,6 +902,9 @@ else PYTHON_SETUPTOOLS_INSTALLED := $(shell $(PYTHON) -c 'import setuptools;' 2> /dev/null && echo "yes" || echo "no") ifeq ($(PYTHON_SETUPTOOLS_INSTALLED), yes) PYTHON_EXTENSION_SUFFIX := $(shell $(PYTHON) -c 'from importlib import machinery; print(machinery.EXTENSION_SUFFIXES[0])') + ifdef CROSS_COMPILE + PYTHON_EXTENSION_SUFFIX := $(subst $(PYTHON_NATIVE),$(shell $(CC) -dumpmachine),$(PYTHON_EXTENSION_SUFFIX)) + endif LANG_BINDINGS += $(obj-perf)python/perf$(PYTHON_EXTENSION_SUFFIX) else $(warning Missing python setuptools, the python binding won't be built, please install python3-setuptools or equivalent) From 536661da6ea18fe6df5740bc9e9001d097b035ee Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:08 +0100 Subject: [PATCH 0097/1052] perf: build: Only link libebl.a for old libdw Since libdw version 0.177, elfutils has merged libebl.a into libdw (see the commit "libebl: Don't install libebl.a, libebl.h and remove backends from spec." in the elfutils repository). 
As a result, libebl.a does not exist on Debian Bullseye and newer
releases, causing static perf builds to fail on these distributions.

This commit checks the libdw version and only links libebl.a if it
detects that the libdw version is older than 0.177.

Signed-off-by: Leo Yan
Tested-by: Ian Rogers
Cc: amadio@gentoo.org
Cc: James Clark
Cc: Thomas Richter
Link: https://lore.kernel.org/r/20240717082211.524826-4-leo.yan@arm.com
Signed-off-by: Namhyung Kim
---
 tools/build/feature/Makefile | 12 +++++++++++-
 tools/perf/Makefile.config   | 12 +++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index dff65d03d30d..08b2be257639 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -170,7 +170,17 @@ $(OUTPUT)test-libopencsd.bin:
 DWARFLIBS := -ldw
 ifeq ($(findstring -static,${LDFLAGS}),-static)
-DWARFLIBS += -lelf -lebl -lz -llzma -lbz2
+ DWARFLIBS += -lelf -lz -llzma -lbz2
+
+ LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw)
+ LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION)))
+ LIBDW_VERSION_2 := $(word 2, $(subst ., ,$(LIBDW_VERSION)))
+
+ # Elfutils merged libebl.a into libdw.a starting from version 0.177,
+ # Link libebl.a only if libdw is older than this version.
+ ifeq ($(shell test $(LIBDW_VERSION_2) -lt 177; echo $$?),0)
+  DWARFLIBS += -lebl
+ endif
 endif
 
 $(OUTPUT)test-dwarf.bin:
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 524a68032bdb..7cf0a39bc200 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -152,7 +152,17 @@ ifdef LIBDW_DIR
   endif
   DWARFLIBS := -ldw
   ifeq ($(findstring -static,${LDFLAGS}),-static)
-    DWARFLIBS += -lelf -lebl -ldl -lz -llzma -lbz2
+    DWARFLIBS += -lelf -ldl -lz -llzma -lbz2
+
+    LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw)
+    LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION)))
+    LIBDW_VERSION_2 := $(word 2, $(subst ., ,$(LIBDW_VERSION)))
+
+    # Elfutils merged libebl.a into libdw.a starting from version 0.177,
+    # Link libebl.a only if libdw is older than this version.
+    ifeq ($(shell test $(LIBDW_VERSION_2) -lt 177; echo $$?),0)
+      DWARFLIBS += -lebl
+    endif
   endif
   FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
   FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) $(DWARFLIBS)

From 91b6a536b40658c24d6f04747ddf852d30f7f259 Mon Sep 17 00:00:00 2001
From: Leo Yan
Date: Wed, 17 Jul 2024 09:22:09 +0100
Subject: [PATCH 0098/1052] perf: build: Link lib 'lzma' for static build

The libunwind feature test failed with static linkage. This is because
the 'lzma' lib was missing from the link, so add it to fix the build
failure.
Signed-off-by: Leo Yan
Tested-by: Ian Rogers
Cc: amadio@gentoo.org
Cc: James Clark
Cc: Thomas Richter
Link: https://lore.kernel.org/r/20240717082211.524826-5-leo.yan@arm.com
Signed-off-by: Namhyung Kim
---
 tools/build/feature/Makefile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 08b2be257639..2d5be5c17d65 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -211,27 +211,27 @@ $(OUTPUT)test-numa_num_possible_cpus.bin:
 	$(BUILD) -lnuma
 
 $(OUTPUT)test-libunwind.bin:
-	$(BUILD) -lelf
+	$(BUILD) -lelf -llzma
 
 $(OUTPUT)test-libunwind-debug-frame.bin:
-	$(BUILD) -lelf
+	$(BUILD) -lelf -llzma
 
 $(OUTPUT)test-libunwind-x86.bin:
-	$(BUILD) -lelf -lunwind-x86
+	$(BUILD) -lelf -llzma -lunwind-x86
 
 $(OUTPUT)test-libunwind-x86_64.bin:
-	$(BUILD) -lelf -lunwind-x86_64
+	$(BUILD) -lelf -llzma -lunwind-x86_64
 
 $(OUTPUT)test-libunwind-arm.bin:
-	$(BUILD) -lelf -lunwind-arm
+	$(BUILD) -lelf -llzma -lunwind-arm
 
 $(OUTPUT)test-libunwind-aarch64.bin:
-	$(BUILD) -lelf -lunwind-aarch64
+	$(BUILD) -lelf -llzma -lunwind-aarch64
 
 $(OUTPUT)test-libunwind-debug-frame-arm.bin:
-	$(BUILD) -lelf -lunwind-arm
+	$(BUILD) -lelf -llzma -lunwind-arm
 
 $(OUTPUT)test-libunwind-debug-frame-aarch64.bin:
-	$(BUILD) -lelf -lunwind-aarch64
+	$(BUILD) -lelf -llzma -lunwind-aarch64
 
 $(OUTPUT)test-libaudit.bin:
 	$(BUILD) -laudit

From f42596c73872b753ff1799bc7fc79b1100226da1 Mon Sep 17 00:00:00 2001
From: Leo Yan
Date: Wed, 17 Jul 2024 09:22:10 +0100
Subject: [PATCH 0099/1052] perf: build: Link lib 'zstd' for static build

When building a static perf, Makefile reports the error:

  Makefile.config:480: No libdw DWARF unwind found, Please install
  elfutils-devel/libdw-dev >= 0.158 and/or set LIBDW_DIR

libdw has been installed on the system, but the build system fails to
build the feature-detection binary 'test-libdw-dwarf-unwind'. The
failure is caused by not linking the 'zstd' lib.

Link the 'zstd' lib for the static build; with that, the dwarf feature
can be enabled in the static perf.
Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: James Clark Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-6-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/build/feature/Makefile | 2 +- tools/perf/Makefile.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 2d5be5c17d65..4af6a9582f15 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -170,7 +170,7 @@ $(OUTPUT)test-libopencsd.bin: DWARFLIBS := -ldw ifeq ($(findstring -static,${LDFLAGS}),-static) - DWARFLIBS += -lelf -lz -llzma -lbz2 + DWARFLIBS += -lelf -lz -llzma -lbz2 -lzstd LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw) LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION))) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 7cf0a39bc200..fa679db61f62 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -152,7 +152,7 @@ ifdef LIBDW_DIR endif DWARFLIBS := -ldw ifeq ($(findstring -static,${LDFLAGS}),-static) - DWARFLIBS += -lelf -ldl -lz -llzma -lbz2 + DWARFLIBS += -lelf -ldl -lz -llzma -lbz2 -lzstd LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw) LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION))) From d27087c76e3c859ea05b7581ef7ce8aa5a088dd8 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:11 +0100 Subject: [PATCH 0100/1052] perf docs: Document cross compilation Records the commands for cross compilation with two methods. The first method relies on Multiarch. The second approach is to explicitly specify the PKG_CONFIG variables, which is widely used in build system (like Buildroot, Yocto, etc). Co-developed-by: James Clark Signed-off-by: James Clark Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-7-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/perf/Documentation/Build.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt index 3766886c4bca..83dc87c662b6 100644 --- a/tools/perf/Documentation/Build.txt +++ b/tools/perf/Documentation/Build.txt @@ -71,3 +71,31 @@ supported by GCC. UBSan detects undefined behaviors of programs at runtime. $ UBSAN_OPTIONS=print_stacktrace=1 ./perf record -a If UBSan detects any problem at runtime, it outputs a “runtime error:” message. + +4) Cross compilation +==================== +As Multiarch is commonly supported in Linux distributions, we can install +libraries for multiple architectures on the same system and then cross-compile +Linux perf. For example, Aarch64 libraries and toolchains can be installed on +an x86_64 machine, allowing us to compile perf for an Aarch64 target. + +Below is the command for building the perf with dynamic linking. + + $ cd /path/to/Linux + $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf + +For static linking, the option `LDFLAGS="-static"` is required. 
+ + $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \ + LDFLAGS="-static" -C tools/perf + +In the embedded system world, a use case is to explicitly specify the package +configuration paths for cross building: + + $ PKG_CONFIG_SYSROOT_DIR="/path/to/cross/build/sysroot" \ + PKG_CONFIG_LIBDIR="/usr/lib/:/usr/local/lib" \ + make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf + +In this case, the variable PKG_CONFIG_SYSROOT_DIR can be used alongside the +variable PKG_CONFIG_LIBDIR or PKG_CONFIG_PATH to prepend the sysroot path to +the library paths for cross compilation. From 5932ca411e533e7ad2b97c47b4357a05fa06c2a5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 17 Jul 2024 13:04:48 -0400 Subject: [PATCH 0101/1052] KVM: x86: disallow pre-fault for SNP VMs before initialization KVM_PRE_FAULT_MEMORY for an SNP guest can race with sev_gmem_post_populate() in bad ways. The following sequence for instance can potentially trigger an RMP fault: thread A, sev_gmem_post_populate: called thread B, sev_gmem_prepare: places below 'pfn' in a private state in RMP thread A, sev_gmem_post_populate: *vaddr = kmap_local_pfn(pfn + i); thread A, sev_gmem_post_populate: copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE); RMP #PF Fix this by only allowing KVM_PRE_FAULT_MEMORY to run after a guest's initial private memory contents have been finalized via KVM_SEV_SNP_LAUNCH_FINISH. Beyond fixing this issue, it just sort of makes sense to enforce this, since the KVM_PRE_FAULT_MEMORY documentation states: "KVM maps memory as if the vCPU generated a stage-2 read page fault" which sort of implies we should be acting on the same guest state that a vCPU would see post-launch after the initial guest memory is all set up. Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 6 ++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 3 +++ arch/x86/kvm/svm/sev.c | 8 ++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/x86.c | 3 +++ 6 files changed, 22 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index ec1cd8aa1d56..7b512286f8d2 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6402,6 +6402,12 @@ for the current vCPU state. KVM maps memory as if the vCPU generated a stage-2 read page fault, e.g. faults in memory as needed, but doesn't break CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed. +In the case of confidential VM types where there is an initial set up of +private guest memory before the guest is 'finalized'/measured, this ioctl +should only be issued after completing all the necessary setup to put the +guest into a 'finalized' state so that the above semantics can be reliably +ensured. + In some cases, multiple vCPUs might share the page tables. In this case, the ioctl can be called in parallel. 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 950a03e0181e..94e7b5a4fafe 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1305,6 +1305,7 @@ struct kvm_arch { u8 vm_type; bool has_private_mem; bool has_protected_state; + bool pre_fault_allowed; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 901be9e420a4..26ef5b6ac3c1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4743,6 +4743,9 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, u64 end; int r; + if (!vcpu->kvm->arch.pre_fault_allowed) + return -EOPNOTSUPP; + /* * reload is efficient when called repeatedly, so we can do it on * every iteration. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a16c873b3232..6589091e8ce0 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2549,6 +2549,14 @@ static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) data->gctx_paddr = __psp_pa(sev->snp_context); ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); + /* + * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages + * can be given to the guest simply by marking the RMP entry as private. + * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. + */ + if (!ret) + kvm->arch.pre_fault_allowed = true; + kfree(id_auth); e_free_id_block: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c115d26844f7..d6f252555ab3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4949,6 +4949,7 @@ static int svm_vm_init(struct kvm *kvm) to_kvm_sev_info(kvm)->need_init = true; kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); + kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; } if (!pause_filter_count || !pause_filter_thresh) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index af6c8cf6a37a..52778689cef4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12646,6 +12646,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.vm_type = type; kvm->arch.has_private_mem = (type == KVM_X86_SW_PROTECTED_VM); + /* Decided by the vendor code for other VM types. */ + kvm->arch.pre_fault_allowed = + type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; ret = kvm_page_track_init(kvm); if (ret) From d0d87226f535965b4dafc6ef79246456503a4503 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:44 -0400 Subject: [PATCH 0102/1052] KVM: guest_memfd: return folio from __kvm_gmem_get_pfn() Right now this is simply more consistent and avoids use of pfn_to_page() and put_page(). It will be put to more use in upcoming patches, to ensure that the up-to-date flag is set at the very end of both the kvm_gmem_get_pfn() and kvm_gmem_populate() flows. 
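
The calling convention adopted here is the kernel's ERR_PTR()/IS_ERR()
pattern: return the object itself, or an error encoded in the pointer,
instead of an int plus an out-parameter. A small stand-alone sketch of
that pattern, with simplified stand-ins written only for this example
(err_ptr(), is_err(), ptr_err() are not the real kernel macros):

  #include <errno.h>
  #include <stdint.h>
  #include <stdio.h>

  #define MAX_ERRNO 4095
  static void *err_ptr(long err) { return (void *)err; }
  static int is_err(const void *p) { return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO; }
  static long ptr_err(const void *p) { return (long)(intptr_t)p; }

  struct folio { unsigned long pfn; };

  /* Returns the folio on success, or an encoded error on failure. */
  static struct folio *lookup_folio(int fail)
  {
      static struct folio f = { .pfn = 0x1234 };
      return fail ? err_ptr(-EIO) : &f;
  }

  int main(void)
  {
      struct folio *folio = lookup_folio(0);

      if (is_err(folio))
          return (int)-ptr_err(folio);
      printf("pfn %#lx\n", folio->pfn);
      return 0;
  }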
Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- virt/kvm/guest_memfd.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 1c509c351261..522e1b28e7ae 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -541,34 +541,34 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) fput(file); } -static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) +static struct folio * +__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) { pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; struct folio *folio; struct page *page; - int r; if (file != slot->gmem.file) { WARN_ON_ONCE(slot->gmem.file); - return -EFAULT; + return ERR_PTR(-EFAULT); } gmem = file->private_data; if (xa_load(&gmem->bindings, index) != slot) { WARN_ON_ONCE(xa_load(&gmem->bindings, index)); - return -EIO; + return ERR_PTR(-EIO); } folio = kvm_gmem_get_folio(file_inode(file), index, prepare); if (IS_ERR(folio)) - return PTR_ERR(folio); + return folio; if (folio_test_hwpoison(folio)) { folio_unlock(folio); folio_put(folio); - return -EHWPOISON; + return ERR_PTR(-EHWPOISON); } page = folio_file_page(folio, index); @@ -577,25 +577,25 @@ static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, if (max_order) *max_order = 0; - r = 0; - folio_unlock(folio); - - return r; + return folio; } int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order) { struct file *file = kvm_gmem_get_file(slot); - int r; + struct folio *folio; if (!file) return -EFAULT; - r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true); + folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true); fput(file); - return r; + if (IS_ERR(folio)) + return PTR_ERR(folio); + + return 0; } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); @@ -625,6 +625,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); for (i = 0; i < npages; i += (1 << max_order)) { + struct folio *folio; gfn_t gfn = start_gfn + i; kvm_pfn_t pfn; @@ -633,9 +634,11 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false); - if (ret) + folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; + } if (!IS_ALIGNED(gfn, (1 << max_order)) || (npages - i) < (1 << max_order)) @@ -644,7 +647,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long p = src ? src + i * PAGE_SIZE : NULL; ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); - put_page(pfn_to_page(pfn)); + folio_put(folio); if (ret) break; } From d04c77d231223563405e8874fa7edfdc65e545fe Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:45 -0400 Subject: [PATCH 0103/1052] KVM: guest_memfd: delay folio_mark_uptodate() until after successful preparation The up-to-date flag as is now is not too useful; it tells guest_memfd not to overwrite the contents of a folio, but it doesn't say that the page is ready to be mapped into the guest. 
For encrypted guests, mapping a private page requires that the "preparation" phase has succeeded, and at the same time the same page cannot be prepared twice. So, ensure that folio_mark_uptodate() is only called on a prepared page. If kvm_gmem_prepare_folio() or the post_populate callback fail, the folio will not be marked up-to-date; it's not a problem to call clear_highpage() again on such a page prior to the next preparation attempt. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- virt/kvm/guest_memfd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 522e1b28e7ae..1ea632dbae57 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -73,8 +73,6 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool for (i = 0; i < nr_pages; i++) clear_highpage(folio_page(folio, i)); - - folio_mark_uptodate(folio); } if (prepare) { @@ -84,6 +82,8 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool folio_put(folio); return ERR_PTR(r); } + + folio_mark_uptodate(folio); } /* @@ -646,6 +646,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long p = src ? src + i * PAGE_SIZE : NULL; ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); + if (!ret) + folio_mark_uptodate(folio); folio_put(folio); if (ret) From 7fbdda31b0a14ff45809fb27182b98dcbcbad5b0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:46 -0400 Subject: [PATCH 0104/1052] KVM: guest_memfd: do not go through struct page We have a perfectly usable folio, use it to retrieve the pfn and order. All that's needed is a version of folio_file_page that returns a pfn. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- virt/kvm/guest_memfd.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 1ea632dbae57..5221b584288f 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -13,6 +13,18 @@ struct kvm_gmem { struct list_head entry; }; +/** + * folio_file_pfn - like folio_file_page, but return a pfn. + * @folio: The folio which contains this index. + * @index: The index we want to look up. + * + * Return: The pfn for this index. 
+ */ +static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) +{ + return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); +} + static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) { #ifdef CONFIG_HAVE_KVM_GMEM_PREPARE @@ -22,7 +34,6 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol list_for_each_entry(gmem, gmem_list, entry) { struct kvm_memory_slot *slot; struct kvm *kvm = gmem->kvm; - struct page *page; kvm_pfn_t pfn; gfn_t gfn; int rc; @@ -34,13 +45,12 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol if (!slot) continue; - page = folio_file_page(folio, index); - pfn = page_to_pfn(page); + pfn = folio_file_pfn(folio, index); gfn = slot->base_gfn + index - slot->gmem.pgoff; - rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page))); + rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); if (rc) { - pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", - index, gfn, pfn, rc); + pr_warn_ratelimited("gmem: Failed to prepare folio for GFN %llx PFN %llx error %d.\n", + gfn, pfn, rc); return rc; } } @@ -548,7 +558,6 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; struct folio *folio; - struct page *page; if (file != slot->gmem.file) { WARN_ON_ONCE(slot->gmem.file); @@ -571,9 +580,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, return ERR_PTR(-EHWPOISON); } - page = folio_file_page(folio, index); - - *pfn = page_to_pfn(page); + *pfn = folio_file_pfn(folio, index); if (max_order) *max_order = 0; From 564429a6bd8d26065b2cccffcaa9485359f74de7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:47 -0400 Subject: [PATCH 0105/1052] KVM: rename CONFIG_HAVE_KVM_GMEM_* to CONFIG_HAVE_KVM_ARCH_GMEM_* Add "ARCH" to the symbols; shortly, the "prepare" phase will include both the arch-independent step to clear out contents left in the page by the host, and the arch-dependent step enabled by CONFIG_HAVE_KVM_GMEM_PREPARE. For consistency do the same for CONFIG_HAVE_KVM_GMEM_INVALIDATE as well. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 4 ++-- virt/kvm/Kconfig | 4 ++-- virt/kvm/guest_memfd.c | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4287a8071a3a..472a1537b7a9 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -141,8 +141,8 @@ config KVM_AMD_SEV depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM select KVM_GENERIC_PRIVATE_MEM - select HAVE_KVM_GMEM_PREPARE - select HAVE_KVM_GMEM_INVALIDATE + select HAVE_KVM_ARCH_GMEM_PREPARE + select HAVE_KVM_ARCH_GMEM_INVALIDATE help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 52778689cef4..4db18924b2f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13644,7 +13644,7 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); -#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE bool kvm_arch_gmem_prepare_needed(struct kvm *kvm) { return kvm->arch.vm_type == KVM_X86_SNP_VM; @@ -13656,7 +13656,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord } #endif -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) { kvm_x86_call(gmem_invalidate)(start, end); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 689e8be873a7..344d90771844 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2445,7 +2445,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, } #endif /* CONFIG_KVM_PRIVATE_MEM */ -#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); #endif @@ -2477,7 +2477,7 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index b14e14cdbfb9..fd6a3010afa8 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -113,10 +113,10 @@ config KVM_GENERIC_PRIVATE_MEM select KVM_PRIVATE_MEM bool -config HAVE_KVM_GMEM_PREPARE +config HAVE_KVM_ARCH_GMEM_PREPARE bool depends on KVM_PRIVATE_MEM -config HAVE_KVM_GMEM_INVALIDATE +config HAVE_KVM_ARCH_GMEM_INVALIDATE bool depends on KVM_PRIVATE_MEM diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 5221b584288f..76139332f2f3 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -27,7 +27,7 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) { -#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE struct list_head *gmem_list = &inode->i_mapping->i_private_list; struct kvm_gmem *gmem; @@ -353,7 +353,7 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol return MF_DELAYED; } -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE static void kvm_gmem_free_folio(struct folio *folio) { struct page *page = folio_page(folio, 0); @@ -368,7 +368,7 @@ static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE .free_folio = kvm_gmem_free_folio, #endif }; From 78c4293372fe1fe6060727dfbd5643552e3ff86d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:48 -0400 Subject: [PATCH 0106/1052] KVM: guest_memfd: return locked folio from __kvm_gmem_get_pfn Allow testing the up-to-date flag in the caller without taking the lock again. 
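
A toy stand-alone model of the "return the object still locked"
convention; the names are invented and a pthread mutex stands in for
the folio lock:

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  struct record {
      pthread_mutex_t lock;
      bool uptodate;
  };

  static struct record table[1] = {
      { PTHREAD_MUTEX_INITIALIZER, false },
  };

  /* The lookup hands back a locked record so the caller can inspect
   * state (here, an up-to-date flag) before anyone else can touch it. */
  static struct record *get_record_locked(int idx)
  {
      struct record *r = &table[idx];

      pthread_mutex_lock(&r->lock);
      return r; /* returned locked, the caller must unlock */
  }

  int main(void)
  {
      struct record *r = get_record_locked(0);

      if (!r->uptodate)
          r->uptodate = true; /* safe: we still hold the lock */
      pthread_mutex_unlock(&r->lock);
      printf("%d\n", r->uptodate);
      return 0;
  }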
Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- virt/kvm/guest_memfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 76139332f2f3..9271aba9b7b3 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -59,6 +59,7 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol return 0; } +/* Returns a locked folio on success. */ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare) { struct folio *folio; @@ -551,6 +552,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) fput(file); } +/* Returns a locked folio on success. */ static struct folio * __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) @@ -584,7 +586,6 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, if (max_order) *max_order = 0; - folio_unlock(folio); return folio; } @@ -602,6 +603,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, if (IS_ERR(folio)) return PTR_ERR(folio); + folio_unlock(folio); return 0; } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); @@ -647,6 +649,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } + folio_unlock(folio); if (!IS_ALIGNED(gfn, (1 << max_order)) || (npages - i) < (1 << max_order)) max_order = 0; From b85524314a3db687a87b190fd878aa1206b94b52 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:49 -0400 Subject: [PATCH 0107/1052] KVM: guest_memfd: delay kvm_gmem_prepare_folio() until the memory is passed to the guest Initializing the contents of the folio on fallocate() is unnecessarily restrictive. It means that the page is registered with the firmware and then it cannot be touched anymore. In particular, this loses the possibility of using fallocate() to pre-allocate the page for SEV-SNP guests, because kvm_arch_gmem_prepare() then fails. It's only when the guest actually accesses the page (and therefore kvm_gmem_get_pfn() is called) that the page must be cleared from any stale host data and registered with the firmware. The up-to-date flag is clear if this has to be done (i.e. it is the first access and kvm_gmem_populate() has not been called). All in all, there are enough differences between kvm_gmem_get_pfn() and kvm_gmem_populate(), that it's better to separate the two flows completely. Extract the bulk of kvm_gmem_get_folio(), which take a folio and end up setting its up-to-date flag, to a new function kvm_gmem_prepare_folio(); these are now done only by the non-__-prefixed kvm_gmem_get_pfn(). As a bonus, __kvm_gmem_get_pfn() loses its ugly "bool prepare" argument. One difference is that fallocate(PUNCH_HOLE) can now race with a page fault. Potentially this causes a page to be prepared and into the filemap even after fallocate(PUNCH_HOLE). This is harmless, as it can be fixed by another hole punching operation, and can be avoided by clearing the private-page attribute prior to invoking fallocate(PUNCH_HOLE). This way, the page fault will cause an exit to user space. The previous semantics, where fallocate() could be used to prepare the pages in advance of running the guest, can be accessed with KVM_PRE_FAULT_MEMORY. For now, accessing a page in one VM will attempt to call kvm_arch_gmem_prepare() in all of those that have bound the guest_memfd. Cleaning this up is left to a separate patch. 
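
A minimal sketch of the resulting flow, assuming the up-to-date flag is
the only bookkeeping; this is a user-space model with invented names,
not the guest_memfd code:

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  #define PAGE_SIZE 4096

  struct gpage {
      bool uptodate;
      unsigned char data[PAGE_SIZE];
  };

  /* Allocation no longer prepares the page; preparation (clearing stale
   * host data, "registering" it) happens on the first guest access and
   * is recorded with the up-to-date flag. */
  static int prepare_on_first_access(struct gpage *p)
  {
      if (p->uptodate)
          return 0; /* already prepared */

      memset(p->data, 0, sizeof(p->data)); /* drop stale host data */
      /* ...firmware registration would happen here... */
      p->uptodate = true;
      return 0;
  }

  int main(void)
  {
      struct gpage page = { .uptodate = false };

      prepare_on_first_access(&page); /* first fault: prepares */
      prepare_on_first_access(&page); /* later faults: no-op */
      printf("%d\n", page.uptodate);
      return 0;
  }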
Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/guest_memfd.c | 110 ++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 9271aba9b7b3..5af278c7adba 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -25,7 +25,7 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); } -static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) +static int __kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) { #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE struct list_head *gmem_list = &inode->i_mapping->i_private_list; @@ -59,49 +59,63 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol return 0; } -/* Returns a locked folio on success. */ -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare) +/* + * Process @folio, which contains @gfn, so that the guest can use it. + * The folio must be locked and the gfn must be contained in @slot. + * On successful return the guest sees a zero page so as to avoid + * leaking host data and the up-to-date flag is set. + */ +static int kvm_gmem_prepare_folio(struct file *file, struct kvm_memory_slot *slot, + gfn_t gfn, struct folio *folio) { - struct folio *folio; + unsigned long nr_pages, i; + pgoff_t index; + int r; - /* TODO: Support huge pages. */ - folio = filemap_grab_folio(inode->i_mapping, index); - if (IS_ERR(folio)) - return folio; + if (folio_test_uptodate(folio)) + return 0; + + nr_pages = folio_nr_pages(folio); + for (i = 0; i < nr_pages; i++) + clear_highpage(folio_page(folio, i)); /* - * Use the up-to-date flag to track whether or not the memory has been - * zeroed before being handed off to the guest. There is no backing - * storage for the memory, so the folio will remain up-to-date until - * it's removed. + * Preparing huge folios should always be safe, since it should + * be possible to split them later if needed. * - * TODO: Skip clearing pages when trusted firmware will do it when - * assigning memory to the guest. + * Right now the folio order is always going to be zero, but the + * code is ready for huge folios. The only assumption is that + * the base pgoff of memslots is naturally aligned with the + * requested page order, ensuring that huge folios can also use + * huge page table entries for GPA->HPA mapping. + * + * The order will be passed when creating the guest_memfd, and + * checked when creating memslots. */ - if (!folio_test_uptodate(folio)) { - unsigned long nr_pages = folio_nr_pages(folio); - unsigned long i; - - for (i = 0; i < nr_pages; i++) - clear_highpage(folio_page(folio, i)); - } - - if (prepare) { - int r = kvm_gmem_prepare_folio(inode, index, folio); - if (r < 0) { - folio_unlock(folio); - folio_put(folio); - return ERR_PTR(r); - } + WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); + index = gfn - slot->base_gfn + slot->gmem.pgoff; + index = ALIGN_DOWN(index, 1 << folio_order(folio)); + r = __kvm_gmem_prepare_folio(file_inode(file), index, folio); + if (!r) folio_mark_uptodate(folio); - } - /* - * Ignore accessed, referenced, and dirty flags. The memory is - * unevictable and there is no storage to write back to. - */ - return folio; + return r; +} + +/* + * Returns a locked folio on success. 
The caller is responsible for + * setting the up-to-date flag before the memory is mapped into the guest. + * There is no backing storage for the memory, so the folio will remain + * up-to-date until it's removed. + * + * Ignore accessed, referenced, and dirty flags. The memory is + * unevictable and there is no storage to write back to. + */ +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +{ + /* TODO: Support huge pages. */ + return filemap_grab_folio(inode->i_mapping, index); } static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, @@ -201,7 +215,7 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) break; } - folio = kvm_gmem_get_folio(inode, index, true); + folio = kvm_gmem_get_folio(inode, index); if (IS_ERR(folio)) { r = PTR_ERR(folio); break; @@ -555,7 +569,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) /* Returns a locked folio on success. */ static struct folio * __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) { pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; @@ -572,7 +586,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, return ERR_PTR(-EIO); } - folio = kvm_gmem_get_folio(file_inode(file), index, prepare); + folio = kvm_gmem_get_folio(file_inode(file), index); if (IS_ERR(folio)) return folio; @@ -594,17 +608,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, { struct file *file = kvm_gmem_get_file(slot); struct folio *folio; + int r = 0; if (!file) return -EFAULT; - folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true); - fput(file); - if (IS_ERR(folio)) - return PTR_ERR(folio); + folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order); + if (IS_ERR(folio)) { + r = PTR_ERR(folio); + goto out; + } + r = kvm_gmem_prepare_folio(file, slot, gfn, folio); folio_unlock(folio); - return 0; + if (r < 0) + folio_put(folio); + +out: + fput(file); + return r; } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); @@ -643,7 +665,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false); + folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; From 6dd761d92f6600342860c618639489bb9c3843ea Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:50 -0400 Subject: [PATCH 0108/1052] KVM: guest_memfd: make kvm_gmem_prepare_folio() operate on a single struct kvm This is now possible because preparation is done by kvm_gmem_get_pfn() instead of fallocate(). In practice this is not a limitation, because even though guest_memfd can be bound to multiple struct kvm, for hardware implementations of confidential computing only one guest (identified by an ASID on SEV-SNP, or an HKID on TDX) will be able to access it. In the case of intra-host migration (not implemented yet for SEV-SNP, but we can use SEV-ES as an idea of how it will work), the new struct kvm inherits the same ASID and preparation need not be repeated. 
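
A simplified model of the narrowed interface; the types and names below
are invented for this sketch, and it only shows that the caller now
passes the one (kvm, slot) pair it already holds instead of walking all
bindings of the file:

  #include <stdio.h>

  struct kvm { int id; };
  struct slot { long base_gfn; long pgoff; };

  /* Stand-in for the per-architecture prepare hook. */
  static int arch_prepare(struct kvm *kvm, long gfn, unsigned long pfn)
  {
      printf("prepare gfn %ld pfn %#lx for guest %d\n", gfn, pfn, kvm->id);
      return 0;
  }

  /* The caller already knows which guest faulted, so preparation takes
   * that guest and slot directly. */
  static int prepare_folio(struct kvm *kvm, struct slot *slot,
                           long index, unsigned long pfn)
  {
      long gfn = slot->base_gfn + index - slot->pgoff;

      return arch_prepare(kvm, gfn, pfn);
  }

  int main(void)
  {
      struct kvm vm = { .id = 1 };
      struct slot s = { .base_gfn = 0x100, .pgoff = 0 };

      return prepare_folio(&vm, &s, 4, 0xabcd);
  }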
Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- virt/kvm/guest_memfd.c | 47 ++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 5af278c7adba..444ded162154 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -25,37 +25,27 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); } -static int __kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) +static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, + pgoff_t index, struct folio *folio) { #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE - struct list_head *gmem_list = &inode->i_mapping->i_private_list; - struct kvm_gmem *gmem; + kvm_pfn_t pfn; + gfn_t gfn; + int rc; - list_for_each_entry(gmem, gmem_list, entry) { - struct kvm_memory_slot *slot; - struct kvm *kvm = gmem->kvm; - kvm_pfn_t pfn; - gfn_t gfn; - int rc; + if (!kvm_arch_gmem_prepare_needed(kvm)) + return 0; - if (!kvm_arch_gmem_prepare_needed(kvm)) - continue; - - slot = xa_load(&gmem->bindings, index); - if (!slot) - continue; - - pfn = folio_file_pfn(folio, index); - gfn = slot->base_gfn + index - slot->gmem.pgoff; - rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); - if (rc) { - pr_warn_ratelimited("gmem: Failed to prepare folio for GFN %llx PFN %llx error %d.\n", - gfn, pfn, rc); - return rc; - } + pfn = folio_file_pfn(folio, index); + gfn = slot->base_gfn + index - slot->gmem.pgoff; + rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); + if (rc) { + pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", + index, gfn, pfn, rc); + return rc; } - #endif + return 0; } @@ -65,7 +55,7 @@ static int __kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct f * On successful return the guest sees a zero page so as to avoid * leaking host data and the up-to-date flag is set. */ -static int kvm_gmem_prepare_folio(struct file *file, struct kvm_memory_slot *slot, +static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, struct folio *folio) { unsigned long nr_pages, i; @@ -95,8 +85,7 @@ static int kvm_gmem_prepare_folio(struct file *file, struct kvm_memory_slot *slo WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); index = gfn - slot->base_gfn + slot->gmem.pgoff; index = ALIGN_DOWN(index, 1 << folio_order(folio)); - - r = __kvm_gmem_prepare_folio(file_inode(file), index, folio); + r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); if (!r) folio_mark_uptodate(folio); @@ -619,7 +608,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, goto out; } - r = kvm_gmem_prepare_folio(file, slot, gfn, folio); + r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); folio_unlock(folio); if (r < 0) folio_put(folio); From 7239ed74677af143857d1a96d402476446a0995a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:51 -0400 Subject: [PATCH 0109/1052] KVM: remove kvm_arch_gmem_prepare_needed() It is enough to return 0 if a guest need not do any preparation. This is in fact how sev_gmem_prepare() works for non-SNP guests, and it extends naturally to Intel hosts: the x86 callback for gmem_prepare is optional and returns 0 if not defined. 
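
A minimal sketch of the optional-hook convention this relies on; the
gmem_ops structure and gmem_prepare() helper are invented for this
example:

  #include <stdio.h>

  struct gmem_ops {
      int (*prepare)(long gfn, unsigned long pfn, int order);
  };

  /* If an implementation provides no prepare callback, the generic code
   * simply reports success instead of consulting a separate
   * "is preparation needed" predicate. */
  static int gmem_prepare(const struct gmem_ops *ops, long gfn,
                          unsigned long pfn, int order)
  {
      if (!ops->prepare)
          return 0; /* nothing to do for this VM type */
      return ops->prepare(gfn, pfn, order);
  }

  int main(void)
  {
      struct gmem_ops plain = { .prepare = NULL };

      printf("%d\n", gmem_prepare(&plain, 0x100, 0xabcd, 0)); /* prints 0 */
      return 0;
  }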
Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 5 ----- include/linux/kvm_host.h | 1 - virt/kvm/guest_memfd.c | 13 +++---------- 3 files changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4db18924b2f2..ef3d3511e4af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13645,11 +13645,6 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_arch_no_poll); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE -bool kvm_arch_gmem_prepare_needed(struct kvm *kvm) -{ - return kvm->arch.vm_type == KVM_X86_SNP_VM; -} - int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) { return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 344d90771844..45373d42f314 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2447,7 +2447,6 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); -bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); #endif /** diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 444ded162154..191fd42067c0 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -29,16 +29,9 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo pgoff_t index, struct folio *folio) { #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE - kvm_pfn_t pfn; - gfn_t gfn; - int rc; - - if (!kvm_arch_gmem_prepare_needed(kvm)) - return 0; - - pfn = folio_file_pfn(folio, index); - gfn = slot->base_gfn + index - slot->gmem.pgoff; - rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); + kvm_pfn_t pfn = folio_file_pfn(folio, index); + gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff; + int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); if (rc) { pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", index, gfn, pfn, rc); From de80252414f32db31eaa14baef511e9bd96021cd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:52 -0400 Subject: [PATCH 0110/1052] KVM: guest_memfd: move check for already-populated page to common code Do not allow populating the same page twice with startup data. In the case of SEV-SNP, for example, the firmware does not allow it anyway, since the launch-update operation is only possible on pages that are still shared in the RMP. Even if it worked, kvm_gmem_populate()'s callback is meant to have side effects such as updating launch measurements, and updating the same page twice is unlikely to have the desired results. Races between calls to the ioctl are not possible because kvm_gmem_populate() holds slots_lock and the VM should not be running. But again, even if this worked on other confidential computing technology, it doesn't matter to guest_memfd.c whether this is something fishy such as missing synchronization in userspace, or rather something intentional. One of the racers wins, and the page is initialized by either kvm_gmem_prepare_folio() or kvm_gmem_populate(). Anyway, out of paranoia, adjust sev_gmem_post_populate() anyway to use the same errno that kvm_gmem_populate() is using. 
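The common-code check boils down to "initialize at most once". A small self-contained sketch (userspace C; the boolean stands in for the folio's up-to-date flag):

    #include <stdio.h>
    #include <errno.h>
    #include <stdbool.h>

    /* whichever racer comes second sees the flag already set and gets -EEXIST */
    static int populate_once(bool *uptodate)
    {
        if (*uptodate)
            return -EEXIST;
        *uptodate = true;           /* copy in startup data, measure, etc. */
        return 0;
    }

    int main(void)
    {
        bool page = false;

        printf("%d\n", populate_once(&page));   /* 0 */
        printf("%d\n", populate_once(&page));   /* -EEXIST (-17 on Linux) */
        return 0;
    }
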
Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 +- virt/kvm/guest_memfd.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 6589091e8ce0..752d2fff0f10 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2290,7 +2290,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf if (ret || assigned) { pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", __func__, gfn, ret, assigned); - ret = -EINVAL; + ret = ret ? -EINVAL : -EEXIST; goto err; } diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 191fd42067c0..319ec491ca47 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -653,6 +653,13 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); + ret = -EEXIST; + break; + } + folio_unlock(folio); if (!IS_ALIGNED(gfn, (1 << max_order)) || (npages - i) < (1 << max_order)) From e300614f10bd2f33252c8ba40b34d6c3fbf95d72 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:53 -0400 Subject: [PATCH 0111/1052] KVM: cleanup and add shortcuts to kvm_range_has_memory_attributes() Use a guard to simplify early returns, and add two more easy shortcuts. If the requested attributes are invalid, the attributes xarray will never show them as set. And if testing a single page, kvm_get_memory_attributes() is more efficient. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d0788d0a72cc..30328ff2d840 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2398,6 +2398,14 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static u64 kvm_supported_mem_attributes(struct kvm *kvm) +{ + if (!kvm || kvm_arch_has_private_mem(kvm)) + return KVM_MEMORY_ATTRIBUTE_PRIVATE; + + return 0; +} + /* * Returns true if _all_ gfns in the range [@start, @end) have attributes * matching @attrs. 
@@ -2406,40 +2414,30 @@ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long attrs) { XA_STATE(xas, &kvm->mem_attr_array, start); + unsigned long mask = kvm_supported_mem_attributes(kvm); unsigned long index; - bool has_attrs; void *entry; - rcu_read_lock(); + if (attrs & ~mask) + return false; - if (!attrs) { - has_attrs = !xas_find(&xas, end - 1); - goto out; - } + if (end == start + 1) + return kvm_get_memory_attributes(kvm, start) == attrs; + + guard(rcu)(); + if (!attrs) + return !xas_find(&xas, end - 1); - has_attrs = true; for (index = start; index < end; index++) { do { entry = xas_next(&xas); } while (xas_retry(&xas, entry)); - if (xas.xa_index != index || xa_to_value(entry) != attrs) { - has_attrs = false; - break; - } + if (xas.xa_index != index || xa_to_value(entry) != attrs) + return false; } -out: - rcu_read_unlock(); - return has_attrs; -} - -static u64 kvm_supported_mem_attributes(struct kvm *kvm) -{ - if (!kvm || kvm_arch_has_private_mem(kvm)) - return KVM_MEMORY_ATTRIBUTE_PRIVATE; - - return 0; + return true; } static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, From 4b5f67120a88c713b82907d55a767693382e9e9d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:54 -0400 Subject: [PATCH 0112/1052] KVM: extend kvm_range_has_memory_attributes() to check subset of attributes While currently there is no other attribute than KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM code such as kvm_mem_is_private() is written to expect their existence. Allow using kvm_range_has_memory_attributes() as a multi-page version of kvm_mem_is_private(), without it breaking later when more attributes are introduced. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 13 +++++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 26ef5b6ac3c1..43a02891babb 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7513,7 +7513,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); if (level == PG_LEVEL_2M) - return kvm_range_has_memory_attributes(kvm, start, end, attrs); + return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs); for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { if (hugepage_test_mixed(slot, gfn, level - 1) || diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 45373d42f314..c223b97df03e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2414,7 +2414,7 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn } bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, - unsigned long attrs); + unsigned long mask, unsigned long attrs); bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 30328ff2d840..92901656a0d4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2408,21 +2408,21 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm) /* * Returns true if _all_ gfns in the range [@start, @end) have attributes - * matching @attrs. + * such that the bits in @mask match @attrs. 
*/ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, - unsigned long attrs) + unsigned long mask, unsigned long attrs) { XA_STATE(xas, &kvm->mem_attr_array, start); - unsigned long mask = kvm_supported_mem_attributes(kvm); unsigned long index; void *entry; + mask &= kvm_supported_mem_attributes(kvm); if (attrs & ~mask) return false; if (end == start + 1) - return kvm_get_memory_attributes(kvm, start) == attrs; + return (kvm_get_memory_attributes(kvm, start) & mask) == attrs; guard(rcu)(); if (!attrs) @@ -2433,7 +2433,8 @@ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, entry = xas_next(&xas); } while (xas_retry(&xas, entry)); - if (xas.xa_index != index || xa_to_value(entry) != attrs) + if (xas.xa_index != index || + (xa_to_value(entry) & mask) != attrs) return false; } @@ -2532,7 +2533,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, mutex_lock(&kvm->slots_lock); /* Nothing to do if the entire range as the desired attributes. */ - if (kvm_range_has_memory_attributes(kvm, start, end, attributes)) + if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes)) goto out_unlock; /* From e4ee5447927377c55777b73fe497a2455a25f948 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:55 -0400 Subject: [PATCH 0113/1052] KVM: guest_memfd: let kvm_gmem_populate() operate only on private gfns This check is currently performed by sev_gmem_post_populate(), but it applies to all callers of kvm_gmem_populate(): the point of the function is that the memory is being encrypted and some work has to be done on all the gfns in order to encrypt them. Therefore, check the KVM_MEMORY_ATTRIBUTE_PRIVATE attribute prior to invoking the callback, and stop the operation if a shared page is encountered. Because CONFIG_KVM_PRIVATE_MEM in principle does not require attributes, this makes kvm_gmem_populate() depend on CONFIG_KVM_GENERIC_PRIVATE_MEM (which does require them). 
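A toy model of the loop this adds (plain C; the bool array stands in for the memory-attributes xarray): the mapping order shrinks until the whole 2^order block at the gfn is private, and the operation fails if even a single page is shared.

    #include <stdio.h>
    #include <stdbool.h>

    static bool range_private(const bool *private, long start, long end)
    {
        for (long g = start; g < end; g++)
            if (!private[g])
                return false;
        return true;
    }

    static int pick_order(const bool *private, long gfn, int max_order)
    {
        while (!range_private(private, gfn, gfn + (1L << max_order))) {
            if (!max_order)
                return -1;          /* -EINVAL in the real code */
            max_order--;
        }
        return max_order;
    }

    int main(void)
    {
        bool private[8] = { true, true, true, true, true, true, false, true };

        /* the 4-page block at gfn 4 is not all private, so order drops to 1 */
        printf("%d\n", pick_order(private, 4, 2));
        return 0;
    }
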
Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 7 ------- include/linux/kvm_host.h | 2 ++ virt/kvm/guest_memfd.c | 12 ++++++++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 752d2fff0f10..532df12b43c5 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2279,13 +2279,6 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf bool assigned; int level; - if (!kvm_mem_is_private(kvm, gfn)) { - pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n", - __func__, gfn); - ret = -EINVAL; - goto err; - } - ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); if (ret || assigned) { pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c223b97df03e..79a6b1a63027 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2449,6 +2449,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); #endif +#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM /** * kvm_gmem_populate() - Populate/prepare a GPA range with guest data * @@ -2475,6 +2476,7 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); +#endif #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 319ec491ca47..95e338a29910 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -612,6 +612,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); +#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { @@ -665,11 +666,21 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long (npages - i) < (1 << max_order)) max_order = 0; + ret = -EINVAL; + while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), + KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_ATTRIBUTE_PRIVATE)) { + if (!max_order) + goto put_folio_and_exit; + max_order--; + } + p = src ? src + i * PAGE_SIZE : NULL; ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); if (!ret) folio_mark_uptodate(folio); +put_folio_and_exit: folio_put(folio); if (ret) break; @@ -681,3 +692,4 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long return ret && !i ? ret : i; } EXPORT_SYMBOL_GPL(kvm_gmem_populate); +#endif From 66a644c09fbed0a19f3708895c1180df78181019 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 26 Jul 2024 13:45:36 -0400 Subject: [PATCH 0114/1052] KVM: guest_memfd: abstract how prepared folios are recorded Right now, large folios are not supported in guest_memfd, and therefore the order used by kvm_gmem_populate() is always 0. In this scenario, using the up-to-date bit to track prepared-ness is nice and easy because we have one bit available per page. 
In the future, however, we might have large pages that are partially populated; for example, in the case of SEV-SNP, if a large page has both shared and private areas inside, it is necessary to populate it at a granularity that is smaller than that of the guest_memfd's backing store. In that case we will have to track preparedness at a 4K level, probably as a bitmap. In preparation for that, do not use explicitly folio_test_uptodate() and folio_mark_uptodate(). Return the state of the page directly from __kvm_gmem_get_pfn(), so that it is expected to apply to 2^N pages with N=*max_order. The function to mark a range as prepared for now takes just a folio, but is expected to take also an index and order (or something like that) when large pages are introduced. Thanks to Michael Roth for pointing out the issue with large pages. Signed-off-by: Paolo Bonzini --- virt/kvm/guest_memfd.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 95e338a29910..8f079a61a56d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -42,6 +42,11 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo return 0; } +static inline void kvm_gmem_mark_prepared(struct folio *folio) +{ + folio_mark_uptodate(folio); +} + /* * Process @folio, which contains @gfn, so that the guest can use it. * The folio must be locked and the gfn must be contained in @slot. @@ -55,9 +60,6 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, pgoff_t index; int r; - if (folio_test_uptodate(folio)) - return 0; - nr_pages = folio_nr_pages(folio); for (i = 0; i < nr_pages; i++) clear_highpage(folio_page(folio, i)); @@ -80,7 +82,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, index = ALIGN_DOWN(index, 1 << folio_order(folio)); r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); if (!r) - folio_mark_uptodate(folio); + kvm_gmem_mark_prepared(folio); return r; } @@ -551,7 +553,8 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) /* Returns a locked folio on success. 
*/ static struct folio * __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order) + gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared, + int *max_order) { pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; @@ -582,6 +585,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, if (max_order) *max_order = 0; + *is_prepared = folio_test_uptodate(folio); return folio; } @@ -590,18 +594,21 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, { struct file *file = kvm_gmem_get_file(slot); struct folio *folio; + bool is_prepared = false; int r = 0; if (!file) return -EFAULT; - folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order); + folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, &is_prepared, max_order); if (IS_ERR(folio)) { r = PTR_ERR(folio); goto out; } - r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); + if (!is_prepared) + r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); + folio_unlock(folio); if (r < 0) folio_put(folio); @@ -641,6 +648,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long for (i = 0; i < npages; i += (1 << max_order)) { struct folio *folio; gfn_t gfn = start_gfn + i; + bool is_prepared = false; kvm_pfn_t pfn; if (signal_pending(current)) { @@ -648,13 +656,13 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order); + folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &is_prepared, &max_order); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; } - if (folio_test_uptodate(folio)) { + if (is_prepared) { folio_unlock(folio); folio_put(folio); ret = -EEXIST; @@ -662,9 +670,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long } folio_unlock(folio); - if (!IS_ALIGNED(gfn, (1 << max_order)) || - (npages - i) < (1 << max_order)) - max_order = 0; + WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) || + (npages - i) < (1 << max_order)); ret = -EINVAL; while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), @@ -678,7 +685,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long p = src ? src + i * PAGE_SIZE : NULL; ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); if (!ret) - folio_mark_uptodate(folio); + kvm_gmem_mark_prepared(folio); put_folio_and_exit: folio_put(folio); From 81a91abab1307d7725fa4620952c0767beae7753 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 23 Jul 2024 14:45:08 +0800 Subject: [PATCH 0115/1052] irqchip/loongarch-cpu: Fix return value of lpic_gsi_to_irq() lpic_gsi_to_irq() should return a valid Linux interrupt number if acpi_register_gsi() succeeds, and return 0 otherwise. But lpic_gsi_to_irq() converts a negative return value of acpi_register_gsi() to a positive value silently. Convert the return value explicitly. 
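The pitfall is ordinary C integer conversion and can be reproduced outside the kernel. A standalone sketch with an invented fake_register_gsi() helper:

    #include <stdio.h>
    #include <stdint.h>

    static int fake_register_gsi(uint32_t gsi)
    {
        return gsi == 7 ? 42 : -22;     /* pretend only GSI 7 maps; -22 ~ -EINVAL */
    }

    static uint32_t gsi_to_irq_buggy(uint32_t gsi)
    {
        return fake_register_gsi(gsi);  /* -22 silently becomes 4294967274 */
    }

    static uint32_t gsi_to_irq_fixed(uint32_t gsi)
    {
        int irq = fake_register_gsi(gsi);

        return irq > 0 ? (uint32_t)irq : 0;   /* 0 means "no mapping" */
    }

    int main(void)
    {
        printf("buggy: %u\n", gsi_to_irq_buggy(8));
        printf("fixed: %u\n", gsi_to_irq_fixed(8));
        printf("fixed: %u\n", gsi_to_irq_fixed(7));
        return 0;
    }
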
Fixes: e8bba72b396c ("irqchip / ACPI: Introduce ACPI_IRQ_MODEL_LPIC for LoongArch") Reported-by: Miao Wang Signed-off-by: Huacai Chen Signed-off-by: Thomas Gleixner Reviewed-by: Jiaxun Yang Cc: Link: https://lore.kernel.org/r/20240723064508.35560-1-chenhuacai@loongson.cn --- drivers/irqchip/irq-loongarch-cpu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-loongarch-cpu.c b/drivers/irqchip/irq-loongarch-cpu.c index 9d8f2c406043..b35903a06902 100644 --- a/drivers/irqchip/irq-loongarch-cpu.c +++ b/drivers/irqchip/irq-loongarch-cpu.c @@ -18,11 +18,13 @@ struct fwnode_handle *cpuintc_handle; static u32 lpic_gsi_to_irq(u32 gsi) { + int irq = 0; + /* Only pch irqdomain transferring is required for LoongArch. */ if (gsi >= GSI_MIN_PCH_IRQ && gsi <= GSI_MAX_PCH_IRQ) - return acpi_register_gsi(NULL, gsi, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_HIGH); + irq = acpi_register_gsi(NULL, gsi, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_HIGH); - return 0; + return (irq > 0) ? irq : 0; } static struct fwnode_handle *lpic_get_gsi_domain_id(u32 gsi) From b34ce4a59cfe9cd0d6f870e6408e8ec88a964585 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 17 Jul 2024 22:03:32 +0200 Subject: [PATCH 0116/1052] power: supply: axp288_charger: Fix constant_charge_voltage writes info->max_cv is in millivolts, divide the microvolt value being written to constant_charge_voltage by 1000 *before* clamping it to info->max_cv. Before this fix the code always tried to set constant_charge_voltage to max_cv / 1000 = 4 millivolt, which ends up in setting it to 4.1V which is the lowest supported value. Fixes: 843735b788a4 ("power: axp288_charger: axp288 charger driver") Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240717200333.56669-1-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/axp288_charger.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/axp288_charger.c b/drivers/power/supply/axp288_charger.c index b5903193e2f9..aea17289a178 100644 --- a/drivers/power/supply/axp288_charger.c +++ b/drivers/power/supply/axp288_charger.c @@ -337,8 +337,8 @@ static int axp288_charger_usb_set_property(struct power_supply *psy, } break; case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE: - scaled_val = min(val->intval, info->max_cv); - scaled_val = DIV_ROUND_CLOSEST(scaled_val, 1000); + scaled_val = DIV_ROUND_CLOSEST(val->intval, 1000); + scaled_val = min(scaled_val, info->max_cv); ret = axp288_charger_set_cv(info, scaled_val); if (ret < 0) { dev_warn(&info->pdev->dev, "set charge voltage failed\n"); From 81af7f2342d162e24ac820c10e68684d9f927663 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 17 Jul 2024 22:03:33 +0200 Subject: [PATCH 0117/1052] power: supply: axp288_charger: Round constant_charge_voltage writes down Round constant_charge_voltage writes down to the first supported lower value, rather then rounding them up to the first supported higher value. This fixes e.g. writing 4250000 resulting in a value of 4350000 which might be dangerous, instead writing 4250000 will now result in a safe 4200000 value. 
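Both axp288 fixes amount to "convert first, clamp second, round down to a supported step". A standalone sketch (plain C, plain division in place of DIV_ROUND_CLOSEST, thresholds mirroring the 4.10/4.15/4.20/4.35 V settings):

    #include <stdio.h>

    static int pick_cv_mv(int req_uv, int max_cv_mv)
    {
        int cv = req_uv / 1000;             /* microvolts -> millivolts first */

        if (cv > max_cv_mv)
            cv = max_cv_mv;                 /* then clamp to the charger max */

        if (cv >= 4350)                     /* then round down to a real step */
            return 4350;
        if (cv >= 4200)
            return 4200;
        if (cv >= 4150)
            return 4150;
        return 4100;                        /* lowest supported value */
    }

    int main(void)
    {
        printf("%d\n", pick_cv_mv(4250000, 4350));   /* 4200, not 4350 */
        printf("%d\n", pick_cv_mv(4400000, 4200));   /* clamped to 4200 */
        return 0;
    }
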
Fixes: 843735b788a4 ("power: axp288_charger: axp288 charger driver") Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240717200333.56669-2-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/axp288_charger.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/power/supply/axp288_charger.c b/drivers/power/supply/axp288_charger.c index aea17289a178..ac05942e4e6a 100644 --- a/drivers/power/supply/axp288_charger.c +++ b/drivers/power/supply/axp288_charger.c @@ -178,18 +178,18 @@ static inline int axp288_charger_set_cv(struct axp288_chrg_info *info, int cv) u8 reg_val; int ret; - if (cv <= CV_4100MV) { - reg_val = CHRG_CCCV_CV_4100MV; - cv = CV_4100MV; - } else if (cv <= CV_4150MV) { - reg_val = CHRG_CCCV_CV_4150MV; - cv = CV_4150MV; - } else if (cv <= CV_4200MV) { - reg_val = CHRG_CCCV_CV_4200MV; - cv = CV_4200MV; - } else { + if (cv >= CV_4350MV) { reg_val = CHRG_CCCV_CV_4350MV; cv = CV_4350MV; + } else if (cv >= CV_4200MV) { + reg_val = CHRG_CCCV_CV_4200MV; + cv = CV_4200MV; + } else if (cv >= CV_4150MV) { + reg_val = CHRG_CCCV_CV_4150MV; + cv = CV_4150MV; + } else { + reg_val = CHRG_CCCV_CV_4100MV; + cv = CV_4100MV; } reg_val = reg_val << CHRG_CCCV_CV_BIT_POS; From bf9d5cb588755ee41ac12a8976dccf44ae18281b Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 15 Jul 2024 14:57:06 +0200 Subject: [PATCH 0118/1052] power: supply: qcom_battmgr: return EAGAIN when firmware service is not up The driver returns -ENODEV when the firmware battmrg service hasn't started yet, while per-se -ENODEV is fine, we usually use -EAGAIN to tell the user to retry again later. And the power supply core uses -EGAIN when the device isn't initialized, let's use the same return. This notably causes an infinite spam of: thermal thermal_zoneXX: failed to read out thermal zone (-19) because the thermal core doesn't understand -ENODEV, but only considers -EAGAIN as a non-fatal error. While it didn't appear until now, commit [1] fixes thermal core and no more ignores thermal zones returning an error at first temperature update. 
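From the consumer's point of view the distinction looks roughly like this (toy userspace C, not the thermal core): -EAGAIN means "not ready yet, poll again", anything else is treated as a hard failure.

    #include <stdio.h>
    #include <errno.h>

    static int read_temp(int service_up, int *temp)
    {
        if (!service_up)
            return -EAGAIN;     /* was -ENODEV, which callers took as fatal */
        *temp = 42;
        return 0;
    }

    int main(void)
    {
        int temp, ret;

        for (int up = 0; up <= 1; up++) {
            ret = read_temp(up, &temp);
            if (ret == -EAGAIN)
                printf("not ready, retry later\n");
            else if (ret)
                printf("fatal error %d\n", ret);
            else
                printf("temp=%d\n", temp);
        }
        return 0;
    }
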
[1] 5725f40698b9 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid") Link: https://lore.kernel.org/all/2ed4c630-204a-4f80-a37f-f2ca838eb455@linaro.org/ Cc: stable@vger.kernel.org Fixes: 29e8142b5623 ("power: supply: Introduce Qualcomm PMIC GLINK power supply") Signed-off-by: Neil Armstrong Tested-by: Stephan Gerhold Reviewed-by: Stephan Gerhold Link: https://lore.kernel.org/r/20240715-topic-sm8x50-upstream-fix-battmgr-temp-tz-warn-v1-1-16e842ccead7@linaro.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_battmgr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index ec163d1bcd18..44c6301f5f17 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -486,7 +486,7 @@ static int qcom_battmgr_bat_get_property(struct power_supply *psy, int ret; if (!battmgr->service_up) - return -ENODEV; + return -EAGAIN; if (battmgr->variant == QCOM_BATTMGR_SC8280XP) ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); @@ -683,7 +683,7 @@ static int qcom_battmgr_ac_get_property(struct power_supply *psy, int ret; if (!battmgr->service_up) - return -ENODEV; + return -EAGAIN; ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); if (ret) @@ -748,7 +748,7 @@ static int qcom_battmgr_usb_get_property(struct power_supply *psy, int ret; if (!battmgr->service_up) - return -ENODEV; + return -EAGAIN; if (battmgr->variant == QCOM_BATTMGR_SC8280XP) ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); @@ -867,7 +867,7 @@ static int qcom_battmgr_wls_get_property(struct power_supply *psy, int ret; if (!battmgr->service_up) - return -ENODEV; + return -EAGAIN; if (battmgr->variant == QCOM_BATTMGR_SC8280XP) ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); From d6cca7631a4b54a8995e3bc53e5afb11d3b0c8ff Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Fri, 12 Jul 2024 12:00:03 +0200 Subject: [PATCH 0119/1052] power: supply: qcom_battmgr: Ignore extra __le32 in info payload Some newer ADSP firmware versions on X1E80100 report an extra __le32 at the end of the battery information request payload, causing qcom_battmgr to fail to initialize. Adjust the check to ignore the extra field in the info payload so we can support both old and newer firmware versions. 
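A minimal sketch of the tolerant length check (standalone C, with an invented reply layout): the exact size and the exact size plus one trailing 32-bit word are both accepted, anything else is still rejected.

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    struct info_resp { uint32_t a, b, c; };     /* stand-in for the real reply */

    static int payload_len_ok(size_t len)
    {
        return len == sizeof(struct info_resp) ||
               len == sizeof(struct info_resp) + sizeof(uint32_t);
    }

    int main(void)
    {
        printf("%d %d %d\n",
               payload_len_ok(sizeof(struct info_resp)),        /* old firmware */
               payload_len_ok(sizeof(struct info_resp) + 4),    /* new firmware */
               payload_len_ok(sizeof(struct info_resp) + 8));   /* still rejected */
        return 0;
    }
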
Tested-by: Srinivas Kandagatla Signed-off-by: Stephan Gerhold Tested-by: Johan Hovold Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240712-x1e80100-battmgr-v1-1-a253d767f493@linaro.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_battmgr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index 44c6301f5f17..a99937cd733b 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -1007,7 +1007,9 @@ static void qcom_battmgr_sc8280xp_callback(struct qcom_battmgr *battmgr, battmgr->error = 0; break; case BATTMGR_BAT_INFO: - if (payload_len != sizeof(resp->info)) { + /* some firmware versions report an extra __le32 at the end of the payload */ + if (payload_len != sizeof(resp->info) && + payload_len != (sizeof(resp->info) + sizeof(__le32))) { dev_warn(battmgr->dev, "invalid payload length for battery information request: %zd\n", payload_len); From 697943657444a7d7123b47bc32019e62533f4863 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 25 Jul 2024 10:03:54 -0700 Subject: [PATCH 0120/1052] fbnic: Change kconfig prompt from S390=n to !S390 In testing the recent kernel I found that the fbnic driver couldn't be enabled on x86_64 builds. A bit of digging showed that the fbnic driver was the only one to check for S390 to be n, all others had checked for !S390. Since it is a boolean and not a tristate I am not sure it will be N. So just update it to use the !S390 flag. A quick check via "make menuconfig" verified that after making this change there was an option to select the fbnic driver. Fixes 0e03c643dc93 ("eth: fbnic: fix s390 build.") Signed-off-by: Alexander Duyck Reviewed-by: Joe Damato Link: https://patch.msgid.link/172192698293.1903337.4255690118685300353.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/Kconfig b/drivers/net/ethernet/meta/Kconfig index 86034ea4ba5b..c002ede36402 100644 --- a/drivers/net/ethernet/meta/Kconfig +++ b/drivers/net/ethernet/meta/Kconfig @@ -20,7 +20,7 @@ if NET_VENDOR_META config FBNIC tristate "Meta Platforms Host Network Interface" depends on X86_64 || COMPILE_TEST - depends on S390=n + depends on !S390 depends on MAX_SKB_FRAGS < 22 depends on PCI_MSI select PHYLINK From 01aa8c869d0cdaf603f42dc1d2302b164c25353a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 27 Jul 2024 16:58:24 +0100 Subject: [PATCH 0121/1052] blk-throttle: remove more latency dead-code The struct 'latency_bucket' and the #define 'request_bucket_index' are unused since commit bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") and the 'LATENCY_BUCKET_SIZE' #define was only used by the 'request_bucket_index' define. Remove them. Signed-off-by: Dr. 
David Alan Gilbert Link: https://lore.kernel.org/r/20240727155824.1000042-1-linux@treblig.org Signed-off-by: Jens Axboe --- block/blk-throttle.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index dc6140fa3de0..6943ec720f39 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -31,14 +31,6 @@ static struct workqueue_struct *kthrotld_workqueue; #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* We measure latency for request size from <= 4k to >= 1M */ -#define LATENCY_BUCKET_SIZE 9 - -struct latency_bucket { - unsigned long total_latency; /* ns / 1024 */ - int samples; -}; - struct throtl_data { /* service tree for active throtl groups */ @@ -116,9 +108,6 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) return tg->iops[rw]; } -#define request_bucket_index(sectors) \ - clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) - /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported From fe26546aeb35c5d1fd69530bb6bfd04e0b5cd489 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Thu, 4 Jul 2024 08:14:15 +0800 Subject: [PATCH 0122/1052] drm/amdgpu/pm: support gpu_metrics sysfs interface for smu v14.0.2/3 support gpu_metrics sysfs interface for smu v14.0.2/3 Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 60c30ba7ba2064066ec462236666058cbbf619c1) --- .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 86 ++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 98ea58d792ca..e1a27903c80a 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -66,6 +66,7 @@ #define MP0_MP1_DATA_REGION_SIZE_COMBOPPTABLE 0x4000 #define DEBUGSMC_MSG_Mode1Reset 2 +#define LINK_SPEED_MAX 3 static struct cmn2asic_msg_mapping smu_v14_0_2_message_map[SMU_MSG_MAX_COUNT] = { MSG_MAP(TestMessage, PPSMC_MSG_TestMessage, 1), @@ -221,7 +222,6 @@ static struct cmn2asic_mapping smu_v14_0_2_workload_map[PP_SMC_POWER_PROFILE_COU WORKLOAD_MAP(PP_SMC_POWER_PROFILE_WINDOW3D, WORKLOAD_PPLIB_WINDOW_3D_BIT), }; -#if 0 static const uint8_t smu_v14_0_2_throttler_map[] = { [THROTTLER_PPT0_BIT] = (SMU_THROTTLER_PPT0_BIT), [THROTTLER_PPT1_BIT] = (SMU_THROTTLER_PPT1_BIT), @@ -241,7 +241,6 @@ static const uint8_t smu_v14_0_2_throttler_map[] = { [THROTTLER_GFX_APCC_PLUS_BIT] = (SMU_THROTTLER_APCC_BIT), [THROTTLER_FIT_BIT] = (SMU_THROTTLER_FIT_BIT), }; -#endif static int smu_v14_0_2_get_allowed_feature_mask(struct smu_context *smu, @@ -1869,6 +1868,88 @@ static ssize_t smu_v14_0_2_get_ecc_info(struct smu_context *smu, return ret; } +static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu, + void **table) +{ + struct smu_table_context *smu_table = &smu->smu_table; + struct gpu_metrics_v1_3 *gpu_metrics = + (struct gpu_metrics_v1_3 *)smu_table->gpu_metrics_table; + SmuMetricsExternal_t metrics_ext; + SmuMetrics_t *metrics = &metrics_ext.SmuMetrics; + int ret = 0; + + ret = smu_cmn_get_metrics_table(smu, + &metrics_ext, + true); + if (ret) + return ret; + + smu_cmn_init_soft_gpu_metrics(gpu_metrics, 1, 3); + + gpu_metrics->temperature_edge = metrics->AvgTemperature[TEMP_EDGE]; + gpu_metrics->temperature_hotspot = metrics->AvgTemperature[TEMP_HOTSPOT]; + gpu_metrics->temperature_mem = metrics->AvgTemperature[TEMP_MEM]; + 
gpu_metrics->temperature_vrgfx = metrics->AvgTemperature[TEMP_VR_GFX]; + gpu_metrics->temperature_vrsoc = metrics->AvgTemperature[TEMP_VR_SOC]; + gpu_metrics->temperature_vrmem = max(metrics->AvgTemperature[TEMP_VR_MEM0], + metrics->AvgTemperature[TEMP_VR_MEM1]); + + gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity; + gpu_metrics->average_umc_activity = metrics->AverageUclkActivity; + gpu_metrics->average_mm_activity = max(metrics->Vcn0ActivityPercentage, + metrics->Vcn1ActivityPercentage); + + gpu_metrics->average_socket_power = metrics->AverageSocketPower; + gpu_metrics->energy_accumulator = metrics->EnergyAccumulator; + + if (metrics->AverageGfxActivity <= SMU_14_0_2_BUSY_THRESHOLD) + gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPostDs; + else + gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs; + + if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD) + gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs; + else + gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs; + + gpu_metrics->average_vclk0_frequency = metrics->AverageVclk0Frequency; + gpu_metrics->average_dclk0_frequency = metrics->AverageDclk0Frequency; + gpu_metrics->average_vclk1_frequency = metrics->AverageVclk1Frequency; + gpu_metrics->average_dclk1_frequency = metrics->AverageDclk1Frequency; + + gpu_metrics->current_gfxclk = gpu_metrics->average_gfxclk_frequency; + gpu_metrics->current_socclk = metrics->CurrClock[PPCLK_SOCCLK]; + gpu_metrics->current_uclk = metrics->CurrClock[PPCLK_UCLK]; + gpu_metrics->current_vclk0 = metrics->CurrClock[PPCLK_VCLK_0]; + gpu_metrics->current_dclk0 = metrics->CurrClock[PPCLK_DCLK_0]; + gpu_metrics->current_vclk1 = metrics->CurrClock[PPCLK_VCLK_0]; + gpu_metrics->current_dclk1 = metrics->CurrClock[PPCLK_DCLK_0]; + + gpu_metrics->throttle_status = + smu_v14_0_2_get_throttler_status(metrics); + gpu_metrics->indep_throttle_status = + smu_cmn_get_indep_throttler_status(gpu_metrics->throttle_status, + smu_v14_0_2_throttler_map); + + gpu_metrics->current_fan_speed = metrics->AvgFanRpm; + + gpu_metrics->pcie_link_width = metrics->PcieWidth; + if ((metrics->PcieRate - 1) > LINK_SPEED_MAX) + gpu_metrics->pcie_link_speed = pcie_gen_to_speed(1); + else + gpu_metrics->pcie_link_speed = pcie_gen_to_speed(metrics->PcieRate); + + gpu_metrics->system_clock_counter = ktime_get_boottime_ns(); + + gpu_metrics->voltage_gfx = metrics->AvgVoltage[SVI_PLANE_VDD_GFX]; + gpu_metrics->voltage_soc = metrics->AvgVoltage[SVI_PLANE_VDD_SOC]; + gpu_metrics->voltage_mem = metrics->AvgVoltage[SVI_PLANE_VDDIO_MEM]; + + *table = (void *)gpu_metrics; + + return sizeof(struct gpu_metrics_v1_3); +} + static const struct pptable_funcs smu_v14_0_2_ppt_funcs = { .get_allowed_feature_mask = smu_v14_0_2_get_allowed_feature_mask, .set_default_dpm_table = smu_v14_0_2_set_default_dpm_table, @@ -1905,6 +1986,7 @@ static const struct pptable_funcs smu_v14_0_2_ppt_funcs = { .enable_thermal_alert = smu_v14_0_enable_thermal_alert, .disable_thermal_alert = smu_v14_0_disable_thermal_alert, .notify_memory_pool_location = smu_v14_0_notify_memory_pool_location, + .get_gpu_metrics = smu_v14_0_2_get_gpu_metrics, .set_soft_freq_limited_range = smu_v14_0_set_soft_freq_limited_range, .init_pptable_microcode = smu_v14_0_init_pptable_microcode, .populate_umd_state_clk = smu_v14_0_2_populate_umd_state_clk, From f3572db3c049b4d32bb5ba77ad5305616c44c7c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 24 
Jul 2024 09:24:02 +0200 Subject: [PATCH 0123/1052] drm/amdgpu: fix contiguous handling for IB parsing v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise we won't get correct access to the IB. v2: keep setting AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS to avoid problems in the VRAM backend. Signed-off-by: Christian König Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3501 Fixes: e362b7c8f8c7 ("drm/amdgpu: Modify the contiguous flags behaviour") Reviewed-by: Alex Deucher Cc: stable@vger.kernel.org Tested-by: Dave Airlie Signed-off-by: Alex Deucher (cherry picked from commit fbfb5f0342253d92c4e446588c428a9d90c3f610) --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 916b6b8cf7d9..9aa952f258cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1778,7 +1778,7 @@ int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, struct ttm_operation_ctx ctx = { false, false }; struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo_va_mapping *mapping; - int r; + int i, r; addr /= AMDGPU_GPU_PAGE_SIZE; @@ -1793,13 +1793,13 @@ int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, if (dma_resv_locking_ctx((*bo)->tbo.base.resv) != &parser->exec.ticket) return -EINVAL; - if (!((*bo)->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)) { - (*bo)->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; - amdgpu_bo_placement_from_domain(*bo, (*bo)->allowed_domains); - r = ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, &ctx); - if (r) - return r; - } + (*bo)->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; + amdgpu_bo_placement_from_domain(*bo, (*bo)->allowed_domains); + for (i = 0; i < (*bo)->placement.num_placement; i++) + (*bo)->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; + r = ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, &ctx); + if (r) + return r; return amdgpu_ttm_alloc_gart(&(*bo)->tbo); } From 9038e25c80558d48ce33d6d8c168666164dc72e9 Mon Sep 17 00:00:00 2001 From: Michael Chen Date: Tue, 23 Jul 2024 17:45:23 -0400 Subject: [PATCH 0124/1052] drm/amdgpu: increase mes log buffer size for gfx12 MES firmware requires larger log buffer for gfx12. Allocate proper buffer respectively for gfx11 and gfx12. 
Signed-off-by: Michael Chen Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 739d0f3e1f36738d4cd84166784a8f7a58d69612) --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 6 +++--- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 2 ++ drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 8 ++++++-- drivers/gpu/drm/amd/include/mes_v11_api_def.h | 3 +++ drivers/gpu/drm/amd/include/mes_v12_api_def.h | 3 +++ 6 files changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index e499d6ba306b..dac88d2dd70d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -103,7 +103,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) if (!amdgpu_mes_log_enable) return 0; - r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, PAGE_SIZE, + r = amdgpu_bo_create_kernel(adev, adev->mes.event_log_size, PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, &adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, @@ -113,7 +113,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) return r; } - memset(adev->mes.event_log_cpu_addr, 0, PAGE_SIZE); + memset(adev->mes.event_log_cpu_addr, 0, adev->mes.event_log_size); return 0; @@ -1573,7 +1573,7 @@ static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr); seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4, - mem, AMDGPU_MES_LOG_BUFFER_SIZE, false); + mem, adev->mes.event_log_size, false); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index e11051271f71..2d659c612f03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -52,7 +52,6 @@ enum amdgpu_mes_priority_level { #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */ #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */ -#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximu log buffer size for MES */ struct amdgpu_mes_funcs; @@ -135,8 +134,9 @@ struct amdgpu_mes { unsigned long *doorbell_bitmap; /* MES event log buffer */ - struct amdgpu_bo *event_log_gpu_obj; - uint64_t event_log_gpu_addr; + uint32_t event_log_size; + struct amdgpu_bo *event_log_gpu_obj; + uint64_t event_log_gpu_addr; void *event_log_cpu_addr; /* ip specific functions */ diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 8ce51b9236c1..f9343642ae7e 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1163,6 +1163,8 @@ static int mes_v11_0_sw_init(void *handle) adev->mes.kiq_hw_init = &mes_v11_0_kiq_hw_init; adev->mes.kiq_hw_fini = &mes_v11_0_kiq_hw_fini; + adev->mes.event_log_size = AMDGPU_MES_LOG_BUFFER_SIZE; + r = amdgpu_mes_init(adev); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index c9f74231ad59..0713bc3eb263 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -551,8 +551,10 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.oversubscription_timer = 50; mes_set_hw_res_pkt.unmapped_doorbell_handling = 1; - mes_set_hw_res_pkt.enable_mes_event_int_logging = 0; - mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr; + if (amdgpu_mes_log_enable) { + mes_set_hw_res_pkt.enable_mes_event_int_logging 
= 1; + mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr; + } return mes_v12_0_submit_pkt_and_poll_completion(mes, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), @@ -1237,6 +1239,8 @@ static int mes_v12_0_sw_init(void *handle) adev->mes.kiq_hw_init = &mes_v12_0_kiq_hw_init; adev->mes.kiq_hw_fini = &mes_v12_0_kiq_hw_fini; + adev->mes.event_log_size = AMDGPU_MES_LOG_BUFFER_SIZE; + r = amdgpu_mes_init(adev); if (r) return r; diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h index b72d5d362251..21ceafce1f9b 100644 --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h @@ -28,6 +28,9 @@ #define MES_API_VERSION 1 +/* Maximum log buffer size for MES. Needs to be updated if MES expands MES_EVT_INTR_HIST_LOG */ +#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 + /* Driver submits one API(cmd) as a single Frame and this command size is same * for all API to ease the debugging and parsing of ring buffer. */ diff --git a/drivers/gpu/drm/amd/include/mes_v12_api_def.h b/drivers/gpu/drm/amd/include/mes_v12_api_def.h index ffd67c6ed9b3..4cf2c9f30b3d 100644 --- a/drivers/gpu/drm/amd/include/mes_v12_api_def.h +++ b/drivers/gpu/drm/amd/include/mes_v12_api_def.h @@ -28,6 +28,9 @@ #define MES_API_VERSION 0x14 +/* Maximum log buffer size for MES. Needs to be updated if MES expands MES_EVT_INTR_HIST_LOG_12 */ +#define AMDGPU_MES_LOG_BUFFER_SIZE 0xC000 + /* Driver submits one API(cmd) as a single Frame and this command size is same for all API * to ease the debugging and parsing of ring buffer. */ From d2860084ecca456ce78b251011f7def8d9136dcc Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 25 Jul 2024 17:30:37 -0400 Subject: [PATCH 0125/1052] drm/amdgpu: Fix APU handling in amdgpu_pm_load_smu_firmware() We only need to skip this on modern APUs. It's required on older APUs as it's where start_smu gets called from. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3502 Fixes: 064d92436b69 ("drm/amd/pm: avoid to load smu firmware for APUs") Reviewed-by: Tim Huang Signed-off-by: Alex Deucher Cc: Tim Huang (cherry picked from commit 608d886c978cd5f3d8650630568d96c231845227) --- drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index a1b8a82d77cf..8b7d6ed7e2ed 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -618,7 +618,8 @@ int amdgpu_pm_load_smu_firmware(struct amdgpu_device *adev, uint32_t *smu_versio const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; int r = 0; - if (!pp_funcs || !pp_funcs->load_firmware || adev->flags & AMD_IS_APU) + if (!pp_funcs || !pp_funcs->load_firmware || + (is_support_sw_smu(adev) && (adev->flags & AMD_IS_APU))) return 0; mutex_lock(&adev->pm.mutex); From 5ce86c6c861352c9346ebb5c96ed70cb67414aa3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 27 Jul 2024 23:02:59 +0900 Subject: [PATCH 0126/1052] rust: suppress error messages from CONFIG_{RUSTC,BINDGEN}_VERSION_TEXT While this is a somewhat unusual case, I encountered odd error messages when I ran Kconfig in a foreign architecture chroot. $ make allmodconfig sh: 1: rustc: not found sh: 1: bindgen: not found # # configuration written to .config # The successful execution of 'command -v rustc' does not necessarily mean that 'rustc --version' will succeed. 
$ sh -c 'command -v rustc' /home/masahiro/.cargo/bin/rustc $ sh -c 'rustc --version' sh: 1: rustc: not found Here, 'rustc' is built for x86, and I ran it in an arm64 system. The current code: command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version || echo n can be turned into: command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version 2>/dev/null || echo n However, I did not understand the necessity of 'command -v $(RUSTC)'. I simplified it to: $(RUSTC) --version 2>/dev/null || echo n Fixes: 2f7ab1267dc9 ("Kbuild: add Rust support") Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240727140302.1806011-1-masahiroy@kernel.org [ Rebased on top of v6.11-rc1. - Miguel ] Signed-off-by: Miguel Ojeda --- init/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index a465ea9525bd..cddeec9fcb72 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1919,7 +1919,7 @@ config RUST config RUSTC_VERSION_TEXT string depends on RUST - default $(shell,command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version || echo n) + default $(shell,$(RUSTC) --version 2>/dev/null || echo n) config BINDGEN_VERSION_TEXT string @@ -1927,7 +1927,7 @@ config BINDGEN_VERSION_TEXT # The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 # (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when # the minimum version is upgraded past that (0.69.1 already fixed the issue). - default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version workaround-for-0.69.0 || echo n) + default $(shell,$(BINDGEN) --version workaround-for-0.69.0 2>/dev/null || echo n) # # Place an empty function call at each tracepoint site. Can be From aacf93e87f0d808ef46e621aa56caea336b4433c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 27 Jul 2024 23:03:00 +0900 Subject: [PATCH 0127/1052] rust: fix the default format for CONFIG_{RUSTC,BINDGEN}_VERSION_TEXT Another oddity in these config entries is their default value can fall back to 'n', which is a value for bool or tristate symbols. The '|| echo n' is an incorrect workaround to avoid the syntax error. This is not a big deal, as the entry is hidden by 'depends on RUST' in situations where '$(RUSTC) --version' or '$(BINDGEN) --version' fails. Anyway, it looks odd. The default of a string type symbol should be a double-quoted string literal. Turn it into an empty string when the version command fails. Fixes: 2f7ab1267dc9 ("Kbuild: add Rust support") Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240727140302.1806011-2-masahiroy@kernel.org [ Rebased on top of v6.11-rc1. - Miguel ] Signed-off-by: Miguel Ojeda --- init/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index cddeec9fcb72..3ada33b1d681 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1919,7 +1919,7 @@ config RUST config RUSTC_VERSION_TEXT string depends on RUST - default $(shell,$(RUSTC) --version 2>/dev/null || echo n) + default "$(shell,$(RUSTC) --version 2>/dev/null)" config BINDGEN_VERSION_TEXT string @@ -1927,7 +1927,7 @@ config BINDGEN_VERSION_TEXT # The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 # (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when # the minimum version is upgraded past that (0.69.1 already fixed the issue). 
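The "only expands before an open parenthesis" property is what lets the enum conversions coexist with the generic macros. A small standalone demonstration (simplified macros without the kernel's type checking):

    #include <stdio.h>

    /* simplified stand-ins for the minmax.h macros */
    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* an index enum like the adt7475 one: the names do not clash, because a
     * function-like macro is only expanded when followed by '(' */
    enum sysfs_id { INPUT = 0, MIN = 1, MAX = 2, CONTROL = 3 };

    int main(void)
    {
        int idx = MAX;              /* the enumerator: 2 */
        int hi  = MAX(3, 7);        /* the macro: 7 */

        printf("%d %d %d\n", idx, hi, MIN(idx, hi));
        return 0;
    }
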
- default $(shell,$(BINDGEN) --version workaround-for-0.69.0 2>/dev/null || echo n) + default "$(shell,$(BINDGEN) --version workaround-for-0.69.0 2>/dev/null)" # # Place an empty function call at each tracepoint site. Can be From 1a251f52cfdc417c84411a056bc142cbd77baef4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 15:49:18 -0700 Subject: [PATCH 0128/1052] minmax: make generic MIN() and MAX() macros available everywhere This just standardizes the use of MIN() and MAX() macros, with the very traditional semantics. The goal is to use these for C constant expressions and for top-level / static initializers, and so be able to simplify the min()/max() macros. These macro names were used by various kernel code - they are very traditional, after all - and all such users have been fixed up, with a few different approaches: - trivial duplicated macro definitions have been removed Note that 'trivial' here means that it's obviously kernel code that already included all the major kernel headers, and thus gets the new generic MIN/MAX macros automatically. - non-trivial duplicated macro definitions are guarded with #ifndef This is the "yes, they define their own versions, but no, the include situation is not entirely obvious, and maybe they don't get the generic version automatically" case. - strange use case #1 A couple of drivers decided that the way they want to describe their versioning is with #define MAJ 1 #define MIN 2 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) which adds zero value and I just did my Alexander the Great impersonation, and rewrote that pointless Gordian knot as #define DRV_VERSION "1.2" instead. - strange use case #2 A couple of drivers thought that it's a good idea to have a random 'MIN' or 'MAX' define for a value or index into a table, rather than the traditional macro that takes arguments. These values were re-written as C enum's instead. The new function-line macros only expand when followed by an open parenthesis, and thus don't clash with enum use. Happily, there weren't really all that many of these cases, and a lot of users already had the pattern of using '#ifndef' guarding (or in one case just using '#undef MIN') before defining their own private version that does the same thing. I left such cases alone. 
Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- arch/um/drivers/mconsole_user.c | 2 ++ drivers/edac/skx_common.h | 1 - .../gpu/drm/amd/display/dc/core/dc_stream.c | 2 ++ .../drm/amd/display/modules/hdcp/hdcp_ddc.c | 2 ++ .../drm/amd/pm/powerplay/hwmgr/ppevvmath.h | 14 +++++++---- drivers/gpu/drm/radeon/evergreen_cs.c | 2 ++ drivers/hwmon/adt7475.c | 24 +++++++++---------- drivers/media/dvb-frontends/stv0367_priv.h | 3 +++ drivers/net/fjes/fjes_main.c | 4 +--- drivers/nfc/pn544/i2c.c | 2 -- drivers/platform/x86/sony-laptop.c | 1 - drivers/scsi/isci/init.c | 6 +---- .../pci/hive_isp_css_include/math_support.h | 5 ---- include/linux/minmax.h | 2 ++ kernel/trace/preemptirq_delay_test.c | 2 -- lib/btree.c | 1 - lib/decompress_unlzma.c | 2 ++ mm/zsmalloc.c | 2 -- tools/testing/selftests/mm/mremap_test.c | 2 ++ tools/testing/selftests/seccomp/seccomp_bpf.c | 2 ++ 20 files changed, 43 insertions(+), 38 deletions(-) diff --git a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c index e24298a734be..a04cd13c6315 100644 --- a/arch/um/drivers/mconsole_user.c +++ b/arch/um/drivers/mconsole_user.c @@ -71,7 +71,9 @@ static struct mconsole_command *mconsole_parse(struct mc_request *req) return NULL; } +#ifndef MIN #define MIN(a,b) ((a)<(b) ? (a):(b)) +#endif #define STRINGX(x) #x #define STRING(x) STRINGX(x) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 11faf1db4fa4..473421ba7a18 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -45,7 +45,6 @@ #define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) #define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) #define NUM_CHANNELS MAX(SKX_NUM_CHANNELS, I10NM_NUM_CHANNELS) #define NUM_DIMMS MAX(SKX_NUM_DIMMS, I10NM_NUM_DIMMS) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c index be2638c763d7..9a406d74c0dd 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c @@ -35,8 +35,10 @@ #include "dc_stream_priv.h" #define DC_LOGGER dc->ctx->logger +#ifndef MIN #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) #define MAX(x, y) ((x > y) ? x : y) +#endif /******************************************************************************* * Private functions diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c index 7ecf76aea950..6e064e6ae949 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c @@ -25,7 +25,9 @@ #include "hdcp.h" +#ifndef MIN #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#endif #define HDCP_I2C_ADDR 0x3a /* 0x74 >> 1*/ #define KSV_READ_SIZE 0xf /* 0x6803b - 0x6802c */ #define HDCP_MAX_AUX_TRANSACTION_SIZE 16 diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppevvmath.h b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppevvmath.h index 6f54c410c2f9..409aeec6baa9 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppevvmath.h +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppevvmath.h @@ -22,12 +22,18 @@ */ #include -#define SHIFT_AMOUNT 16 /* We multiply all original integers with 2^SHIFT_AMOUNT to get the fInt representation */ +enum ppevvmath_constants { + /* We multiply all original integers with 2^SHIFT_AMOUNT to get the fInt representation */ + SHIFT_AMOUNT = 16, -#define PRECISION 5 /* Change this value to change the number of decimal places in the final output - 5 is a good default */ + /* Change this value to change the number of decimal places in the final output - 5 is a good default */ + PRECISION = 5, -#define SHIFTED_2 (2 << SHIFT_AMOUNT) -#define MAX (1 << (SHIFT_AMOUNT - 1)) - 1 /* 32767 - Might change in the future */ + SHIFTED_2 = (2 << SHIFT_AMOUNT), + + /* 32767 - Might change in the future */ + MAX = (1 << (SHIFT_AMOUNT - 1)) - 1, +}; /* ------------------------------------------------------------------------------- * NEW TYPE - fINT diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c index 1fe6e0d883c7..e5577d2a19ef 100644 --- a/drivers/gpu/drm/radeon/evergreen_cs.c +++ b/drivers/gpu/drm/radeon/evergreen_cs.c @@ -33,8 +33,10 @@ #include "evergreen_reg_safe.h" #include "cayman_reg_safe.h" +#ifndef MIN #define MAX(a, b) (((a) > (b)) ? (a) : (b)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#endif #define REG_SAFE_BM_SIZE ARRAY_SIZE(evergreen_reg_safe_bm) diff --git a/drivers/hwmon/adt7475.c b/drivers/hwmon/adt7475.c index bc186c61a2c0..382a2bb9168a 100644 --- a/drivers/hwmon/adt7475.c +++ b/drivers/hwmon/adt7475.c @@ -22,23 +22,23 @@ #include /* Indexes for the sysfs hooks */ - -#define INPUT 0 -#define MIN 1 -#define MAX 2 -#define CONTROL 3 -#define OFFSET 3 -#define AUTOMIN 4 -#define THERM 5 -#define HYSTERSIS 6 - +enum adt_sysfs_id { + INPUT = 0, + MIN = 1, + MAX = 2, + CONTROL = 3, + OFFSET = 3, // Dup + AUTOMIN = 4, + THERM = 5, + HYSTERSIS = 6, /* * These are unique identifiers for the sysfs functions - unlike the * numbers above, these are not also indexes into an array */ + ALARM = 9, + FAULT = 10, +}; -#define ALARM 9 -#define FAULT 10 /* 7475 Common Registers */ diff --git a/drivers/media/dvb-frontends/stv0367_priv.h b/drivers/media/dvb-frontends/stv0367_priv.h index 617f605947b2..7f056d1cce82 100644 --- a/drivers/media/dvb-frontends/stv0367_priv.h +++ b/drivers/media/dvb-frontends/stv0367_priv.h @@ -25,8 +25,11 @@ #endif /* MACRO definitions */ +#ifndef MIN #define MAX(X, Y) ((X) >= (Y) ? (X) : (Y)) #define MIN(X, Y) ((X) <= (Y) ? (X) : (Y)) +#endif + #define INRANGE(X, Y, Z) \ ((((X) <= (Y)) && ((Y) <= (Z))) || \ (((Z) <= (Y)) && ((Y) <= (X))) ? 1 : 0) diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index b3ddc9a629d9..fad5b6564464 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -14,9 +14,7 @@ #include "fjes.h" #include "fjes_trace.h" -#define MAJ 1 -#define MIN 2 -#define DRV_VERSION __stringify(MAJ) "." 
__stringify(MIN) +#define DRV_VERSION "1.2" #define DRV_NAME "fjes" char fjes_driver_name[] = DRV_NAME; char fjes_driver_version[] = DRV_VERSION; diff --git a/drivers/nfc/pn544/i2c.c b/drivers/nfc/pn544/i2c.c index 9fe664960b38..e2a6575b9ff7 100644 --- a/drivers/nfc/pn544/i2c.c +++ b/drivers/nfc/pn544/i2c.c @@ -126,8 +126,6 @@ struct pn544_i2c_fw_secure_blob { #define PN544_FW_CMD_RESULT_COMMAND_REJECTED 0xE0 #define PN544_FW_CMD_RESULT_CHUNK_ERROR 0xE6 -#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) - #define PN544_FW_WRITE_BUFFER_MAX_LEN 0x9f7 #define PN544_FW_I2C_MAX_PAYLOAD PN544_HCI_I2C_LLC_MAX_SIZE #define PN544_FW_I2C_WRITE_FRAME_HEADER_LEN 8 diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 3e94fdd1ea52..3197aaa69da7 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -757,7 +757,6 @@ static union acpi_object *__call_snc_method(acpi_handle handle, char *method, return result; } -#define MIN(a, b) (a > b ? b : a) static int sony_nc_buffer_call(acpi_handle handle, char *name, u64 *value, void *buffer, size_t buflen) { diff --git a/drivers/scsi/isci/init.c b/drivers/scsi/isci/init.c index d31884f82f2a..73085d2f5c43 100644 --- a/drivers/scsi/isci/init.c +++ b/drivers/scsi/isci/init.c @@ -65,11 +65,7 @@ #include "task.h" #include "probe_roms.h" -#define MAJ 1 -#define MIN 2 -#define BUILD 0 -#define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) "." \ - __stringify(BUILD) +#define DRV_VERSION "1.2.0" MODULE_VERSION(DRV_VERSION); diff --git a/drivers/staging/media/atomisp/pci/hive_isp_css_include/math_support.h b/drivers/staging/media/atomisp/pci/hive_isp_css_include/math_support.h index 7349943bba2b..160c496784b7 100644 --- a/drivers/staging/media/atomisp/pci/hive_isp_css_include/math_support.h +++ b/drivers/staging/media/atomisp/pci/hive_isp_css_include/math_support.h @@ -22,11 +22,6 @@ /* force a value to a lower even value */ #define EVEN_FLOOR(x) ((x) & ~1) -/* for preprocessor and array sizing use MIN and MAX - otherwise use min and max */ -#define MAX(a, b) (((a) > (b)) ? (a) : (b)) -#define MIN(a, b) (((a) < (b)) ? (a) : (b)) - #define CEIL_DIV(a, b) (((b) != 0) ? ((a) + (b) - 1) / (b) : 0) #define CEIL_MUL(a, b) (CEIL_DIV(a, b) * (b)) #define CEIL_MUL2(a, b) (((a) + (b) - 1) & ~((b) - 1)) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 9c2848abc804..fc384714da45 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -277,6 +277,8 @@ static inline bool in_range32(u32 val, u32 start, u32 len) * Use these carefully: no type checking, and uses the arguments * multiple times. Use for obvious constants only. */ +#define MIN(a,b) __cmp(min,a,b) +#define MAX(a,b) __cmp(max,a,b) #define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b)) #define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b)) diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index cb0871fbdb07..314ffc143039 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on"); static struct completion done; -#define MIN(x, y) ((x) < (y) ? (x) : (y)) - static void busy_wait(ulong time) { u64 start, end; diff --git a/lib/btree.c b/lib/btree.c index 49420cae3a83..bb81d3393ac5 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -43,7 +43,6 @@ #include #include -#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) #define NODESIZE MAX(L1_CACHE_BYTES, 128) struct btree_geo { diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c index 20a858031f12..9d34d35908da 100644 --- a/lib/decompress_unlzma.c +++ b/lib/decompress_unlzma.c @@ -37,7 +37,9 @@ #include +#ifndef MIN #define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#endif static long long INIT read_int(unsigned char *ptr, int size) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d6581ab7c07..2d3163e4da96 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -120,8 +120,6 @@ #define CLASS_BITS 8 #define MAGIC_VAL_BITS 8 -#define MAX(a, b) ((a) >= (b) ? (a) : (b)) - #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL)) /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 1b03bcfaefdf..5a3a9bcba640 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -22,8 +22,10 @@ #define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ #define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ +#ifndef MIN #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) #define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) +#endif #define SIZE_MB(m) ((size_t)m * (1024 * 1024)) #define SIZE_KB(k) ((size_t)k * 1024) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index e3f97f90d8db..8c3a73461475 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -60,7 +60,9 @@ #define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__) #endif +#ifndef MIN #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#endif #ifndef PR_SET_PTRACER # define PR_SET_PTRACER 0x59616d61 From 89fb4dfacbb33750e0243848f4c54d73e558ba9a Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Sat, 20 Jul 2024 22:57:05 +0200 Subject: [PATCH 0129/1052] dt-bindings: ata: rockchip-dwc-ahci: add missing power-domains The Rockchip variant of the dwc-ahci controller does have and need power- domains to work, though the binding does not mention them, making dtccheck quite unhappy: DTC_CHK arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dtb /home/devel/hstuebner/00_git-repos/linux-rockchip/_build-arm64/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dtb: sata@fc800000: Unevaluated properties are not allowed ('power-domains' was unexpected) from schema $id: http://devicetree.org/schemas/ata/rockchip,dwc-ahci.yaml# Fix that by adding the missing power-domain property to the binding. 
Fixes: 85b0e13b19c2 ("dt-bindings: ata: dwc-ahci: add Rockchip RK3588") Signed-off-by: Heiko Stuebner Acked-by: Krzysztof Kozlowski Signed-off-by: Damien Le Moal --- Documentation/devicetree/bindings/ata/rockchip,dwc-ahci.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/ata/rockchip,dwc-ahci.yaml b/Documentation/devicetree/bindings/ata/rockchip,dwc-ahci.yaml index b5e5767d8698..13eaa8d9a16e 100644 --- a/Documentation/devicetree/bindings/ata/rockchip,dwc-ahci.yaml +++ b/Documentation/devicetree/bindings/ata/rockchip,dwc-ahci.yaml @@ -35,6 +35,9 @@ properties: ports-implemented: const: 1 + power-domains: + maxItems: 1 + sata-port@0: $ref: /schemas/ata/snps,dwc-ahci-common.yaml#/$defs/dwc-ahci-port From 9f499b8c791d2983c0a31a543c51d1b2f15e8755 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 17:06:20 -0700 Subject: [PATCH 0130/1052] minmax: scsi: fix mis-use of 'clamp()' in sr.c While working on simplifying the minmax functions, and avoiding excessive macro expansion, it turns out that the sr.c use of the 'clamp()' macro has the arguments the wrong way around. The clamp logic is val = clamp(in, low, high); and it returns the input clamped to the low/high limits. But sr.c ddid speed = clamp(0, speed, 0xffff / 177); which clamps the value '0' to the range '[speed, 0xffff / 177]' and ends up being nonsensical. Happily, I don't think anybody ever cared. Fixes: 9fad9d560af5 ("scsi: sr: Fix unintentional arithmetic wraparound") Cc: Justin Stitt Cc: Kees Cook Cc: Martin K. Petersen Signed-off-by: Linus Torvalds --- drivers/scsi/sr_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index a0d2556a27bb..089653018d32 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -431,7 +431,7 @@ int sr_select_speed(struct cdrom_device_info *cdi, unsigned long speed) struct packet_command cgc; /* avoid exceeding the max speed or overflowing integer bounds */ - speed = clamp(0, speed, 0xffff / 177); + speed = clamp(speed, 0, 0xffff / 177); if (speed == 0) speed = 0xffff; /* set to max */ From cb04e8b1d2f24c4c2c92f7b7529031fc35a16fed Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 17:32:05 -0700 Subject: [PATCH 0131/1052] minmax: don't use max() in situations that want a C constant expression We only had a couple of array[] declarations, and changing them to just use 'MAX()' instead of 'max()' fixes the issue. This will allow us to simplify our min/max macros enormously, since they can now unconditionally use temporary variables to avoid using the argument values multiple times. 
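For illustration, a minimal stand-alone sketch (not part of this patch, and using the classic ternary form of MAX()) of why the array declarations above have to remain C constant expressions:

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    struct sample { long id; char tag[24]; };

    /* OK: the ternary form is an integer constant expression, so it can
     * size an array at file scope or inside a struct. */
    static char namebuf[MAX(sizeof(struct sample), 64)];

    /* A temporaries-based max() expands to a statement expression along
     * the lines of ({ __auto_type _a = (a); __auto_type _b = (b); ... }),
     * which is never a C constant expression, so it cannot size a
     * declaration like the one above. */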
Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 2 +- drivers/input/touchscreen/cyttsp4_core.c | 2 +- drivers/irqchip/irq-sun6i-r.c | 2 +- drivers/net/can/usb/etas_es58x/es58x_devlink.c | 2 +- fs/btrfs/tree-checker.c | 2 +- lib/vsprintf.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index 88eefef05fae..91ad434bcdae 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -794,7 +794,7 @@ static const char *smu_get_feature_name(struct smu_context *smu, size_t smu_cmn_get_pp_feature_mask(struct smu_context *smu, char *buf) { - int8_t sort_feature[max(SMU_FEATURE_COUNT, SMU_FEATURE_MAX)]; + int8_t sort_feature[MAX(SMU_FEATURE_COUNT, SMU_FEATURE_MAX)]; uint64_t feature_mask; int i, feature_index; uint32_t count = 0; diff --git a/drivers/input/touchscreen/cyttsp4_core.c b/drivers/input/touchscreen/cyttsp4_core.c index 7cb26929dc73..9dc25eb2be44 100644 --- a/drivers/input/touchscreen/cyttsp4_core.c +++ b/drivers/input/touchscreen/cyttsp4_core.c @@ -871,7 +871,7 @@ static void cyttsp4_get_mt_touches(struct cyttsp4_mt_data *md, int num_cur_tch) struct cyttsp4_touch tch; int sig; int i, j, t = 0; - int ids[max(CY_TMA1036_MAX_TCH, CY_TMA4XX_MAX_TCH)]; + int ids[MAX(CY_TMA1036_MAX_TCH, CY_TMA4XX_MAX_TCH)]; memset(ids, 0, si->si_ofs.tch_abs[CY_TCH_T].max * sizeof(int)); for (i = 0; i < num_cur_tch; i++) { diff --git a/drivers/irqchip/irq-sun6i-r.c b/drivers/irqchip/irq-sun6i-r.c index a01e44049415..99958d470d62 100644 --- a/drivers/irqchip/irq-sun6i-r.c +++ b/drivers/irqchip/irq-sun6i-r.c @@ -270,7 +270,7 @@ static const struct irq_domain_ops sun6i_r_intc_domain_ops = { static int sun6i_r_intc_suspend(void) { - u32 buf[BITS_TO_U32(max(SUN6I_NR_TOP_LEVEL_IRQS, SUN6I_NR_MUX_BITS))]; + u32 buf[BITS_TO_U32(MAX(SUN6I_NR_TOP_LEVEL_IRQS, SUN6I_NR_MUX_BITS))]; int i; /* Wake IRQs are enabled during system sleep and shutdown. 
*/ diff --git a/drivers/net/can/usb/etas_es58x/es58x_devlink.c b/drivers/net/can/usb/etas_es58x/es58x_devlink.c index 635edeb8f68c..eee20839d96f 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_devlink.c +++ b/drivers/net/can/usb/etas_es58x/es58x_devlink.c @@ -215,7 +215,7 @@ static int es58x_devlink_info_get(struct devlink *devlink, struct es58x_sw_version *fw_ver = &es58x_dev->firmware_version; struct es58x_sw_version *bl_ver = &es58x_dev->bootloader_version; struct es58x_hw_revision *hw_rev = &es58x_dev->hardware_revision; - char buf[max(sizeof("xx.xx.xx"), sizeof("axxx/xxx"))]; + char buf[MAX(sizeof("xx.xx.xx"), sizeof("axxx/xxx"))]; int ret = 0; if (es58x_sw_version_is_valid(fw_ver)) { diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 6388786fd8b5..89e009a4c3a3 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -634,7 +634,7 @@ static int check_dir_item(struct extent_buffer *leaf, */ if (key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_XATTR_ITEM_KEY) { - char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; + char namebuf[MAX(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; read_extent_buffer(leaf, namebuf, (unsigned long)(di + 1), name_len); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index cdd4e2314bfc..2d71b1115916 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1080,7 +1080,7 @@ char *resource_string(char *buf, char *end, struct resource *res, #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") - char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, + char sym[MAX(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)]; char *p = sym, *pend = sym + sizeof(sym); From dc1c8034e31b14a2e5e212104ec508aec44ce1b9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 20:24:12 -0700 Subject: [PATCH 0132/1052] minmax: simplify min()/max()/clamp() implementation Now that we no longer have any C constant expression contexts (ie array size declarations or static initializers) that use min() or max(), we can simpify the implementation by not having to worry about the result staying as a C constant expression. So now we can unconditionally just use temporary variables of the right type, and get rid of the excessive expansion that used to come from the use of __builtin_choose_expr(__is_constexpr(...), .. to pick the specialized code for constant expressions. Another expansion simplification is to pass the temporary variables (in addition to the original expression) to our __types_ok() macro. That may superficially look like it complicates the macro, but when we only want the type of the expression, expanding the temporary variable names is much simpler and smaller than expanding the potentially complicated original expression. As a result, on my machine, doing a $ time make drivers/staging/media/atomisp/pci/isp/kernels/ynr/ynr_1.0/ia_css_ynr.host.i goes from real 0m16.621s user 0m15.360s sys 0m1.221s to real 0m2.532s user 0m2.091s sys 0m0.452s because the token expansion goes down dramatically. In particular, the longest line expansion (which was line 71 of that 'ia_css_ynr.host.c' file) shrinks from 23,338kB (yes, 23MB for one single line) to "just" 1,444kB (now "only" 1.4MB). And yes, that line is still the line from hell, because it's doing multiple levels of "min()/max()" expansion thanks to some of them being hidden inside the uDIGIT_FITTING() macro. 
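For illustration, a hedged sketch of the shape the simplified macros expand to (identifiers shortened here; the real implementation in the diff below uses __UNIQUE_ID() and keeps the signedness assert):

    #define sketch_min(x, y) ({		\
    	__auto_type _ux = (x);		\
    	__auto_type _uy = (y);		\
    	_ux < _uy ? _ux : _uy;		\
    })

    static inline int sketch_min_demo(int a, int b)
    {
    	return sketch_min(a, b);	/* each argument evaluated exactly once */
    }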
Lorenzo has a nice cleanup patch that makes that driver use inline functions instead of macros for sDIGIT_FITTING() and uDIGIT_FITTING(), which will fix that line once and for all, but the 16-fold reduction in this case does show why we need to simplify these helpers. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index fc384714da45..e3e4353df983 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -35,10 +35,10 @@ #define __is_noneg_int(x) \ (__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0) -#define __types_ok(x, y) \ - (__is_signed(x) == __is_signed(y) || \ - __is_signed((x) + 0) == __is_signed((y) + 0) || \ - __is_noneg_int(x) || __is_noneg_int(y)) +#define __types_ok(x, y, ux, uy) \ + (__is_signed(ux) == __is_signed(uy) || \ + __is_signed((ux) + 0) == __is_signed((uy) + 0) || \ + __is_noneg_int(x) || __is_noneg_int(y)) #define __cmp_op_min < #define __cmp_op_max > @@ -51,34 +51,31 @@ #define __cmp_once(op, type, x, y) \ __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) -#define __careful_cmp_once(op, x, y) ({ \ - static_assert(__types_ok(x, y), \ +#define __careful_cmp_once(op, x, y, ux, uy) ({ \ + __auto_type ux = (x); __auto_type uy = (y); \ + static_assert(__types_ok(x, y, ux, uy), \ #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ - __cmp_once(op, __auto_type, x, y); }) + __cmp(op, ux, uy); }) -#define __careful_cmp(op, x, y) \ - __builtin_choose_expr(__is_constexpr((x) - (y)), \ - __cmp(op, x, y), __careful_cmp_once(op, x, y)) +#define __careful_cmp(op, x, y) \ + __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) -#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \ - typeof(val) unique_val = (val); \ - typeof(lo) unique_lo = (lo); \ - typeof(hi) unique_hi = (hi); \ +#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ + __auto_type uval = (val); \ + __auto_type ulo = (lo); \ + __auto_type uhi = (hi); \ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error"); \ - static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error"); \ - __clamp(unique_val, unique_lo, unique_hi); }) + static_assert(__types_ok(uval, lo, uval, ulo), "clamp() 'lo' signedness error"); \ + static_assert(__types_ok(uval, hi, uval, uhi), "clamp() 'hi' signedness error"); \ + __clamp(uval, ulo, uhi); }) -#define __careful_clamp(val, lo, hi) ({ \ - __builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)), \ - __clamp(val, lo, hi), \ - __clamp_once(val, lo, hi, __UNIQUE_ID(__val), \ - __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); }) +#define __careful_clamp(val, lo, hi) \ + __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) /** * min - return minimum of two values of the same or compatible types From 80d3d33cdf4b0494b465087eb34d2e29d86e290e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 23 Jul 2024 22:11:55 -0700 Subject: [PATCH 0133/1052] xfs: fix a memory leak kmemleak reported that we don't free the parent pointer names here if we found corruption. 
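The change below only redirects a goto, so as context, a hedged sketch of the unwind pattern involved (the size, the corrupt flag and the function name are made up; only the label ordering mirrors the real scrub code):

    #include <linux/slab.h>

    static int scrub_sketch(size_t len, int corrupt)
    {
    	void *pp, *names;
    	int error = 0;

    	pp = kmalloc(len, GFP_KERNEL);
    	if (!pp)
    		return -ENOMEM;
    	names = kmalloc(len, GFP_KERNEL);
    	if (!names) {
    		error = -ENOMEM;
    		goto out_pp;
    	}

    	if (corrupt)			/* stand-in for the real corruption check */
    		goto out_names;		/* not out_pp: that would leak names */

    	/* ... the actual parent pointer checks would run here ... */

    out_names:
    	kfree(names);
    out_pp:
    	kfree(pp);
    	return error;
    }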
Fixes: 0d29a20fbdba8 ("xfs: scrub parent pointers") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/parent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 733c410a2279..91e7b51ce068 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -799,7 +799,7 @@ xchk_parent_pptr( } if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out_pp; + goto out_names; /* * Complain if the number of parent pointers doesn't match the link From 39c1ddb064fd38e28571c853f067b134d17cffb2 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 23 Jul 2024 12:26:10 -0500 Subject: [PATCH 0134/1052] xfs: allow SECURE namespace xattrs to use reserved block pool We got a report from the podman folks that selinux relabels that happen as part of their process were returning ENOSPC when the filesystem is completely full. This is because xattr changes reserve about 15 blocks for the worst case, but the common case is for selinux contexts to be the sole, in-inode xattr and consume no blocks. We already allow reserved space consumption for XFS_ATTR_ROOT for things such as ACLs, and SECURE namespace attributes are not so very different, so allow them to use the reserved space as well. Code-comment-by: Dave Chinner Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R V2: Remove local variable, add comment. V3: Add Dave's preferred comment V4: Spelling and comment beautification --- fs/xfs/xfs_xattr.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index ab3d22f662f2..eaf849260bd6 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -110,7 +110,24 @@ xfs_attr_change( args->whichfork = XFS_ATTR_FORK; xfs_attr_sethash(args); - return xfs_attr_set(args, op, args->attr_filter & XFS_ATTR_ROOT); + /* + * Some xattrs must be resistant to allocation failure at ENOSPC, e.g. + * creating an inode with ACLs or security attributes requires the + * allocation of the xattr holding that information to succeed. Hence + * we allow xattrs in the VFS TRUSTED, SYSTEM, POSIX_ACL and SECURITY + * (LSM xattr) namespaces to dip into the reserve block pool to allow + * manipulation of these xattrs when at ENOSPC. These VFS xattr + * namespaces translate to the XFS_ATTR_ROOT and XFS_ATTR_SECURE on-disk + * namespaces. + * + * For most of these cases, these special xattrs will fit in the inode + * itself and so consume no extra space or only require temporary extra + * space while an overwrite is being made. Hence the use of the reserved + * pool is largely to avoid the worst case reservation from preventing + * the xattr from being created at ENOSPC. + */ + return xfs_attr_set(args, op, + args->attr_filter & (XFS_ATTR_ROOT | XFS_ATTR_SECURE)); } From 19ebc8f84ea12e18dd6c8d3ecaf87bcf4666eee1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 10 Jul 2024 22:43:53 -0700 Subject: [PATCH 0135/1052] xfs: fix file_path handling in tracepoints Since file_path() takes the output buffer as one of its arguments, we might as well have it format directly into the tracepoint's char array instead of wasting stack space. 
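A hedged reminder of the calling convention the change relies on (illustrative snippet, not taken from the patch): file_path() fills the caller-supplied buffer and returns either an ERR_PTR() or a pointer into that same buffer, so the tracepoint can hand it its own __array() storage directly.

    #include <linux/err.h>
    #include <linux/fs.h>
    #include <linux/printk.h>

    static void log_backing_file(struct file *file)
    {
    	char buf[256];
    	char *path;

    	path = file_path(file, buf, sizeof(buf));
    	if (IS_ERR(path))
    		path = "(unknown)";
    	pr_debug("backing file: %s\n", path);
    }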
Fixes: 3934e8ebb7cc6 ("xfs: create a big array data structure") Fixes: 5076a6040ca16 ("xfs: support in-memory buffer cache targets") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403290419.HPcyvqZu-lkp@intel.com/ Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/trace.h | 10 ++++------ fs/xfs/xfs_trace.h | 10 ++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 92ef4cdc486e..c886d5d0eb02 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -959,18 +959,16 @@ TRACE_EVENT(xfile_create, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, ino) - __array(char, pathname, 256) + __array(char, pathname, MAXNAMELEN) ), TP_fast_assign( - char pathname[257]; char *path; __entry->ino = file_inode(xf->file)->i_ino; - memset(pathname, 0, sizeof(pathname)); - path = file_path(xf->file, pathname, sizeof(pathname) - 1); + path = file_path(xf->file, __entry->pathname, MAXNAMELEN); if (IS_ERR(path)) - path = "(unknown)"; - strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + strncpy(__entry->pathname, "(unknown)", + sizeof(__entry->pathname)); ), TP_printk("xfino 0x%lx path '%s'", __entry->ino, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5646d300b286..180ce697305a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4715,20 +4715,18 @@ TRACE_EVENT(xmbuf_create, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, ino) - __array(char, pathname, 256) + __array(char, pathname, MAXNAMELEN) ), TP_fast_assign( - char pathname[257]; char *path; struct file *file = btp->bt_file; __entry->dev = btp->bt_mount->m_super->s_dev; __entry->ino = file_inode(file)->i_ino; - memset(pathname, 0, sizeof(pathname)); - path = file_path(file, pathname, sizeof(pathname) - 1); + path = file_path(file, __entry->pathname, MAXNAMELEN); if (IS_ERR(path)) - path = "(unknown)"; - strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + strncpy(__entry->pathname, "(unknown)", + sizeof(__entry->pathname)); ), TP_printk("dev %d:%d xmino 0x%lx path '%s'", MAJOR(__entry->dev), MINOR(__entry->dev), From af5d92f2fad818663da2ce073b6fe15b9d56ffdc Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Sun, 21 Jul 2024 07:27:01 -0400 Subject: [PATCH 0136/1052] xfs: remove unused parameter in macro XFS_DQUOT_LOGRES In the macro definition of XFS_DQUOT_LOGRES, a parameter is accepted, but it is not used. Hence, it should be removed. This patch has only passed compilation test, but it should be fine. Signed-off-by: Julian Sun Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_quota_defs.h | 2 +- fs/xfs/libxfs/xfs_trans_resv.c | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index cb035da3f990..fb05f44f6c75 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -56,7 +56,7 @@ typedef uint8_t xfs_dqtype_t; * And, of course, we also need to take into account the dquot log format item * used to describe each dquot. 
*/ -#define XFS_DQUOT_LOGRES(mp) \ +#define XFS_DQUOT_LOGRES \ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) #define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 3dc8f785bf29..45aaf169806a 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -338,11 +338,11 @@ xfs_calc_write_reservation( blksz); t1 += adj; t3 += adj; - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + return XFS_DQUOT_LOGRES + max3(t1, t2, t3); } t4 = xfs_calc_refcountbt_reservation(mp, 1); - return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); + return XFS_DQUOT_LOGRES + max(t4, max3(t1, t2, t3)); } unsigned int @@ -410,11 +410,11 @@ xfs_calc_itruncate_reservation( xfs_refcountbt_block_count(mp, 4), blksz); - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + return XFS_DQUOT_LOGRES + max3(t1, t2, t3); } t4 = xfs_calc_refcountbt_reservation(mp, 2); - return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); + return XFS_DQUOT_LOGRES + max(t4, max3(t1, t2, t3)); } unsigned int @@ -466,7 +466,7 @@ STATIC uint xfs_calc_rename_reservation( struct xfs_mount *mp) { - unsigned int overhead = XFS_DQUOT_LOGRES(mp); + unsigned int overhead = XFS_DQUOT_LOGRES; struct xfs_trans_resv *resp = M_RES(mp); unsigned int t1, t2, t3 = 0; @@ -577,7 +577,7 @@ STATIC uint xfs_calc_link_reservation( struct xfs_mount *mp) { - unsigned int overhead = XFS_DQUOT_LOGRES(mp); + unsigned int overhead = XFS_DQUOT_LOGRES; struct xfs_trans_resv *resp = M_RES(mp); unsigned int t1, t2, t3 = 0; @@ -641,7 +641,7 @@ STATIC uint xfs_calc_remove_reservation( struct xfs_mount *mp) { - unsigned int overhead = XFS_DQUOT_LOGRES(mp); + unsigned int overhead = XFS_DQUOT_LOGRES; struct xfs_trans_resv *resp = M_RES(mp); unsigned int t1, t2, t3 = 0; @@ -729,7 +729,7 @@ xfs_calc_icreate_reservation( struct xfs_mount *mp) { struct xfs_trans_resv *resp = M_RES(mp); - unsigned int overhead = XFS_DQUOT_LOGRES(mp); + unsigned int overhead = XFS_DQUOT_LOGRES; unsigned int t1, t2, t3 = 0; t1 = xfs_calc_icreate_resv_alloc(mp); @@ -747,7 +747,7 @@ STATIC uint xfs_calc_create_tmpfile_reservation( struct xfs_mount *mp) { - uint res = XFS_DQUOT_LOGRES(mp); + uint res = XFS_DQUOT_LOGRES; res += xfs_calc_icreate_resv_alloc(mp); return res + xfs_calc_iunlink_add_reservation(mp); @@ -829,7 +829,7 @@ STATIC uint xfs_calc_ifree_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + xfs_calc_iunlink_remove_reservation(mp) + @@ -846,7 +846,7 @@ STATIC uint xfs_calc_ichange_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); @@ -955,7 +955,7 @@ STATIC uint xfs_calc_addafork_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + @@ -1003,7 +1003,7 @@ STATIC uint xfs_calc_attrsetm_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); @@ -1043,7 +1043,7 @@ STATIC uint xfs_calc_attrrm_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + 
max((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)) + From 8c2263b9231754f341e204124801c60de969298c Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 9 Jul 2024 15:36:32 +0800 Subject: [PATCH 0137/1052] xfs: convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Chen Ni Fixes: 8f4b980ee67f ("xfs: pass the attr value to put_listent when possible") Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_attr_list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 5c947e5ce8b8..7db386304875 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -139,7 +139,7 @@ xfs_attr_shortform_list( sbp->name = sfe->nameval; sbp->namelen = sfe->namelen; /* These are bytes, and both on-disk, don't endian-flip */ - sbp->value = &sfe->nameval[sfe->namelen], + sbp->value = &sfe->nameval[sfe->namelen]; sbp->valuelen = sfe->valuelen; sbp->flags = sfe->flags; sbp->hash = xfs_attr_hashval(dp->i_mount, sfe->flags, From 7bf888fa26e8f22bed4bc3965ab2a2953104ff96 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 16 Jul 2024 16:01:12 +0800 Subject: [PATCH 0138/1052] xfs: convert comma to semicolon Replace a comma between expression statements by a semicolon. Fixes: 178b48d588ea ("xfs: remove the for_each_xbitmap_ helpers") Signed-off-by: Chen Ni Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/scrub/agheader_repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 0dbc484b182f..2f98d90d7fd6 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -696,7 +696,7 @@ xrep_agfl_init_header( * step. */ xagb_bitmap_init(&af.used_extents); - af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp), + af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp); xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af); error = xagb_bitmap_disunion(agfl_extents, &af.used_extents); if (error) From dd4a799bcc13992dd8be9708e5c585f55226b567 Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 26 Jul 2024 16:49:29 +0800 Subject: [PATCH 0139/1052] KVM: riscv: selftests: Fix compile error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix compile error introduced by commit d27c34a73514 ("KVM: riscv: selftests: Add some Zc* extensions to get-reg-list test"). These 4 lines should be end with ";". 
Fixes: d27c34a73514 ("KVM: riscv: selftests: Add some Zc* extensions to get-reg-list test") Signed-off-by: Yong-Xuan Wang Reviewed-by: Clément Léger Link: https://lore.kernel.org/r/20240726084931.28924-5-yongxuan.wang@sifive.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/riscv/get-reg-list.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index f92c2fb23fcd..8e34f7fa44e9 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -961,10 +961,10 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zbkb, ZBKB); KVM_ISA_EXT_SIMPLE_CONFIG(zbkc, ZBKC); KVM_ISA_EXT_SIMPLE_CONFIG(zbkx, ZBKX); KVM_ISA_EXT_SIMPLE_CONFIG(zbs, ZBS); -KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA), -KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB), -KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD), -KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF), +KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA); +KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB); +KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD); +KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF); KVM_ISA_EXT_SIMPLE_CONFIG(zcmop, ZCMOP); KVM_ISA_EXT_SIMPLE_CONFIG(zfa, ZFA); KVM_ISA_EXT_SIMPLE_CONFIG(zfh, ZFH); From b5fbf924f125ba3638cfdc21c0515eb7e76264ca Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 24 Jul 2024 11:09:54 -0500 Subject: [PATCH 0140/1052] drm/client: Fix error code in drm_client_buffer_vmap_local() This function accidentally returns zero/success on the failure path. It leads to locking issues and an uninitialized *map_copy in the caller. Fixes: b4b0193e83cb ("drm/fbdev-generic: Fix locking with drm_client_buffer_vmap_local()") Signed-off-by: Dan Carpenter Reviewed-by: Dmitry Osipenko Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/89d13df3-747c-4c5d-b122-d081aef5110a@stanley.mountain --- drivers/gpu/drm/drm_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c index 2803ac111bbd..bfedcbf516db 100644 --- a/drivers/gpu/drm/drm_client.c +++ b/drivers/gpu/drm/drm_client.c @@ -355,7 +355,7 @@ int drm_client_buffer_vmap_local(struct drm_client_buffer *buffer, err_drm_gem_vmap_unlocked: drm_gem_unlock(gem); - return 0; + return ret; } EXPORT_SYMBOL(drm_client_buffer_vmap_local); From 291e4baf70019f17a81b7b47aeb186b27d222159 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Mon, 29 Jul 2024 10:46:04 +0800 Subject: [PATCH 0141/1052] kselftests: dmabuf-heaps: Ensure the driver name is null-terminated Even if a vgem device is configured in, we will skip the import_vgem_fd() test almost every time. TAP version 13 1..11 # Testing heap: system # ======================================= # Testing allocation and importing: ok 1 # SKIP Could not open vgem -1 The problem is that we use the DRM_IOCTL_VERSION ioctl to query the driver version information but leave the name field a non-null-terminated string. Terminate it properly to actually test against the vgem device. While at it, let's check the length of the driver name is exactly 4 bytes and return early otherwise (in case there is a name like "vgemfoo" that gets converted to "vgem\0" unexpectedly). 
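Condensed into a hedged userspace sketch (mirroring the fixed check_vgem() in the diff below; the header path may differ between kernel uapi and libdrm installs): the ioctl copies at most the caller-supplied number of bytes, reports the driver name's full length back in name_len, and never NUL-terminates the buffer.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <drm/drm.h>

    /* Returns 1 if fd belongs to the vgem driver, 0 otherwise. */
    static int is_vgem(int fd)
    {
    	char name[5] = "";
    	struct drm_version version = { 0 };

    	version.name_len = 4;
    	version.name = name;

    	if (ioctl(fd, DRM_IOCTL_VERSION, &version) || version.name_len != 4)
    		return 0;

    	name[4] = '\0';		/* the kernel does not terminate it for us */
    	return !strcmp(name, "vgem");
    }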
Signed-off-by: Zenghui Yu Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20240729024604.2046-1-yuzenghui@huawei.com --- tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c index 5f541522364f..5d0a809dc2df 100644 --- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c +++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c @@ -29,9 +29,11 @@ static int check_vgem(int fd) version.name = name; ret = ioctl(fd, DRM_IOCTL_VERSION, &version); - if (ret) + if (ret || version.name_len != 4) return 0; + name[4] = '\0'; + return !strcmp(name, "vgem"); } From 37c526f00bc1c4f847fc800085f8f009d2e11be6 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 10 Jan 2022 09:28:56 -0800 Subject: [PATCH 0142/1052] i2c: smbus: Improve handling of stuck alerts The following messages were observed while testing alert functionality on systems with multiple I2C devices on a single bus if alert was active on more than one chip. smbus_alert 3-000c: SMBALERT# from dev 0x0c, flag 0 smbus_alert 3-000c: no driver alert()! and: smbus_alert 3-000c: SMBALERT# from dev 0x28, flag 0 Once it starts, this message repeats forever at high rate. There is no device at any of the reported addresses. Analysis shows that this is seen if multiple devices have the alert pin active. Apparently some devices do not support SMBus arbitration correctly. They keep sending address bits after detecting an address collision and handle the collision not at all or too late. Specifically, address 0x0c is seen with ADT7461A at address 0x4c and ADM1021 at address 0x18 if alert is active on both chips. Address 0x28 is seen with ADT7483 at address 0x2a and ADT7461 at address 0x4c if alert is active on both chips. Once the system is in bad state (alert is set by more than one chip), it often only recovers by power cycling. To reduce the impact of this problem, abort the endless loop in smbus_alert() if the same address is read more than once and not handled by a driver. 
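The fix builds on the device_for_each_child() convention that a non-zero return from the callback stops the walk and is propagated back to the caller; a hedged, self-contained illustration of that convention (the counting helpers are invented for this example, the real callback is smbus_do_alert() below):

    #include <linux/device.h>

    static int count_one_child(struct device *dev, void *data)
    {
    	int *count = data;

    	(*count)++;
    	return 0;	/* keep iterating; any non-zero value stops the walk */
    }

    static int count_bus_children(struct device *parent)
    {
    	int count = 0;

    	device_for_each_child(parent, &count, count_one_child);
    	return count;
    }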
Fixes: b5527a7766f0 ("i2c: Add SMBus alert support") Signed-off-by: Guenter Roeck [wsa: it also fixed an interrupt storm in one of my experiments] Tested-by: Wolfram Sang [wsa: rebased, moved a comment as well, improved the 'invalid' value] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-smbus.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c index 7e4203df83ed..836c247e7684 100644 --- a/drivers/i2c/i2c-smbus.c +++ b/drivers/i2c/i2c-smbus.c @@ -34,6 +34,7 @@ static int smbus_do_alert(struct device *dev, void *addrp) struct i2c_client *client = i2c_verify_client(dev); struct alert_data *data = addrp; struct i2c_driver *driver; + int ret; if (!client || client->addr != data->addr) return 0; @@ -47,16 +48,21 @@ static int smbus_do_alert(struct device *dev, void *addrp) device_lock(dev); if (client->dev.driver) { driver = to_i2c_driver(client->dev.driver); - if (driver->alert) + if (driver->alert) { + /* Stop iterating after we find the device */ driver->alert(client, data->type, data->data); - else + ret = -EBUSY; + } else { dev_warn(&client->dev, "no driver alert()!\n"); - } else + ret = -EOPNOTSUPP; + } + } else { dev_dbg(&client->dev, "alert with no driver\n"); + ret = -ENODEV; + } device_unlock(dev); - /* Stop iterating after we find the device */ - return -EBUSY; + return ret; } /* @@ -67,6 +73,7 @@ static irqreturn_t smbus_alert(int irq, void *d) { struct i2c_smbus_alert *alert = d; struct i2c_client *ara; + unsigned short prev_addr = I2C_CLIENT_END; /* Not a valid address */ ara = alert->ara; @@ -94,8 +101,19 @@ static irqreturn_t smbus_alert(int irq, void *d) data.addr, data.data); /* Notify driver for the device which issued the alert */ - device_for_each_child(&ara->adapter->dev, &data, - smbus_do_alert); + status = device_for_each_child(&ara->adapter->dev, &data, + smbus_do_alert); + /* + * If we read the same address more than once, and the alert + * was not handled by a driver, it won't do any good to repeat + * the loop because it will never terminate. + * Bail out in this case. + * Note: This assumes that a driver with alert handler handles + * the alert properly and clears it if necessary. 
+ */ + if (data.addr == prev_addr && status != -EBUSY) + break; + prev_addr = data.addr; } return IRQ_HANDLED; From a4765eb49cd94a7653c1a643b03d4facc5d87aa5 Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Wed, 17 Jul 2024 14:25:20 +0200 Subject: [PATCH 0143/1052] irqchip/irq-pic32-evic: Add missing 'static' to internal function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build error reported by gcc 12: drivers/irqchip/irq-pic32-evic.c:164:5: error: no previous prototype for ‘pic32_irq_domain_xlate’ [-Werror=missing-prototypes] 164 | int pic32_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, | ^~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Luca Ceresoli Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240717-irq-pic32-evic-fix-build-static-v1-1-5129085589c6@bootlin.com --- drivers/irqchip/irq-pic32-evic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-pic32-evic.c b/drivers/irqchip/irq-pic32-evic.c index 1d9bb28d13e5..277240325cbc 100644 --- a/drivers/irqchip/irq-pic32-evic.c +++ b/drivers/irqchip/irq-pic32-evic.c @@ -161,9 +161,9 @@ static int pic32_irq_domain_map(struct irq_domain *d, unsigned int virq, return ret; } -int pic32_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, - const u32 *intspec, unsigned int intsize, - irq_hw_number_t *out_hwirq, unsigned int *out_type) +static int pic32_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) { struct evic_chip_data *priv = d->host_data; From 9da49aa80d686582bc3a027112a30484c9be6b6e Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Fri, 26 Jul 2024 06:40:49 +0900 Subject: [PATCH 0144/1052] tun: Add missing bpf_net_ctx_clear() in do_xdp_generic() There are cases where do_xdp_generic returns bpf_net_context without clearing it. This causes various memory corruptions, so the missing bpf_net_ctx_clear must be added. Reported-by: syzbot+44623300f057a28baf1e@syzkaller.appspotmail.com Fixes: fecef4cd42c6 ("tun: Assign missing bpf_net_context.") Signed-off-by: Jeongjun Park Acked-by: Jason Wang Reviewed-by: Willem de Bruijn Reported-by: syzbot+3c2b6d5d4bec3b904933@syzkaller.appspotmail.com Reported-by: syzbot+707d98c8649695eaf329@syzkaller.appspotmail.com Reported-by: syzbot+c226757eb784a9da3e8b@syzkaller.appspotmail.com Reported-by: syzbot+61a1cfc2b6632363d319@syzkaller.appspotmail.com Reported-by: syzbot+709e4c85c904bcd62735@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index 6ea1d20676fb..751d9b70e6ad 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5150,6 +5150,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } + bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: From daefd348a5938d2256d304b57a9e787a83bb58d9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:49 -0700 Subject: [PATCH 0145/1052] eth: bnxt: reject unsupported hash functions In commit under Fixes I split the bnxt_set_rxfh_context() function, and attached the appropriate chunks to new ops. I missed that bnxt_set_rxfh_context() gets called after some initial checks in bnxt_set_rxfh(), namely that the hash function is Toeplitz. 
Fixes: 5c466b4d4e75 ("eth: bnxt: move from .set_rxfh to .create_rxfh_context and friends") Signed-off-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index d00ef0063820..0425a54eca98 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1863,8 +1863,14 @@ static void bnxt_modify_rss(struct bnxt *bp, struct ethtool_rxfh_context *ctx, } static int bnxt_rxfh_context_check(struct bnxt *bp, + const struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) { + if (rxfh->hfunc && rxfh->hfunc != ETH_RSS_HASH_TOP) { + NL_SET_ERR_MSG_MOD(extack, "RSS hash function not supported"); + return -EOPNOTSUPP; + } + if (!BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) { NL_SET_ERR_MSG_MOD(extack, "RSS contexts not supported"); return -EOPNOTSUPP; @@ -1888,7 +1894,7 @@ static int bnxt_create_rxfh_context(struct net_device *dev, struct bnxt_vnic_info *vnic; int rc; - rc = bnxt_rxfh_context_check(bp, extack); + rc = bnxt_rxfh_context_check(bp, rxfh, extack); if (rc) return rc; @@ -1953,7 +1959,7 @@ static int bnxt_modify_rxfh_context(struct net_device *dev, struct bnxt_rss_ctx *rss_ctx; int rc; - rc = bnxt_rxfh_context_check(bp, extack); + rc = bnxt_rxfh_context_check(bp, rxfh, extack); if (rc) return rc; From 9dbad38336a9c9a6e77df07c6c770ff6cf55c365 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:50 -0700 Subject: [PATCH 0146/1052] eth: bnxt: populate defaults in the RSS context struct As described in the kdoc for .create_rxfh_context we are responsible for populating the defaults. The core will not call .get_rxfh for non-0 context. The problem can be easily observed since Netlink doesn't currently use the cache. Using netlink ethtool: $ ethtool -x eth0 context 1 [...] RSS hash key: 13:60:cd:60:14:d3:55:36:86:df:90:f2:96:14:e2:21:05:57:a8:8f:a5:12:5e:54:62:7f:fd:3c:15:7e:76:05:71:42:a2:9a:73:80:09:9c RSS hash function: toeplitz: on xor: off crc32: off But using IOCTL ethtool shows: $ ./ethtool-old -x eth0 context 1 [...] RSS hash key: 00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00 RSS hash function: Operation not supported Fixes: 7964e7884643 ("net: ethtool: use the tracking array for get_rxfh on custom RSS contexts") Signed-off-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 0425a54eca98..ab8e3f197e7b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1921,8 +1921,12 @@ static int bnxt_create_rxfh_context(struct net_device *dev, if (rc) goto out; + /* Populate defaults in the context */ bnxt_set_dflt_rss_indir_tbl(bp, ctx); + ctx->hfunc = ETH_RSS_HASH_TOP; memcpy(vnic->rss_hash_key, bp->rss_hash_key, HW_HASH_KEY_SIZE); + memcpy(ethtool_rxfh_context_key(ctx), + bp->rss_hash_key, HW_HASH_KEY_SIZE); rc = bnxt_hwrm_vnic_alloc(bp, vnic, 0, bp->rx_nr_rings); if (rc) { From 7195f0ef7f5b8c678cf28de7c9b619cb908b482c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:51 -0700 Subject: [PATCH 0147/1052] ethtool: fix setting key and resetting indir at once The indirection table and the key follow struct ethtool_rxfh in user memory. To reset the indirection table user space calls SET_RXFH with table of size 0 (OTOH to say "no change" it should use -1 / ~0). The logic for calculating the offset where they key sits is incorrect in this case, as kernel would still offset by the full table length, while for the reset there is no indir table and key is immediately after the struct. $ ethtool -X eth0 default hkey 01:02:03... $ ethtool -x eth0 [...] RSS hash key: 00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00 [...] Fixes: 3de0b592394d ("ethtool: Support for configurable RSS hash key") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 983fee76f5cf..a37ba113610a 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1331,13 +1331,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); const struct ethtool_ops *ops = dev->ethtool_ops; u32 dev_indir_size = 0, dev_key_size = 0, i; + u32 user_indir_len = 0, indir_bytes = 0; struct ethtool_rxfh_param rxfh_dev = {}; struct ethtool_rxfh_context *ctx = NULL; struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; bool locked = false; /* dev->ethtool->rss_lock taken */ - u32 indir_bytes = 0; bool create = false; u8 *rss_config; int ret; @@ -1400,6 +1400,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, */ if (rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { + user_indir_len = indir_bytes; rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, @@ -1426,7 +1427,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.key_size = dev_key_size; rxfh_dev.key = rss_config + indir_bytes; if (copy_from_user(rxfh_dev.key, - useraddr + rss_cfg_offset + indir_bytes, + useraddr + rss_cfg_offset + user_indir_len, rxfh.key_size)) { ret = -EFAULT; goto out; From dc9755370e1c5965d16dff98c9877f5b1847e367 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:52 -0700 Subject: [PATCH 0148/1052] ethtool: fix the state of additional contexts with old API We expect drivers implementing the new create/modify/destroy API to populate the 
defaults in struct ethtool_rxfh_context. In legacy API ctx isn't even passed, and rxfh.indir / rxfh.key are NULL so drivers can't give us defaults even if they want to. Call get_rxfh() to fetch the values. We can reuse rxfh_dev for the get_rxfh(), rxfh stores the input from the user. This fixes IOCTL reporting 0s instead of the default key / indir table for drivers using legacy API. Add a check to try to catch drivers using the new API but not populating the key. Fixes: 7964e7884643 ("net: ethtool: use the tracking array for get_rxfh on custom RSS contexts") Signed-off-by: Jakub Kicinski Reviewed-by: Edward Cree Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index a37ba113610a..8ca13208d240 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1382,10 +1382,9 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh.input_xfrm == RXH_XFRM_NO_CHANGE)) return -EINVAL; - if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) - indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); + indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); - rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); + rss_config = kzalloc(indir_bytes + dev_key_size, GFP_USER); if (!rss_config) return -ENOMEM; @@ -1475,16 +1474,21 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.input_xfrm = rxfh.input_xfrm; if (rxfh.rss_context && ops->create_rxfh_context) { - if (create) + if (create) { ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev, extack); - else if (rxfh_dev.rss_delete) + /* Make sure driver populates defaults */ + WARN_ON_ONCE(!ret && !rxfh_dev.key && + !memchr_inv(ethtool_rxfh_context_key(ctx), + 0, ctx->key_size)); + } else if (rxfh_dev.rss_delete) { ret = ops->remove_rxfh_context(dev, ctx, rxfh.rss_context, extack); - else + } else { ret = ops->modify_rxfh_context(dev, ctx, &rxfh_dev, extack); + } } else { ret = ops->set_rxfh(dev, &rxfh_dev, extack); } @@ -1523,6 +1527,22 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, kfree(ctx); goto out; } + + /* Fetch the defaults for the old API, in the new API drivers + * should write defaults into ctx themselves. 
+ */ + rxfh_dev.indir = (u32 *)rss_config; + rxfh_dev.indir_size = dev_indir_size; + + rxfh_dev.key = rss_config + indir_bytes; + rxfh_dev.key_size = dev_key_size; + + ret = ops->get_rxfh(dev, &rxfh_dev); + if (WARN_ON(ret)) { + xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context); + kfree(ctx); + goto out; + } } if (rxfh_dev.rss_delete) { WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context) != ctx); @@ -1531,12 +1551,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (rxfh_dev.indir) { for (i = 0; i < dev_indir_size; i++) ethtool_rxfh_context_indir(ctx)[i] = rxfh_dev.indir[i]; - ctx->indir_configured = 1; + ctx->indir_configured = + rxfh.indir_size && + rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE; } if (rxfh_dev.key) { memcpy(ethtool_rxfh_context_key(ctx), rxfh_dev.key, dev_key_size); - ctx->key_configured = 1; + ctx->key_configured = !!rxfh.key_size; } if (rxfh_dev.hfunc != ETH_RSS_HASH_NO_CHANGE) ctx->hfunc = rxfh_dev.hfunc; From 0d6ccfe6b319d56da63b7d7cfbcecd92780a680d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:53 -0700 Subject: [PATCH 0149/1052] selftests: drv-net: rss_ctx: check for all-zero keys We had a handful of bugs relating to key being either all 0 or just reported incorrectly as all 0. Check for this in the tests. Signed-off-by: Jakub Kicinski Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- .../selftests/drivers/net/hw/rss_ctx.py | 37 +++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 931dbc36ca43..011508ca604b 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -19,6 +19,15 @@ def _rss_key_rand(length): return [random.randint(0, 255) for _ in range(length)] +def _rss_key_check(cfg, data=None, context=0): + if data is None: + data = get_rss(cfg, context=context) + if 'rss-hash-key' not in data: + return + non_zero = [x for x in data['rss-hash-key'] if x != 0] + ksft_eq(bool(non_zero), True, comment=f"RSS key is all zero {data['rss-hash-key']}") + + def get_rss(cfg, context=0): return ethtool(f"-x {cfg.ifname} context {context}", json=True)[0] @@ -90,8 +99,9 @@ def _send_traffic_check(cfg, port, name, params): def test_rss_key_indir(cfg): """Test basics like updating the main RSS key and indirection table.""" - if len(_get_rx_cnts(cfg)) < 2: - KsftSkipEx("Device has only one queue (or doesn't support queue stats)") + qcnt = len(_get_rx_cnts(cfg)) + if qcnt < 3: + KsftSkipEx("Device has fewer than 3 queues (or doesn't support queue stats)") data = get_rss(cfg) want_keys = ['rss-hash-key', 'rss-hash-function', 'rss-indirection-table'] @@ -101,6 +111,7 @@ def test_rss_key_indir(cfg): if not data[k]: raise KsftFailEx(f"ethtool results empty for '{k}': {data[k]}") + _rss_key_check(cfg, data=data) key_len = len(data['rss-hash-key']) # Set the key @@ -110,9 +121,26 @@ def test_rss_key_indir(cfg): data = get_rss(cfg) ksft_eq(key, data['rss-hash-key']) + # Set the indirection table and the key together + key = _rss_key_rand(key_len) + ethtool(f"-X {cfg.ifname} equal 3 hkey " + _rss_key_str(key)) + reset_indir = defer(ethtool, f"-X {cfg.ifname} default") + + data = get_rss(cfg) + _rss_key_check(cfg, data=data) + ksft_eq(0, min(data['rss-indirection-table'])) + ksft_eq(2, max(data['rss-indirection-table'])) + + # Reset indirection table and set the key + key = _rss_key_rand(key_len) + ethtool(f"-X 
{cfg.ifname} default hkey " + _rss_key_str(key)) + data = get_rss(cfg) + _rss_key_check(cfg, data=data) + ksft_eq(0, min(data['rss-indirection-table'])) + ksft_eq(qcnt - 1, max(data['rss-indirection-table'])) + # Set the indirection table ethtool(f"-X {cfg.ifname} equal 2") - reset_indir = defer(ethtool, f"-X {cfg.ifname} default") data = get_rss(cfg) ksft_eq(0, min(data['rss-indirection-table'])) ksft_eq(1, max(data['rss-indirection-table'])) @@ -317,8 +345,11 @@ def test_rss_context(cfg, ctx_cnt=1, create_with_cfg=None): ctx_cnt = i break + _rss_key_check(cfg, context=ctx_id) + if not create_with_cfg: ethtool(f"-X {cfg.ifname} context {ctx_id} {want_cfg}") + _rss_key_check(cfg, context=ctx_id) # Sanity check the context we just created data = get_rss(cfg, ctx_id) From 6623b0217d0c9bed80bfa43b778ce1c0eb03b497 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 21 Jul 2024 18:45:41 +0200 Subject: [PATCH 0150/1052] locking/pvqspinlock: Correct the type of "old" variable in pv_kick_node() "enum vcpu_state" is not compatible with "u8" type for all targets, resulting in: error: initialization of 'u8 *' {aka 'unsigned char *'} from incompatible pointer type 'enum vcpu_state *' for LoongArch. Correct the type of "old" variable to "u8". Fixes: fea0e1820b51 ("locking/pvqspinlock: Use try_cmpxchg() in qspinlock_paravirt.h") Closes: https://lore.kernel.org/lkml/20240719024010.3296488-1-maobibo@loongson.cn/ Reported-by: Bibo Mao Signed-off-by: Uros Bizjak Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20240721164552.50175-1-ubizjak@gmail.com --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f5a36e67b593..ac2e22502741 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -357,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - enum vcpu_state old = vcpu_halted; + u8 old = vcpu_halted; /* * If the vCPU is indeed halted, advance its state to match that of * pv_wait_node(). If OTOH this fails, the vCPU was running and will From b1d0e15c8725d21a73c22c099418a63940261041 Mon Sep 17 00:00:00 2001 From: Zhenyu Wang Date: Wed, 17 Jul 2024 11:16:09 +0800 Subject: [PATCH 0151/1052] perf/x86/intel/cstate: Add pkg C2 residency counter for Sierra Forest Package C2 residency counter is also available on Sierra Forest. So add it support in srf_cstates. Fixes: 3877d55a0db2 ("perf/x86/intel/cstate: Add Sierra Forest support") Signed-off-by: Zhenyu Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Tested-by: Wendy Wang Link: https://lore.kernel.org/r/20240717031609.74513-1-zhenyuw@linux.intel.com --- arch/x86/events/intel/cstate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index be58cfb012dd..9f116dfc4728 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -64,7 +64,7 @@ * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL + * RPL,SPR,MTL,ARL,LNL,SRF * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. 
* perf code: 0x01 @@ -693,7 +693,8 @@ static const struct cstate_model srf_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), - .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES), + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C6_RES), .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), }; From 05f76b2d634e65ab34472802d9b142ea9e03f74e Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Fri, 26 Jul 2024 13:41:05 -0700 Subject: [PATCH 0152/1052] tcp: Adjust clamping window for applications specifying SO_RCVBUF tp->scaling_ratio is not updated based on skb->len/skb->truesize once SO_RCVBUF is set leading to the maximum window scaling to be 25% of rcvbuf after commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") and 50% of rcvbuf after commit 697a6c8cec03 ("tcp: increase the default TCP scaling ratio"). 50% tries to emulate the behavior of older kernels using sysctl_tcp_adv_win_scale with default value. Systems which were using a different values of sysctl_tcp_adv_win_scale in older kernels ended up seeing reduced download speeds in certain cases as covered in https://lists.openwall.net/netdev/2024/05/15/13 While the sysctl scheme is no longer acceptable, the value of 50% is a bit conservative when the skb->len/skb->truesize ratio is later determined to be ~0.66. Applications not specifying SO_RCVBUF update the window scaling and the receiver buffer every time data is copied to userspace. This computation is now used for applications setting SO_RCVBUF to update the maximum window scaling while ensuring that the receive buffer is within the application specified limit. Fixes: dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 454362e359da..e2b9583ed96a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -754,8 +754,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * */ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)) { u64 rcvwin, grow; int rcvbuf; @@ -771,12 +770,22 @@ void tcp_rcv_space_adjust(struct sock *sk) rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - if (rcvbuf > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - /* Make the window clamp follow along. */ - WRITE_ONCE(tp->window_clamp, - tcp_win_from_space(sk, rcvbuf)); + /* Make the window clamp follow along. */ + WRITE_ONCE(tp->window_clamp, + tcp_win_from_space(sk, rcvbuf)); + } + } else { + /* Make the window clamp follow along while being bounded + * by SO_RCVBUF. + */ + int clamp = tcp_win_from_space(sk, min(rcvbuf, sk->sk_rcvbuf)); + + if (clamp > tp->window_clamp) + WRITE_ONCE(tp->window_clamp, clamp); } } tp->rcvq_space.space = copied; From 799a829507506924add8a7620493adc1c3cfda30 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Fri, 26 Jul 2024 15:06:50 +0800 Subject: [PATCH 0153/1052] net: axienet: start napi before enabling Rx/Tx softirq may get lost if an Rx interrupt comes before we call napi_enable. 
Move napi_enable in front of axienet_setoptions(), which turns on the device, to address the issue. Link: https://lists.gnu.org/archive/html/qemu-devel/2024-07/msg06160.html Fixes: cc37610caaf8 ("net: axienet: implement NAPI and GRO receive") Signed-off-by: Andy Chiu Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index e342f387c3dd..02fdf66e07fa 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2219,9 +2219,9 @@ static void axienet_dma_err_handler(struct work_struct *work) ~(XAE_OPTION_TXEN | XAE_OPTION_RXEN)); axienet_set_mac_address(ndev, NULL); axienet_set_multicast_list(ndev); - axienet_setoptions(ndev, lp->options); napi_enable(&lp->napi_rx); napi_enable(&lp->napi_tx); + axienet_setoptions(ndev, lp->options); } /** From 9415d375d8520e0ed55f0c0b058928da9a5b5b3d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 26 Jul 2024 17:19:53 -0700 Subject: [PATCH 0154/1052] rtnetlink: Don't ignore IFLA_TARGET_NETNSID when ifname is specified in rtnl_dellink(). The cited commit accidentally replaced tgt_net with net in rtnl_dellink(). As a result, IFLA_TARGET_NETNSID is ignored if the interface is specified with IFLA_IFNAME or IFLA_ALT_IFNAME. Let's pass tgt_net to rtnl_dev_get(). Fixes: cc6090e985d7 ("net: rtnetlink: introduce helper to get net_device instance by ifname") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 87e67194f240..73fd7f543fd0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3288,7 +3288,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(net, tb); + dev = rtnl_dev_get(tgt_net, tb); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else From 3c0b6f924e1259ade38587ea719b693f6f6f2f3e Mon Sep 17 00:00:00 2001 From: Mavroudis Chatzilazaridis Date: Sun, 28 Jul 2024 12:36:04 +0000 Subject: [PATCH 0155/1052] ALSA: hda/realtek: Add quirk for Acer Aspire E5-574G ALC255_FIXUP_ACER_LIMIT_INT_MIC_BOOST fixes combo jack detection and limits the internal microphone boost that causes clipping on this model. 
Signed-off-by: Mavroudis Chatzilazaridis Cc: Link: https://patch.msgid.link/20240728123601.144017-1-mavchatz@protonmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ba0ce8750ca4..1645d21d422f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9872,6 +9872,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x079b, "Acer Aspire V5-573G", ALC282_FIXUP_ASPIRE_V5_PINS), SND_PCI_QUIRK(0x1025, 0x080d, "Acer Aspire V5-122P", ALC269_FIXUP_ASPIRE_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x0840, "Acer Aspire E1", ALC269VB_FIXUP_ASPIRE_E1_COEF), + SND_PCI_QUIRK(0x1025, 0x100c, "Acer Aspire E5-574G", ALC255_FIXUP_ACER_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1025, 0x101c, "Acer Veriton N2510G", ALC269_FIXUP_LIFEBOOK), SND_PCI_QUIRK(0x1025, 0x102b, "Acer Aspire C24-860", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1065, "Acer Aspire C20-820", ALC269VC_FIXUP_ACER_HEADSET_MIC), From 4bc14b9cfaa2149d41baef2f2620e9f82d9847d7 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 28 Jun 2024 17:56:43 -0700 Subject: [PATCH 0156/1052] i915/perf: Remove code to update PWR_CLK_STATE for gen12 PWR_CLK_STATE only needs to be modified up until gen11. For gen12 this code is not applicable. Remove code to update context image with PWR_CLK_STATE for gen12. Fixes: 00a7f0d7155c ("drm/i915/tgl: Add perf support on TGL") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20240629005643.3050678-1-umesh.nerlige.ramappa@intel.com (cherry picked from commit 7b5bdae7740eb6a3d09f9cd4e4b07362a15b86b3) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_perf.c | 33 -------------------------------- 1 file changed, 33 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 0b1cd4c7a525..025a79fe5920 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -2748,26 +2748,6 @@ oa_configure_all_contexts(struct i915_perf_stream *stream, return 0; } -static int -gen12_configure_all_contexts(struct i915_perf_stream *stream, - const struct i915_oa_config *oa_config, - struct i915_active *active) -{ - struct flex regs[] = { - { - GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE), - CTX_R_PWR_CLK_STATE, - }, - }; - - if (stream->engine->class != RENDER_CLASS) - return 0; - - return oa_configure_all_contexts(stream, - regs, ARRAY_SIZE(regs), - active); -} - static int lrc_configure_all_contexts(struct i915_perf_stream *stream, const struct i915_oa_config *oa_config, @@ -2874,7 +2854,6 @@ gen12_enable_metric_set(struct i915_perf_stream *stream, { struct drm_i915_private *i915 = stream->perf->i915; struct intel_uncore *uncore = stream->uncore; - struct i915_oa_config *oa_config = stream->oa_config; bool periodic = stream->periodic; u32 period_exponent = stream->period_exponent; u32 sqcnt1; @@ -2918,15 +2897,6 @@ gen12_enable_metric_set(struct i915_perf_stream *stream, intel_uncore_rmw(uncore, GEN12_SQCNT1, 0, sqcnt1); - /* - * Update all contexts prior writing the mux configurations as we need - * to make sure all slices/subslices are ON before writing to NOA - * registers. - */ - ret = gen12_configure_all_contexts(stream, oa_config, active); - if (ret) - return ret; - /* * For Gen12, performance counters are context * saved/restored. 
Only enable it for the context that @@ -2980,9 +2950,6 @@ static void gen12_disable_metric_set(struct i915_perf_stream *stream) _MASKED_BIT_DISABLE(GEN12_DISABLE_DOP_GATING)); } - /* Reset all contexts' slices/subslices configurations. */ - gen12_configure_all_contexts(stream, NULL, NULL); - /* disable the context save/restore or OAR counters */ if (stream->ctx) gen12_configure_oar_context(stream, NULL); From 0ba521d6948ecb4acf1276494dfed127fe096ca6 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 25 Jul 2024 20:46:44 +0200 Subject: [PATCH 0157/1052] rust: macros: indent list item in `module!`'s docs Like commit e516211f615f ("rust: macros: indent list item in `paste!`'s docs"), but for `module!`. Reviewed-by: Trevor Gross Link: https://lore.kernel.org/r/20240725184644.135185-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index 159e75292970..5be0cb9db3ee 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -94,7 +94,7 @@ /// - `license`: ASCII string literal of the license of the kernel module (required). /// - `alias`: array of ASCII string literals of the alias names of the kernel module. /// - `firmware`: array of ASCII string literals of the firmware files of -/// the kernel module. +/// the kernel module. #[proc_macro] pub fn module(ts: TokenStream) -> TokenStream { module::module(ts) From 167b93258d1e2230ee3e8a97669b4db4cc9e90aa Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Jul 2024 11:03:59 +0200 Subject: [PATCH 0158/1052] mptcp: fix user-space PM announced address accounting Currently the per-connection announced address counter is never decreased. When the user-space PM is in use, this just affect the information exposed via diag/sockopt, but it could still foul the PM to wrong decision. Add the missing accounting for the user-space PM's sake. Fixes: 8b1c94da1e48 ("mptcp: only send RM_ADDR in nl_cmd_remove") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. 
Miller --- net/mptcp/pm_netlink.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index ea9e5817b9e9..b399f2b7a369 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1534,16 +1534,25 @@ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; + int anno_nr = 0; list_for_each_entry(entry, rm_list, list) { - if ((remove_anno_list_by_saddr(msk, &entry->addr) || - lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) && - alist.nr < MPTCP_RM_IDS_MAX) - alist.ids[alist.nr++] = entry->addr.id; + if (alist.nr >= MPTCP_RM_IDS_MAX) + break; + + /* only delete if either announced or matching a subflow */ + if (remove_anno_list_by_saddr(msk, &entry->addr)) + anno_nr++; + else if (!lookup_subflow_by_saddr(&msk->conn_list, + &entry->addr)) + continue; + + alist.ids[alist.nr++] = entry->addr.id; } if (alist.nr) { spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= anno_nr; mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } From 4b317e0eb287bd30a1b329513531157c25e8b692 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Jul 2024 11:04:00 +0200 Subject: [PATCH 0159/1052] mptcp: fix NL PM announced address accounting Currently the per connection announced address counter is never decreased. As a consequence, after connection establishment, if the NL PM deletes an endpoint and adds a new/different one, no additional subflow is created for the new endpoint even if the current limits allow that. Address the issue properly updating the signaled address counter every time the NL PM removes such addresses. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. 
Miller --- net/mptcp/pm_netlink.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index b399f2b7a369..f65831de5c1a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1401,6 +1401,7 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, ret = remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= ret; mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } @@ -1565,17 +1566,18 @@ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { - if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - slist.nr < MPTCP_RM_IDS_MAX) + if (slist.nr < MPTCP_RM_IDS_MAX && + lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) slist.ids[slist.nr++] = entry->addr.id; - if (remove_anno_list_by_saddr(msk, &entry->addr) && - alist.nr < MPTCP_RM_IDS_MAX) + if (alist.nr < MPTCP_RM_IDS_MAX && + remove_anno_list_by_saddr(msk, &entry->addr)) alist.ids[alist.nr++] = entry->addr.id; } if (alist.nr) { spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= alist.nr; mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } From b5e2fb832f48bc01d937a053e0550a1465a2f05d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Jul 2024 11:04:01 +0200 Subject: [PATCH 0160/1052] selftests: mptcp: add explicit test case for remove/readd Delete and re-create a signal endpoint and ensure that the PM actually deletes and re-create the subflow. Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- .../testing/selftests/net/mptcp/mptcp_join.sh | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 108aeeb84ef1..9c091fc267c4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3526,6 +3526,35 @@ endpoint_tests() chk_mptcp_info subflows 1 subflows 1 mptcp_lib_kill_wait $tests_pid fi + + # remove and re-add + if reset "delete re-add signal" && + mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + pm_nl_set_limits $ns1 1 1 + pm_nl_set_limits $ns2 1 1 + pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal + test_linkfail=4 speed=20 \ + run_tests $ns1 $ns2 10.0.1.1 & + local tests_pid=$! + + wait_mpj $ns2 + pm_nl_check_endpoint "creation" \ + $ns1 10.0.2.1 id 1 flags signal + chk_subflow_nr "before delete" 2 + chk_mptcp_info subflows 1 subflows 1 + + pm_nl_del_endpoint $ns1 1 10.0.2.1 + sleep 0.5 + chk_subflow_nr "after delete" 1 + chk_mptcp_info subflows 0 subflows 0 + + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + wait_mpj $ns2 + chk_subflow_nr "after re-add" 2 + chk_mptcp_info subflows 1 subflows 1 + mptcp_lib_kill_wait $tests_pid + fi + } # [$1: error message] From 4a2f48992ddf4b8c2fba846c6754089edae6db5a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Jul 2024 11:04:02 +0200 Subject: [PATCH 0161/1052] selftests: mptcp: fix error path pm_nl_check_endpoint() currently calls an not existing helper to mark the test as failed. Fix the wrong call. Fixes: 03668c65d153 ("selftests: mptcp: join: rework detailed report") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 9c091fc267c4..55d84a1bde15 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -661,7 +661,7 @@ pm_nl_check_endpoint() done if [ -z "${id}" ]; then - test_fail "bad test - missing endpoint id" + fail_test "bad test - missing endpoint id" return fi From 7c70bcc2a84cf925f655ea1ac4b8088062b144a3 Mon Sep 17 00:00:00 2001 From: Liu Jing Date: Sat, 27 Jul 2024 11:04:03 +0200 Subject: [PATCH 0162/1052] selftests: mptcp: always close input's FD if opened In main_loop_s function, when the open(cfg_input, O_RDONLY) function is run, the last fd is not closed if the "--cfg_repeat > 0" branch is not taken. Fixes: 05be5e273c84 ("selftests: mptcp: add disconnect tests") Cc: stable@vger.kernel.org Signed-off-by: Liu Jing Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index d2043ec3bf6d..4209b9569039 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1115,11 +1115,11 @@ int main_loop_s(int listensock) return 1; } - if (--cfg_repeat > 0) { - if (cfg_input) - close(fd); + if (cfg_input) + close(fd); + + if (--cfg_repeat > 0) goto again; - } return 0; } From 4ddd51ccff911a2e9e961307692532a325f6c78a Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Thu, 25 Jul 2024 16:54:53 +0800 Subject: [PATCH 0163/1052] ASoC: fsl_micfil: Expand the range of FIFO watermark mask On the i.MX9x platforms, the mask of FIFO watermark is 0x1F, on i.MX8x platforms, the mask of FIFO watermark is 0X7. So use the mask 0x1F for all platforms to make them compatible. 
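As a side note, the practical effect of the narrow field mask can be sketched in a few lines of user-space C (illustration only, not driver code; the watermark value 31 is simply the largest value an i.MX9x-style 5-bit field can hold): a 3-bit mask silently truncates any watermark above 7, while the widened 5-bit mask preserves the full range while still covering the smaller i.MX8x field.

        #include <stdio.h>

        int main(void)
        {
                unsigned int watermark = 31;    /* largest value an i.MX9x-style field holds */
                unsigned int mask3 = 0x07;      /* old GENMASK(2, 0) */
                unsigned int mask5 = 0x1f;      /* new GENMASK(4, 0) */

                printf("3-bit mask: asked for %u, field keeps %u\n", watermark, watermark & mask3);
                printf("5-bit mask: asked for %u, field keeps %u\n", watermark, watermark & mask5);
                return 0;
        }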
Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/1721897694-6088-2-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_micfil.c | 2 +- sound/soc/fsl/fsl_micfil.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/fsl/fsl_micfil.c b/sound/soc/fsl/fsl_micfil.c index 0d37edb70261..96a6b88d0d67 100644 --- a/sound/soc/fsl/fsl_micfil.c +++ b/sound/soc/fsl/fsl_micfil.c @@ -831,7 +831,7 @@ static const struct reg_default fsl_micfil_reg_defaults[] = { {REG_MICFIL_CTRL1, 0x00000000}, {REG_MICFIL_CTRL2, 0x00000000}, {REG_MICFIL_STAT, 0x00000000}, - {REG_MICFIL_FIFO_CTRL, 0x00000007}, + {REG_MICFIL_FIFO_CTRL, 0x0000001F}, {REG_MICFIL_FIFO_STAT, 0x00000000}, {REG_MICFIL_DATACH0, 0x00000000}, {REG_MICFIL_DATACH1, 0x00000000}, diff --git a/sound/soc/fsl/fsl_micfil.h b/sound/soc/fsl/fsl_micfil.h index c6b902ba0a53..b7798a7cbf2a 100644 --- a/sound/soc/fsl/fsl_micfil.h +++ b/sound/soc/fsl/fsl_micfil.h @@ -72,7 +72,7 @@ #define MICFIL_STAT_CHXF(ch) BIT(ch) /* MICFIL FIFO Control Register -- REG_MICFIL_FIFO_CTRL 0x10 */ -#define MICFIL_FIFO_CTRL_FIFOWMK GENMASK(2, 0) +#define MICFIL_FIFO_CTRL_FIFOWMK GENMASK(4, 0) /* MICFIL FIFO Status Register -- REG_MICFIL_FIFO_STAT 0x14 */ #define MICFIL_FIFO_STAT_FIFOX_OVER(ch) BIT(ch) From aa4f76ef09a993efa9b5fab6ddf5d6d324baaea3 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Thu, 25 Jul 2024 16:54:54 +0800 Subject: [PATCH 0164/1052] ASoC: fsl_micfil: Differentiate register access permission for platforms On i.MX9x platforms, the REG_MICFIL_FSYNC_CTRL, REG_MICFIL_VERID, REG_MICFIL_PARAM are added, but they are not existed on i.MX8x platforms. Use the existed micfil->soc->use_verid to distinguish the access permission for these platforms. Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/1721897694-6088-3-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_micfil.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sound/soc/fsl/fsl_micfil.c b/sound/soc/fsl/fsl_micfil.c index 96a6b88d0d67..22b240a70ad4 100644 --- a/sound/soc/fsl/fsl_micfil.c +++ b/sound/soc/fsl/fsl_micfil.c @@ -855,6 +855,8 @@ static const struct reg_default fsl_micfil_reg_defaults[] = { static bool fsl_micfil_readable_reg(struct device *dev, unsigned int reg) { + struct fsl_micfil *micfil = dev_get_drvdata(dev); + switch (reg) { case REG_MICFIL_CTRL1: case REG_MICFIL_CTRL2: @@ -872,9 +874,6 @@ static bool fsl_micfil_readable_reg(struct device *dev, unsigned int reg) case REG_MICFIL_DC_CTRL: case REG_MICFIL_OUT_CTRL: case REG_MICFIL_OUT_STAT: - case REG_MICFIL_FSYNC_CTRL: - case REG_MICFIL_VERID: - case REG_MICFIL_PARAM: case REG_MICFIL_VAD0_CTRL1: case REG_MICFIL_VAD0_CTRL2: case REG_MICFIL_VAD0_STAT: @@ -883,6 +882,12 @@ static bool fsl_micfil_readable_reg(struct device *dev, unsigned int reg) case REG_MICFIL_VAD0_NDATA: case REG_MICFIL_VAD0_ZCD: return true; + case REG_MICFIL_FSYNC_CTRL: + case REG_MICFIL_VERID: + case REG_MICFIL_PARAM: + if (micfil->soc->use_verid) + return true; + fallthrough; default: return false; } @@ -890,6 +895,8 @@ static bool fsl_micfil_readable_reg(struct device *dev, unsigned int reg) static bool fsl_micfil_writeable_reg(struct device *dev, unsigned int reg) { + struct fsl_micfil *micfil = dev_get_drvdata(dev); + switch (reg) { case REG_MICFIL_CTRL1: case REG_MICFIL_CTRL2: @@ -899,7 +906,6 @@ static bool fsl_micfil_writeable_reg(struct device *dev, unsigned int reg) case REG_MICFIL_DC_CTRL: case REG_MICFIL_OUT_CTRL: case 
REG_MICFIL_OUT_STAT: /* Write 1 to Clear */ - case REG_MICFIL_FSYNC_CTRL: case REG_MICFIL_VAD0_CTRL1: case REG_MICFIL_VAD0_CTRL2: case REG_MICFIL_VAD0_STAT: /* Write 1 to Clear */ @@ -907,6 +913,10 @@ static bool fsl_micfil_writeable_reg(struct device *dev, unsigned int reg) case REG_MICFIL_VAD0_NCONFIG: case REG_MICFIL_VAD0_ZCD: return true; + case REG_MICFIL_FSYNC_CTRL: + if (micfil->soc->use_verid) + return true; + fallthrough; default: return false; } From aebb1813c279ce8f3a2dfa3f86def0c0ec1cbb8d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 26 Jul 2024 16:10:41 +0200 Subject: [PATCH 0165/1052] ASoC: codecs: wcd937x-sdw: Correct Soundwire ports mask Device has up to WCD937X_MAX_TX_SWR_PORTS (or WCD937X_MAX_SWR_PORTS for sink) number of ports and the array assigned to prop.src_dpn_prop and prop.sink_dpn_prop has 0..WCD937X_MAX_TX_SWR_PORTS-1 elements. On the other hand, GENMASK(high, low) creates an inclusive mask between , so we need the mask from 0 up to WCD937X_MAX_TX_SWR_PORTS-1. Theoretically, too wide mask could cause an out of bounds read in sdw_get_slave_dpn_prop() in stream.c, however only in the case of buggy driver, e.g. adding incorrect number of ports via sdw_stream_add_slave(). Fixes: c99a515ff153 ("ASoC: codecs: wcd937x-sdw: add SoundWire driver") Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240726-asoc-wcd-wsa-swr-ports-genmask-v1-1-d4d7a8b56f05@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wcd937x-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wcd937x-sdw.c b/sound/soc/codecs/wcd937x-sdw.c index 3abc8041406a..0c33f7f3dc25 100644 --- a/sound/soc/codecs/wcd937x-sdw.c +++ b/sound/soc/codecs/wcd937x-sdw.c @@ -1049,7 +1049,7 @@ static int wcd9370_probe(struct sdw_slave *pdev, pdev->prop.lane_control_support = true; pdev->prop.simple_clk_stop_capable = true; if (wcd->is_tx) { - pdev->prop.source_ports = GENMASK(WCD937X_MAX_TX_SWR_PORTS, 0); + pdev->prop.source_ports = GENMASK(WCD937X_MAX_TX_SWR_PORTS - 1, 0); pdev->prop.src_dpn_prop = wcd937x_dpn_prop; wcd->ch_info = &wcd937x_sdw_tx_ch_info[0]; pdev->prop.wake_capable = true; @@ -1062,7 +1062,7 @@ static int wcd9370_probe(struct sdw_slave *pdev, /* Start in cache-only until device is enumerated */ regcache_cache_only(wcd->regmap, true); } else { - pdev->prop.sink_ports = GENMASK(WCD937X_MAX_SWR_PORTS, 0); + pdev->prop.sink_ports = GENMASK(WCD937X_MAX_SWR_PORTS - 1, 0); pdev->prop.sink_dpn_prop = wcd937x_dpn_prop; wcd->ch_info = &wcd937x_sdw_rx_ch_info[0]; } From 3f6fb03dae9c7dfba7670858d29e03c8faaa89fe Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 26 Jul 2024 16:10:42 +0200 Subject: [PATCH 0166/1052] ASoC: codecs: wcd938x-sdw: Correct Soundwire ports mask Device has up to WCD938X_MAX_SWR_PORTS number of ports and the array assigned to prop.src_dpn_prop and prop.sink_dpn_prop has 0..WCD938X_MAX_SWR_PORTS-1 elements. On the other hand, GENMASK(high, low) creates an inclusive mask between , so we need the mask from 0 up to WCD938X_MAX_SWR_PORTS-1. Theoretically, too wide mask could cause an out of bounds read in sdw_get_slave_dpn_prop() in stream.c, however only in the case of buggy driver, e.g. adding incorrect number of ports via sdw_stream_add_slave(). 
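The off-by-one is easy to see once you remember that GENMASK(high, low) is inclusive of both bounds. A minimal user-space sketch (illustration only, not part of the patch; the port count of 8 is made up and the macro is a simplified copy of the kernel definition):

        #include <stdio.h>

        #define GENMASK(h, l) \
                (((~0UL) << (l)) & (~0UL >> (8 * sizeof(unsigned long) - 1 - (h))))

        int main(void)
        {
                unsigned long max_ports = 8;    /* made-up port count */

                printf("GENMASK(%lu, 0)     = %#lx -> %lu bits, one too many\n",
                       max_ports, GENMASK(max_ports, 0), max_ports + 1);
                printf("GENMASK(%lu - 1, 0) = %#lx -> %lu bits, matches the dpn_prop array\n",
                       max_ports, GENMASK(max_ports - 1, 0), max_ports);
                return 0;
        }

GENMASK(8, 0) yields 0x1ff, i.e. nine set bits for an eight-port device, which is exactly the extra bit these fixes remove.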
Fixes: 16572522aece ("ASoC: codecs: wcd938x-sdw: add SoundWire driver") Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240726-asoc-wcd-wsa-swr-ports-genmask-v1-2-d4d7a8b56f05@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wcd938x-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wcd938x-sdw.c b/sound/soc/codecs/wcd938x-sdw.c index c995bcc59ead..7da8a10bd0a9 100644 --- a/sound/soc/codecs/wcd938x-sdw.c +++ b/sound/soc/codecs/wcd938x-sdw.c @@ -1252,12 +1252,12 @@ static int wcd9380_probe(struct sdw_slave *pdev, pdev->prop.lane_control_support = true; pdev->prop.simple_clk_stop_capable = true; if (wcd->is_tx) { - pdev->prop.source_ports = GENMASK(WCD938X_MAX_SWR_PORTS, 0); + pdev->prop.source_ports = GENMASK(WCD938X_MAX_SWR_PORTS - 1, 0); pdev->prop.src_dpn_prop = wcd938x_dpn_prop; wcd->ch_info = &wcd938x_sdw_tx_ch_info[0]; pdev->prop.wake_capable = true; } else { - pdev->prop.sink_ports = GENMASK(WCD938X_MAX_SWR_PORTS, 0); + pdev->prop.sink_ports = GENMASK(WCD938X_MAX_SWR_PORTS - 1, 0); pdev->prop.sink_dpn_prop = wcd938x_dpn_prop; wcd->ch_info = &wcd938x_sdw_rx_ch_info[0]; } From 74a79977c4e1d09eced33e6e22f875a5bb3fad29 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 26 Jul 2024 16:10:43 +0200 Subject: [PATCH 0167/1052] ASoC: codecs: wcd939x-sdw: Correct Soundwire ports mask Device has up to WCD939X_MAX_TX_SWR_PORTS (or WCD939X_MAX_RX_SWR_PORTS for sink) number of ports and the array assigned to prop.src_dpn_prop and prop.sink_dpn_prop has 0..WCD939X_MAX_TX_SWR_PORTS-1 elements. On the other hand, GENMASK(high, low) creates an inclusive mask between , so we need the mask from 0 up to WCD939X_MAX_TX_SWR_PORTS-1. Theoretically, too wide mask could cause an out of bounds read in sdw_get_slave_dpn_prop() in stream.c, however only in the case of buggy driver, e.g. adding incorrect number of ports via sdw_stream_add_slave(). Fixes: be2af391cea0 ("ASoC: codecs: Add WCD939x Soundwire devices driver") Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240726-asoc-wcd-wsa-swr-ports-genmask-v1-3-d4d7a8b56f05@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wcd939x-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wcd939x-sdw.c b/sound/soc/codecs/wcd939x-sdw.c index 94b1e99a3ca0..fca95777a75a 100644 --- a/sound/soc/codecs/wcd939x-sdw.c +++ b/sound/soc/codecs/wcd939x-sdw.c @@ -1453,12 +1453,12 @@ static int wcd9390_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) pdev->prop.lane_control_support = true; pdev->prop.simple_clk_stop_capable = true; if (wcd->is_tx) { - pdev->prop.source_ports = GENMASK(WCD939X_MAX_TX_SWR_PORTS, 0); + pdev->prop.source_ports = GENMASK(WCD939X_MAX_TX_SWR_PORTS - 1, 0); pdev->prop.src_dpn_prop = wcd939x_tx_dpn_prop; wcd->ch_info = &wcd939x_sdw_tx_ch_info[0]; pdev->prop.wake_capable = true; } else { - pdev->prop.sink_ports = GENMASK(WCD939X_MAX_RX_SWR_PORTS, 0); + pdev->prop.sink_ports = GENMASK(WCD939X_MAX_RX_SWR_PORTS - 1, 0); pdev->prop.sink_dpn_prop = wcd939x_rx_dpn_prop; wcd->ch_info = &wcd939x_sdw_rx_ch_info[0]; } From eb11c3bb64ad0a05aeacdb01039863aa2aa3614b Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 26 Jul 2024 16:10:44 +0200 Subject: [PATCH 0168/1052] ASoC: codecs: wsa881x: Correct Soundwire ports mask Device has up to WSA881X_MAX_SWR_PORTS number of ports and the array assigned to prop.sink_dpn_prop has 0..WSA881X_MAX_SWR_PORTS-1 elements. 
On the other hand, GENMASK(high, low) creates an inclusive mask between , so we need the mask from 0 up to WSA881X_MAX_SWR_PORTS-1. Theoretically, too wide mask could cause an out of bounds read in sdw_get_slave_dpn_prop() in stream.c, however only in the case of buggy driver, e.g. adding incorrect number of ports via sdw_stream_add_slave(). Fixes: a0aab9e1404a ("ASoC: codecs: add wsa881x amplifier support") Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240726-asoc-wcd-wsa-swr-ports-genmask-v1-4-d4d7a8b56f05@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wsa881x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/wsa881x.c b/sound/soc/codecs/wsa881x.c index 0478599d0f35..fb9e92f08d98 100644 --- a/sound/soc/codecs/wsa881x.c +++ b/sound/soc/codecs/wsa881x.c @@ -1152,7 +1152,7 @@ static int wsa881x_probe(struct sdw_slave *pdev, wsa881x->sconfig.frame_rate = 48000; wsa881x->sconfig.direction = SDW_DATA_DIR_RX; wsa881x->sconfig.type = SDW_STREAM_PDM; - pdev->prop.sink_ports = GENMASK(WSA881X_MAX_SWR_PORTS, 0); + pdev->prop.sink_ports = GENMASK(WSA881X_MAX_SWR_PORTS - 1, 0); pdev->prop.sink_dpn_prop = wsa_sink_dpn_prop; pdev->prop.scp_int1_mask = SDW_SCP_INT1_BUS_CLASH | SDW_SCP_INT1_PARITY; pdev->prop.clk_stop_mode1 = true; From 6801ac36f25690e14955f7f9eace1eaa29edbdd0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 26 Jul 2024 16:10:45 +0200 Subject: [PATCH 0169/1052] ASoC: codecs: wsa883x: Correct Soundwire ports mask Device has up to WSA883X_MAX_SWR_PORTS number of ports and the array assigned to prop.sink_dpn_prop has 0..WSA883X_MAX_SWR_PORTS-1 elements. On the other hand, GENMASK(high, low) creates an inclusive mask between , so we need the mask from 0 up to WSA883X_MAX_SWR_PORTS-1. Theoretically, too wide mask could cause an out of bounds read in sdw_get_slave_dpn_prop() in stream.c, however only in the case of buggy driver, e.g. adding incorrect number of ports via sdw_stream_add_slave(). Fixes: 43b8c7dc85a1 ("ASoC: codecs: add wsa883x amplifier support") Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240726-asoc-wcd-wsa-swr-ports-genmask-v1-5-d4d7a8b56f05@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wsa883x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/wsa883x.c b/sound/soc/codecs/wsa883x.c index d0ab4e2290b6..3e4fdaa3f44f 100644 --- a/sound/soc/codecs/wsa883x.c +++ b/sound/soc/codecs/wsa883x.c @@ -1406,7 +1406,7 @@ static int wsa883x_probe(struct sdw_slave *pdev, WSA883X_MAX_SWR_PORTS)) dev_dbg(dev, "Static Port mapping not specified\n"); - pdev->prop.sink_ports = GENMASK(WSA883X_MAX_SWR_PORTS, 0); + pdev->prop.sink_ports = GENMASK(WSA883X_MAX_SWR_PORTS - 1, 0); pdev->prop.simple_clk_stop_capable = true; pdev->prop.sink_dpn_prop = wsa_sink_dpn_prop; pdev->prop.scp_int1_mask = SDW_SCP_INT1_BUS_CLASH | SDW_SCP_INT1_PARITY; From dcb6631d05152930e2ea70fd2abfd811b0e970b5 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 26 Jul 2024 16:10:46 +0200 Subject: [PATCH 0170/1052] ASoC: codecs: wsa884x: Correct Soundwire ports mask Device has up to WSA884X_MAX_SWR_PORTS number of ports and the array assigned to prop.sink_dpn_prop has 0..WSA884X_MAX_SWR_PORTS-1 elements. On the other hand, GENMASK(high, low) creates an inclusive mask between , so we need the mask from 0 up to WSA884X_MAX_SWR_PORTS-1. 
Theoretically, too wide mask could cause an out of bounds read in sdw_get_slave_dpn_prop() in stream.c, however only in the case of buggy driver, e.g. adding incorrect number of ports via sdw_stream_add_slave(). Fixes: aa21a7d4f68a ("ASoC: codecs: wsa884x: Add WSA884x family of speakers") Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240726-asoc-wcd-wsa-swr-ports-genmask-v1-6-d4d7a8b56f05@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wsa884x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/wsa884x.c b/sound/soc/codecs/wsa884x.c index d17ae17b2938..89eb5e03a617 100644 --- a/sound/soc/codecs/wsa884x.c +++ b/sound/soc/codecs/wsa884x.c @@ -1895,7 +1895,7 @@ static int wsa884x_probe(struct sdw_slave *pdev, WSA884X_MAX_SWR_PORTS)) dev_dbg(dev, "Static Port mapping not specified\n"); - pdev->prop.sink_ports = GENMASK(WSA884X_MAX_SWR_PORTS, 0); + pdev->prop.sink_ports = GENMASK(WSA884X_MAX_SWR_PORTS - 1, 0); pdev->prop.simple_clk_stop_capable = true; pdev->prop.sink_dpn_prop = wsa884x_sink_dpn_prop; pdev->prop.scp_int1_mask = SDW_SCP_INT1_BUS_CLASH | SDW_SCP_INT1_PARITY; From 6b99068d5ea0aa295f15f30afc98db74d056ec7b Mon Sep 17 00:00:00 2001 From: Jerome Audu Date: Sat, 27 Jul 2024 15:40:15 +0200 Subject: [PATCH 0171/1052] ASoC: sti: add missing probe entry for player and reader This patch addresses a regression in the ASoC STI drivers that was introduced in Linux version 6.6.y. The issue originated from a series of patches (see https://lore.kernel.org/all/87wmy5b0wt.wl-kuninori.morimoto.gx@renesas.com/) that unintentionally omitted necessary probe functions for the player and reader components. Probe function in `sound/soc/sti/sti_uniperif.c:415` is being replaced by another probe function located at `sound/soc/sti/sti_uniperif.c:453`, which should instead be derived from the player and reader components. This patch correctly reinserts the missing probe entries, restoring the intended functionality. 
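Conceptually, the regression boils down to an ops table that no longer carries its probe callback, so the per-DAI setup is silently skipped. A stripped-down user-space sketch of that failure mode (illustration only; these structures and names are made up and are not the ASoC API):

        #include <stdio.h>

        struct dai_ops {                        /* made-up stand-in, not the ASoC struct */
                int (*probe)(const char *name);
                int (*trigger)(const char *name);
        };

        static int common_probe(const char *name)
        {
                printf("%s: per-DAI probe ran\n", name);
                return 0;
        }

        static int do_trigger(const char *name)
        {
                printf("%s: trigger\n", name);
                return 0;
        }

        static void register_dai(const char *name, const struct dai_ops *ops)
        {
                if (ops->probe)
                        ops->probe(name);       /* only runs if the entry was wired up */
                else
                        printf("%s: probe entry missing, setup skipped\n", name);
        }

        int main(void)
        {
                struct dai_ops player = { .probe = common_probe, .trigger = do_trigger };
                struct dai_ops reader = { .trigger = do_trigger };      /* forgot .probe */

                register_dai("uni_player", &player);
                register_dai("uni_reader", &reader);
                return 0;
        }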
Fixes: 9f625f5e6cf9 ("ASoC: sti: merge DAI call back functions into ops") Signed-off-by: Jerome Audu Link: https://patch.msgid.link/20240727-sti-audio-fix-v2-1-208bde546c3f@free.fr Signed-off-by: Mark Brown --- sound/soc/sti/sti_uniperif.c | 2 +- sound/soc/sti/uniperif.h | 1 + sound/soc/sti/uniperif_player.c | 1 + sound/soc/sti/uniperif_reader.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/soc/sti/sti_uniperif.c b/sound/soc/sti/sti_uniperif.c index ba824f14a39c..a7956e5a4ee5 100644 --- a/sound/soc/sti/sti_uniperif.c +++ b/sound/soc/sti/sti_uniperif.c @@ -352,7 +352,7 @@ static int sti_uniperiph_resume(struct snd_soc_component *component) return ret; } -static int sti_uniperiph_dai_probe(struct snd_soc_dai *dai) +int sti_uniperiph_dai_probe(struct snd_soc_dai *dai) { struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai); struct sti_uniperiph_dai *dai_data = &priv->dai_data; diff --git a/sound/soc/sti/uniperif.h b/sound/soc/sti/uniperif.h index 2a5de328501c..74e51f0ff85c 100644 --- a/sound/soc/sti/uniperif.h +++ b/sound/soc/sti/uniperif.h @@ -1380,6 +1380,7 @@ int uni_reader_init(struct platform_device *pdev, struct uniperif *reader); /* common */ +int sti_uniperiph_dai_probe(struct snd_soc_dai *dai); int sti_uniperiph_dai_set_fmt(struct snd_soc_dai *dai, unsigned int fmt); diff --git a/sound/soc/sti/uniperif_player.c b/sound/soc/sti/uniperif_player.c index dd9013c47664..6d1ce030963c 100644 --- a/sound/soc/sti/uniperif_player.c +++ b/sound/soc/sti/uniperif_player.c @@ -1038,6 +1038,7 @@ static const struct snd_soc_dai_ops uni_player_dai_ops = { .startup = uni_player_startup, .shutdown = uni_player_shutdown, .prepare = uni_player_prepare, + .probe = sti_uniperiph_dai_probe, .trigger = uni_player_trigger, .hw_params = sti_uniperiph_dai_hw_params, .set_fmt = sti_uniperiph_dai_set_fmt, diff --git a/sound/soc/sti/uniperif_reader.c b/sound/soc/sti/uniperif_reader.c index 065c5f0d1f5f..05ea2b794eb9 100644 --- a/sound/soc/sti/uniperif_reader.c +++ b/sound/soc/sti/uniperif_reader.c @@ -401,6 +401,7 @@ static const struct snd_soc_dai_ops uni_reader_dai_ops = { .startup = uni_reader_startup, .shutdown = uni_reader_shutdown, .prepare = uni_reader_prepare, + .probe = sti_uniperiph_dai_probe, .trigger = uni_reader_trigger, .hw_params = sti_uniperiph_dai_hw_params, .set_fmt = sti_uniperiph_dai_set_fmt, From c118478665f467e57d06b2354de65974b246b82b Mon Sep 17 00:00:00 2001 From: Bruno Ancona Date: Sun, 28 Jul 2024 22:50:32 -0600 Subject: [PATCH 0172/1052] ASoC: amd: yc: Support mic on HP 14-em0002la Add support for the internal microphone for HP 14-em0002la laptop using a quirk entry. 
Signed-off-by: Bruno Ancona Link: https://patch.msgid.link/20240729045032.223230-1-brunoanconasala@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index 1769e07e83dc..f4bbfffe9fcb 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -423,6 +423,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_BOARD_NAME, "8A3E"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "HP"), + DMI_MATCH(DMI_BOARD_NAME, "8B27"), + } + }, { .driver_data = &acp6x_card, .matches = { From 45d763fe503e6e0f180f873b750aea307e73fdcf Mon Sep 17 00:00:00 2001 From: Paul Handrigan Date: Fri, 26 Jul 2024 10:11:11 -0500 Subject: [PATCH 0173/1052] ASoC: cs530x: Change IN HPF Select kcontrol name Change to the IN HPF Select kcontrol to the correct name IN DEC Filter Select. Signed-off-by: Paul Handrigan Link: https://patch.msgid.link/20240726151111.3247774-1-paulha@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs530x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/cs530x.c b/sound/soc/codecs/cs530x.c index 25a86a32e936..da52afe56c3c 100644 --- a/sound/soc/codecs/cs530x.c +++ b/sound/soc/codecs/cs530x.c @@ -129,16 +129,16 @@ static int cs530x_put_volsw_vu(struct snd_kcontrol *kcontrol, static const DECLARE_TLV_DB_SCALE(in_vol_tlv, -1270, 50, 0); -static const char * const cs530x_in_hpf_text[] = { +static const char * const cs530x_in_filter_text[] = { "Min Phase Slow Roll-off", "Min Phase Fast Roll-off", "Linear Phase Slow Roll-off", "Linear Phase Fast Roll-off", }; -static SOC_ENUM_SINGLE_DECL(cs530x_in_hpf_enum, CS530X_IN_FILTER, +static SOC_ENUM_SINGLE_DECL(cs530x_in_filter_enum, CS530X_IN_FILTER, CS530X_IN_FILTER_SHIFT, - cs530x_in_hpf_text); + cs530x_in_filter_text); static const char * const cs530x_in_4ch_sum_text[] = { "None", @@ -189,7 +189,7 @@ SOC_SINGLE_EXT_TLV("IN1 Volume", CS530X_IN_VOL_CTRL1_0, 0, 255, 1, SOC_SINGLE_EXT_TLV("IN2 Volume", CS530X_IN_VOL_CTRL1_1, 0, 255, 1, snd_soc_get_volsw, cs530x_put_volsw_vu, in_vol_tlv), -SOC_ENUM("IN HPF Select", cs530x_in_hpf_enum), +SOC_ENUM("IN DEC Filter Select", cs530x_in_filter_enum), SOC_ENUM("Input Ramp Up", cs530x_ramp_inc_enum), SOC_ENUM("Input Ramp Down", cs530x_ramp_dec_enum), From 0f7ced7d620ecc7a986843d6aeec41cce3116f41 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 29 Jul 2024 11:55:04 +0100 Subject: [PATCH 0174/1052] x86/aperfmperf: Fix deadlock on cpu_hotplug_lock The broken patch results in a call to init_freq_invariance_cppc() in a CPU hotplug handler in both the path for initially present CPUs and those hotplugged later. That function includes a one time call to amd_set_max_freq_ratio() which in turn calls freq_invariance_enable() that has a static_branch_enable() which takes the cpu_hotlug_lock which is already held. Avoid the deadlock by using static_branch_enable_cpuslocked() as the lock will always be already held. The equivalent path on Intel does not already hold this lock, so take it around the call to freq_invariance_enable(), which results in it being held over the call to register_syscall_ops, which looks to be safe to do. 
Fixes: c1385c1f0ba3 ("ACPI: processor: Simplify initial onlining to use same path for cold and hotplug") Closes: https://lore.kernel.org/all/CABXGCsPvqBfL5hQDOARwfqasLRJ_eNPBbCngZ257HOe=xbWDkA@mail.gmail.com/ Reported-by: Mikhail Gavrilov Suggested-by: Thomas Gleixner Signed-off-by: Jonathan Cameron Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Tested-by: Mikhail Gavrilov Tested-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240729105504.2170-1-Jonathan.Cameron@huawei.com --- arch/x86/kernel/cpu/aperfmperf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index b3fa61d45352..0b69bfbf345d 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -306,7 +306,7 @@ static void freq_invariance_enable(void) WARN_ON_ONCE(1); return; } - static_branch_enable(&arch_scale_freq_key); + static_branch_enable_cpuslocked(&arch_scale_freq_key); register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } @@ -323,8 +323,10 @@ static void __init bp_init_freq_invariance(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; - if (intel_set_max_freq_ratio()) + if (intel_set_max_freq_ratio()) { + guard(cpus_read_lock)(); freq_invariance_enable(); + } } static void disable_freq_invariance_workfn(struct work_struct *work) From f872d4af79fe8c71ae291ce8875b477e1669a6c7 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Mon, 29 Jul 2024 16:18:50 +0300 Subject: [PATCH 0175/1052] irqchip/meson-gpio: Convert meson_gpio_irq_controller::lock to 'raw_spinlock_t' This lock is acquired under irq_desc::lock with interrupts disabled. 
When PREEMPT_RT is enabled, 'spinlock_t' becomes preemptible, which results in invalid lock acquire context; [ BUG: Invalid wait context ] swapper/0/1 is trying to lock: ffff0000008fed30 (&ctl->lock){....}-{3:3}, at: meson_gpio_irq_update_bits0 other info that might help us debug this: context-{5:5} 3 locks held by swapper/0/1: #0: ffff0000003cd0f8 (&dev->mutex){....}-{4:4}, at: __driver_attach+0x90c #1: ffff000004714650 (&desc->request_mutex){+.+.}-{4:4}, at: __setup_irq0 #2: ffff0000047144c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __setup_irq0 stack backtrace: CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.9.9-sdkernel #1 Call trace: _raw_spin_lock_irqsave+0x60/0x88 meson_gpio_irq_update_bits+0x34/0x70 meson8_gpio_irq_set_type+0x78/0xc4 meson_gpio_irq_set_type+0x30/0x60 __irq_set_trigger+0x60/0x180 __setup_irq+0x30c/0x6e0 request_threaded_irq+0xec/0x1a4 Fixes: 215f4cc0fb20 ("irqchip/meson: Add support for gpio interrupt controller") Signed-off-by: Arseniy Krasnov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240729131850.3015508-1-avkrasnov@salutedevices.com --- drivers/irqchip/irq-meson-gpio.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c index 27e30ce41db3..cd789fa51519 100644 --- a/drivers/irqchip/irq-meson-gpio.c +++ b/drivers/irqchip/irq-meson-gpio.c @@ -178,7 +178,7 @@ struct meson_gpio_irq_controller { void __iomem *base; u32 channel_irqs[MAX_NUM_CHANNEL]; DECLARE_BITMAP(channel_map, MAX_NUM_CHANNEL); - spinlock_t lock; + raw_spinlock_t lock; }; static void meson_gpio_irq_update_bits(struct meson_gpio_irq_controller *ctl, @@ -187,14 +187,14 @@ static void meson_gpio_irq_update_bits(struct meson_gpio_irq_controller *ctl, unsigned long flags; u32 tmp; - spin_lock_irqsave(&ctl->lock, flags); + raw_spin_lock_irqsave(&ctl->lock, flags); tmp = readl_relaxed(ctl->base + reg); tmp &= ~mask; tmp |= val; writel_relaxed(tmp, ctl->base + reg); - spin_unlock_irqrestore(&ctl->lock, flags); + raw_spin_unlock_irqrestore(&ctl->lock, flags); } static void meson_gpio_irq_init_dummy(struct meson_gpio_irq_controller *ctl) @@ -244,12 +244,12 @@ meson_gpio_irq_request_channel(struct meson_gpio_irq_controller *ctl, unsigned long flags; unsigned int idx; - spin_lock_irqsave(&ctl->lock, flags); + raw_spin_lock_irqsave(&ctl->lock, flags); /* Find a free channel */ idx = find_first_zero_bit(ctl->channel_map, ctl->params->nr_channels); if (idx >= ctl->params->nr_channels) { - spin_unlock_irqrestore(&ctl->lock, flags); + raw_spin_unlock_irqrestore(&ctl->lock, flags); pr_err("No channel available\n"); return -ENOSPC; } @@ -257,7 +257,7 @@ meson_gpio_irq_request_channel(struct meson_gpio_irq_controller *ctl, /* Mark the channel as used */ set_bit(idx, ctl->channel_map); - spin_unlock_irqrestore(&ctl->lock, flags); + raw_spin_unlock_irqrestore(&ctl->lock, flags); /* * Setup the mux of the channel to route the signal of the pad @@ -567,7 +567,7 @@ static int meson_gpio_irq_of_init(struct device_node *node, struct device_node * if (!ctl) return -ENOMEM; - spin_lock_init(&ctl->lock); + raw_spin_lock_init(&ctl->lock); ctl->base = of_iomap(node, 0); if (!ctl->base) { From 1fd2c10acb7b35d72101a4619ee5b2cddb9efd3a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sat, 27 Jul 2024 12:11:02 +0200 Subject: [PATCH 0176/1052] parisc: fix unaligned accesses in BPF There were spurious unaligned access warnings when calling BPF code. 
Sometimes, the warnings were triggered with any incoming packet, making the machine hard to use. The reason for the warnings is this: on parisc64, pointers to functions are not really pointers to functions, they are pointers to 16-byte descriptor. The first 8 bytes of the descriptor is a pointer to the function and the next 8 bytes of the descriptor is the content of the "dp" register. This descriptor is generated in the function bpf_jit_build_prologue. The problem is that the function bpf_int_jit_compile advertises 4-byte alignment when calling bpf_jit_binary_alloc, bpf_jit_binary_alloc randomizes the returned array and if the array happens to be not aligned on 8-byte boundary, the descriptor generated in bpf_jit_build_prologue is also not aligned and this triggers the unaligned access warning. Fix this by advertising 8-byte alignment on parisc64 when calling bpf_jit_binary_alloc. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Helge Deller --- arch/parisc/net/bpf_jit_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/net/bpf_jit_core.c b/arch/parisc/net/bpf_jit_core.c index 979f45d4d1fb..06cbcd6fe87b 100644 --- a/arch/parisc/net/bpf_jit_core.c +++ b/arch/parisc/net/bpf_jit_core.c @@ -114,7 +114,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) jit_data->header = bpf_jit_binary_alloc(prog_size + extable_size, &jit_data->image, - sizeof(u32), + sizeof(long), bpf_fill_ill_insns); if (!jit_data->header) { prog = orig_prog; From 7ae04ba36b381bffe2471eff3a93edced843240f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sat, 27 Jul 2024 20:22:52 +0200 Subject: [PATCH 0177/1052] parisc: fix a possible DMA corruption ARCH_DMA_MINALIGN was defined as 16 - this is too small - it may be possible that two unrelated 16-byte allocations share a cache line. If one of these allocations is written using DMA and the other is written using cached write, the value that was written with DMA may be corrupted. This commit changes ARCH_DMA_MINALIGN to be 128 on PA20 and 32 on PA1.1 - that's the largest possible cache line size. As different parisc microarchitectures have different cache line size, we define arch_slab_minalign(), cache_line_size() and dma_get_cache_alignment() so that the kernel may tune slab cache parameters dynamically, based on the detected cache line size. 
Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 1 + arch/parisc/include/asm/cache.h | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 5d650e02cbf4..b0a2ac3ba916 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -20,6 +20,7 @@ config PARISC select ARCH_SUPPORTS_HUGETLBFS if PA20 select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_STACKWALK + select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select HAVE_RELIABLE_STACKTRACE select DMA_OPS diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index 2a60d7a72f1f..a3f0f100f219 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -20,7 +20,16 @@ #define SMP_CACHE_BYTES L1_CACHE_BYTES -#define ARCH_DMA_MINALIGN L1_CACHE_BYTES +#ifdef CONFIG_PA20 +#define ARCH_DMA_MINALIGN 128 +#else +#define ARCH_DMA_MINALIGN 32 +#endif +#define ARCH_KMALLOC_MINALIGN 16 /* ldcw requires 16-byte alignment */ + +#define arch_slab_minalign() ((unsigned)dcache_stride) +#define cache_line_size() dcache_stride +#define dma_get_cache_alignment cache_line_size #define __read_mostly __section(".data..read_mostly") From 7ec5bd247a0d6fb23ab7da2bedd9c3f1f9333c3b Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Mon, 29 Jul 2024 12:01:33 +0530 Subject: [PATCH 0178/1052] nvme: remove unused parameter First parameter of nvme_init_integrity() is unused. Remove it, and modify the callers. Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 053d5b4909cd..e8afb5a0f3a3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1757,7 +1757,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head, +static bool nvme_init_integrity(struct nvme_ns_head *head, struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; @@ -2176,7 +2176,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, * I/O to namespaces with metadata except when the namespace supports * PI, as it can strip/insert in that case. */ - if (!nvme_init_integrity(ns->disk, ns->head, &lim)) + if (!nvme_init_integrity(ns->head, &lim)) capacity = 0; ret = queue_limits_commit_update(ns->disk->queue, &lim); @@ -2280,7 +2280,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) if (unsupported) ns->head->disk->flags |= GENHD_FL_HIDDEN; else - nvme_init_integrity(ns->head->disk, ns->head, &lim); + nvme_init_integrity(ns->head, &lim); ret = queue_limits_commit_update(ns->head->disk->queue, &lim); set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); From ec145a18687fec8dd97eeb4f30057fa4debef577 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Fri, 26 Jul 2024 20:17:09 +0200 Subject: [PATCH 0179/1052] ice: respect netif readiness in AF_XDP ZC related ndo's Address a scenario in which XSK ZC Tx produces descriptors to XDP Tx ring when link is either not yet fully initialized or process of stopping the netdev has already started. To avoid this, add checks against carrier readiness in ice_xsk_wakeup() and in ice_xmit_zc(). 
One could argue that bailing out early in ice_xsk_wakeup() would be sufficient but given the fact that we produce Tx descriptors on behalf of NAPI that is triggered for Rx traffic, the latter is also needed. Bringing link up is an asynchronous event executed within ice_service_task so even though interface has been brought up there is still a time frame where link is not yet ok. Without this patch, when AF_XDP ZC Tx is used simultaneously with stack Tx, Tx timeouts occur after going through link flap (admin brings interface down then up again). HW seem to be unable to transmit descriptor to the wire after HW tail register bump which in turn causes bit __QUEUE_STATE_STACK_XOFF to be set forever as netdev_tx_completed_queue() sees no cleaned bytes on the input. Fixes: 126cdfe1007a ("ice: xsk: Improve AF_XDP ZC Tx and use batching API") Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Michal Kubiak Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index a65955eb23c0..72738b8b8a68 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -1048,6 +1048,10 @@ bool ice_xmit_zc(struct ice_tx_ring *xdp_ring) ice_clean_xdp_irq_zc(xdp_ring); + if (!netif_carrier_ok(xdp_ring->vsi->netdev) || + !netif_running(xdp_ring->vsi->netdev)) + return true; + budget = ICE_DESC_UNUSED(xdp_ring); budget = min_t(u16, budget, ICE_RING_QUARTER(xdp_ring)); @@ -1091,7 +1095,7 @@ ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, struct ice_vsi *vsi = np->vsi; struct ice_tx_ring *ring; - if (test_bit(ICE_VSI_DOWN, vsi->state)) + if (test_bit(ICE_VSI_DOWN, vsi->state) || !netif_carrier_ok(netdev)) return -ENETDOWN; if (!ice_is_xdp_ena_vsi(vsi)) From 1ff72a2f67791cd4ddad19ed830445f57b30e992 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:10 +0200 Subject: [PATCH 0180/1052] ice: don't busy wait for Rx queue disable in ice_qp_dis() When ice driver is spammed with multiple xdpsock instances and flow control is enabled, there are cases when Rx queue gets stuck and unable to reflect the disable state in QRX_CTRL register. Similar issue has previously been addressed in commit 13a6233b033f ("ice: Add support to enable/disable all Rx queues before waiting"). To workaround this, let us simply not wait for a disabled state as later patch will make sure that regardless of the encountered error in the process of disabling a queue pair, the Rx queue will be enabled. 
Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 72738b8b8a68..3104a5657b83 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -199,10 +199,8 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) if (err) return err; } - err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, true); - if (err) - return err; + ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, false); ice_qp_clean_rings(vsi, q_idx); ice_qp_reset_stats(vsi, q_idx); From 405d9999aa0b4ae467ef391d1d9c7e0d30ad0841 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:11 +0200 Subject: [PATCH 0181/1052] ice: replace synchronize_rcu with synchronize_net Given that ice_qp_dis() is called under rtnl_lock, synchronize_net() can be called instead of synchronize_rcu() so that XDP rings can finish its job in a faster way. Also let us do this as earlier in XSK queue disable flow. Additionally, turn off regular Tx queue before disabling irqs and NAPI. Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 3104a5657b83..ba50af9a5929 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -52,10 +52,8 @@ static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx) static void ice_qp_clean_rings(struct ice_vsi *vsi, u16 q_idx) { ice_clean_tx_ring(vsi->tx_rings[q_idx]); - if (ice_is_xdp_ena_vsi(vsi)) { - synchronize_rcu(); + if (ice_is_xdp_ena_vsi(vsi)) ice_clean_tx_ring(vsi->xdp_rings[q_idx]); - } ice_clean_rx_ring(vsi->rx_rings[q_idx]); } @@ -180,11 +178,12 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) usleep_range(1000, 2000); } + synchronize_net(); + netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + ice_qvec_dis_irq(vsi, rx_ring, q_vector); ice_qvec_toggle_napi(vsi, q_vector, false); - netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); - ice_fill_txq_meta(vsi, tx_ring, &txq_meta); err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); if (err) From d5922717994911e8f0eab736f3ba0d968c158823 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:12 +0200 Subject: [PATCH 0182/1052] ice: modify error handling when setting XSK pool in ndo_bpf Don't bail out right when spotting an error within ice_qp_{dis,ena}() but rather track error and go through whole flow of disabling and enabling queue pair. 
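The pattern is simply "remember the first failure, but run every teardown/bring-up step anyway". A small user-space sketch of it (illustration only; the step names and the -5/-EIO-style value are made up):

        #include <stdio.h>

        static void step(int rc, const char *name, int *fail)
        {
                printf("%-16s -> %d\n", name, rc);
                if (rc && !*fail)
                        *fail = rc;             /* keep only the first error */
        }

        int main(void)
        {
                int fail = 0;

                step(0,  "stop tx ring",    &fail);
                step(-5, "stop xdp ring",   &fail);     /* made-up failure */
                step(0,  "disable rx ring", &fail);     /* still executed */

                printf("error reported to caller: %d\n", fail);
                return fail ? 1 : 0;
        }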
Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 30 +++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index ba50af9a5929..902096b000f5 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -162,6 +162,7 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) struct ice_tx_ring *tx_ring; struct ice_rx_ring *rx_ring; int timeout = 50; + int fail = 0; int err; if (q_idx >= vsi->num_rxq || q_idx >= vsi->num_txq) @@ -186,8 +187,8 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) ice_fill_txq_meta(vsi, tx_ring, &txq_meta); err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); - if (err) - return err; + if (!fail) + fail = err; if (ice_is_xdp_ena_vsi(vsi)) { struct ice_tx_ring *xdp_ring = vsi->xdp_rings[q_idx]; @@ -195,15 +196,15 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) ice_fill_txq_meta(vsi, xdp_ring, &txq_meta); err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, xdp_ring, &txq_meta); - if (err) - return err; + if (!fail) + fail = err; } ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, false); ice_qp_clean_rings(vsi, q_idx); ice_qp_reset_stats(vsi, q_idx); - return 0; + return fail; } /** @@ -216,32 +217,33 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) { struct ice_q_vector *q_vector; + int fail = 0; int err; err = ice_vsi_cfg_single_txq(vsi, vsi->tx_rings, q_idx); - if (err) - return err; + if (!fail) + fail = err; if (ice_is_xdp_ena_vsi(vsi)) { struct ice_tx_ring *xdp_ring = vsi->xdp_rings[q_idx]; err = ice_vsi_cfg_single_txq(vsi, vsi->xdp_rings, q_idx); - if (err) - return err; + if (!fail) + fail = err; ice_set_ring_xdp(xdp_ring); ice_tx_xsk_pool(vsi, q_idx); } err = ice_vsi_cfg_single_rxq(vsi, q_idx); - if (err) - return err; + if (!fail) + fail = err; q_vector = vsi->rx_rings[q_idx]->q_vector; ice_qvec_cfg_msix(vsi, q_vector); err = ice_vsi_ctrl_one_rx_ring(vsi, true, q_idx, true); - if (err) - return err; + if (!fail) + fail = err; ice_qvec_toggle_napi(vsi, q_vector, true); ice_qvec_ena_irq(vsi, q_vector); @@ -249,7 +251,7 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); clear_bit(ICE_CFG_BUSY, vsi->state); - return 0; + return fail; } /** From 9da75a511c5558fa3da56759984fd1fa859186f0 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:13 +0200 Subject: [PATCH 0183/1052] ice: toggle netif_carrier when setting up XSK pool This so we prevent Tx timeout issues. One of conditions checked on running in the background dev_watchdog() is netif_carrier_ok(), so let us turn it off when we disable the queues that belong to a q_vector where XSK pool is being configured. Turn carrier on in ice_qp_ena() only when ice_get_link_status() tells us that physical link is up. 
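A minimal sketch of the intended ordering, assuming made-up helpers (carrier_set(), phys_link_up() and the two queue_pair_* functions) in place of the netdev and link-status APIs:

   #include <stdbool.h>
   #include <stdio.h>

   static bool carrier_on;

   static void carrier_set(bool on) { carrier_on = on; }
   static bool phys_link_up(void)   { return true; }	/* pretend the PHY reports link up */

   static void queue_pair_disable(void)
   {
   	carrier_set(false);	/* dev_watchdog() must not see a live carrier meanwhile */
   	/* ... stop the Tx queue, disable IRQs and NAPI, stop the rings ... */
   }

   static void queue_pair_enable(void)
   {
   	/* ... configure rings, MSI-X, NAPI, IRQs ... */
   	if (phys_link_up())	/* only report carrier when the physical link is up */
   		carrier_set(true);
   }

   int main(void)
   {
   	queue_pair_disable();
   	queue_pair_enable();
   	printf("carrier is %s\n", carrier_on ? "on" : "off");
   	return 0;
   }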
Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 902096b000f5..3fbe4cfadfbf 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -180,6 +180,7 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) } synchronize_net(); + netif_carrier_off(vsi->netdev); netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); ice_qvec_dis_irq(vsi, rx_ring, q_vector); @@ -218,6 +219,7 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) { struct ice_q_vector *q_vector; int fail = 0; + bool link_up; int err; err = ice_vsi_cfg_single_txq(vsi, vsi->tx_rings, q_idx); @@ -248,7 +250,11 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) ice_qvec_toggle_napi(vsi, q_vector, true); ice_qvec_ena_irq(vsi, q_vector); - netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + ice_get_link_status(vsi->port_info, &link_up); + if (link_up) { + netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + netif_carrier_on(vsi->netdev); + } clear_bit(ICE_CFG_BUSY, vsi->state); return fail; From ebc33a3f8d0aeddf19fd5827add24b82ae171829 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:14 +0200 Subject: [PATCH 0184/1052] ice: improve updating ice_{t,r}x_ring::xsk_pool xsk_buff_pool pointers that ice ring structs hold are updated via ndo_bpf that is executed in process context while it can be read by remote CPU at the same time within NAPI poll. Use synchronize_net() after pointer update and {READ,WRITE}_ONCE() when working with mentioned pointer. Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 11 ++- drivers/net/ethernet/intel/ice/ice_base.c | 4 +- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- drivers/net/ethernet/intel/ice/ice_txrx.c | 8 +- drivers/net/ethernet/intel/ice/ice_xsk.c | 103 ++++++++++++++-------- drivers/net/ethernet/intel/ice/ice_xsk.h | 14 ++- 6 files changed, 87 insertions(+), 55 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 99a75a59078e..caaa10157909 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -765,18 +765,17 @@ static inline struct xsk_buff_pool *ice_get_xp_from_qid(struct ice_vsi *vsi, } /** - * ice_xsk_pool - get XSK buffer pool bound to a ring + * ice_rx_xsk_pool - assign XSK buff pool to Rx ring * @ring: Rx ring to use * - * Returns a pointer to xsk_buff_pool structure if there is a buffer pool - * present, NULL otherwise. + * Sets XSK buff pool pointer on Rx ring. 
*/ -static inline struct xsk_buff_pool *ice_xsk_pool(struct ice_rx_ring *ring) +static inline void ice_rx_xsk_pool(struct ice_rx_ring *ring) { struct ice_vsi *vsi = ring->vsi; u16 qid = ring->q_index; - return ice_get_xp_from_qid(vsi, qid); + WRITE_ONCE(ring->xsk_pool, ice_get_xp_from_qid(vsi, qid)); } /** @@ -801,7 +800,7 @@ static inline void ice_tx_xsk_pool(struct ice_vsi *vsi, u16 qid) if (!ring) return; - ring->xsk_pool = ice_get_xp_from_qid(vsi, qid); + WRITE_ONCE(ring->xsk_pool, ice_get_xp_from_qid(vsi, qid)); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 5d396c1a7731..1facf179a96f 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -536,7 +536,7 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) return err; } - ring->xsk_pool = ice_xsk_pool(ring); + ice_rx_xsk_pool(ring); if (ring->xsk_pool) { xdp_rxq_info_unreg(&ring->xdp_rxq); @@ -597,7 +597,7 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) return 0; } - ok = ice_alloc_rx_bufs_zc(ring, num_bufs); + ok = ice_alloc_rx_bufs_zc(ring, ring->xsk_pool, num_bufs); if (!ok) { u16 pf_q = ring->vsi->rxq_map[ring->q_index]; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index ec636be4d17d..3de020020bc4 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2948,7 +2948,7 @@ static void ice_vsi_rx_napi_schedule(struct ice_vsi *vsi) ice_for_each_rxq(vsi, i) { struct ice_rx_ring *rx_ring = vsi->rx_rings[i]; - if (rx_ring->xsk_pool) + if (READ_ONCE(rx_ring->xsk_pool)) napi_schedule(&rx_ring->q_vector->napi); } } diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 8bb743f78fcb..0f91e9167427 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1521,10 +1521,11 @@ int ice_napi_poll(struct napi_struct *napi, int budget) * budget and be more aggressive about cleaning up the Tx descriptors. */ ice_for_each_tx_ring(tx_ring, q_vector->tx) { + struct xsk_buff_pool *xsk_pool = READ_ONCE(tx_ring->xsk_pool); bool wd; - if (tx_ring->xsk_pool) - wd = ice_xmit_zc(tx_ring); + if (xsk_pool) + wd = ice_xmit_zc(tx_ring, xsk_pool); else if (ice_ring_is_xdp(tx_ring)) wd = true; else @@ -1550,6 +1551,7 @@ int ice_napi_poll(struct napi_struct *napi, int budget) budget_per_ring = budget; ice_for_each_rx_ring(rx_ring, q_vector->rx) { + struct xsk_buff_pool *xsk_pool = READ_ONCE(rx_ring->xsk_pool); int cleaned; /* A dedicated path for zero-copy allows making a single @@ -1557,7 +1559,7 @@ int ice_napi_poll(struct napi_struct *napi, int budget) * ice_clean_rx_irq function and makes the codebase cleaner. */ cleaned = rx_ring->xsk_pool ? 
- ice_clean_rx_irq_zc(rx_ring, budget_per_ring) : + ice_clean_rx_irq_zc(rx_ring, xsk_pool, budget_per_ring) : ice_clean_rx_irq(rx_ring, budget_per_ring); work_done += cleaned; /* if we clean as many as budgeted, we must not be done */ diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 3fbe4cfadfbf..ee084ad80a61 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -250,6 +250,8 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) ice_qvec_toggle_napi(vsi, q_vector, true); ice_qvec_ena_irq(vsi, q_vector); + /* make sure NAPI sees updated ice_{t,x}_ring::xsk_pool */ + synchronize_net(); ice_get_link_status(vsi->port_info, &link_up); if (link_up) { netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); @@ -464,6 +466,7 @@ static u16 ice_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp, /** * __ice_alloc_rx_bufs_zc - allocate a number of Rx buffers * @rx_ring: Rx ring + * @xsk_pool: XSK buffer pool to pick buffers to be filled by HW * @count: The number of buffers to allocate * * Place the @count of descriptors onto Rx ring. Handle the ring wrap @@ -472,7 +475,8 @@ static u16 ice_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp, * * Returns true if all allocations were successful, false if any fail. */ -static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) +static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, u16 count) { u32 nb_buffs_extra = 0, nb_buffs = 0; union ice_32b_rx_flex_desc *rx_desc; @@ -484,8 +488,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) xdp = ice_xdp_buf(rx_ring, ntu); if (ntu + count >= rx_ring->count) { - nb_buffs_extra = ice_fill_rx_descs(rx_ring->xsk_pool, xdp, - rx_desc, + nb_buffs_extra = ice_fill_rx_descs(xsk_pool, xdp, rx_desc, rx_ring->count - ntu); if (nb_buffs_extra != rx_ring->count - ntu) { ntu += nb_buffs_extra; @@ -498,7 +501,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) ice_release_rx_desc(rx_ring, 0); } - nb_buffs = ice_fill_rx_descs(rx_ring->xsk_pool, xdp, rx_desc, count); + nb_buffs = ice_fill_rx_descs(xsk_pool, xdp, rx_desc, count); ntu += nb_buffs; if (ntu == rx_ring->count) @@ -514,6 +517,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) /** * ice_alloc_rx_bufs_zc - allocate a number of Rx buffers * @rx_ring: Rx ring + * @xsk_pool: XSK buffer pool to pick buffers to be filled by HW * @count: The number of buffers to allocate * * Wrapper for internal allocation routine; figure out how many tail @@ -521,7 +525,8 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) * * Returns true if all calls to internal alloc routine succeeded */ -bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) +bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, u16 count) { u16 rx_thresh = ICE_RING_QUARTER(rx_ring); u16 leftover, i, tail_bumps; @@ -530,9 +535,9 @@ bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) leftover = count - (tail_bumps * rx_thresh); for (i = 0; i < tail_bumps; i++) - if (!__ice_alloc_rx_bufs_zc(rx_ring, rx_thresh)) + if (!__ice_alloc_rx_bufs_zc(rx_ring, xsk_pool, rx_thresh)) return false; - return __ice_alloc_rx_bufs_zc(rx_ring, leftover); + return __ice_alloc_rx_bufs_zc(rx_ring, xsk_pool, leftover); } /** @@ -601,8 +606,10 @@ ice_construct_skb_zc(struct 
ice_rx_ring *rx_ring, struct xdp_buff *xdp) /** * ice_clean_xdp_irq_zc - produce AF_XDP descriptors to CQ * @xdp_ring: XDP Tx ring + * @xsk_pool: AF_XDP buffer pool pointer */ -static u32 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) +static u32 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool) { u16 ntc = xdp_ring->next_to_clean; struct ice_tx_desc *tx_desc; @@ -653,7 +660,7 @@ static u32 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) if (xdp_ring->next_to_clean >= cnt) xdp_ring->next_to_clean -= cnt; if (xsk_frames) - xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); + xsk_tx_completed(xsk_pool, xsk_frames); return completed_frames; } @@ -662,6 +669,7 @@ static u32 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) * ice_xmit_xdp_tx_zc - AF_XDP ZC handler for XDP_TX * @xdp: XDP buffer to xmit * @xdp_ring: XDP ring to produce descriptor onto + * @xsk_pool: AF_XDP buffer pool pointer * * note that this function works directly on xdp_buff, no need to convert * it to xdp_frame. xdp_buff pointer is stored to ice_tx_buf so that cleaning @@ -671,7 +679,8 @@ static u32 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) * was not enough space on XDP ring */ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, - struct ice_tx_ring *xdp_ring) + struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool) { struct skb_shared_info *sinfo = NULL; u32 size = xdp->data_end - xdp->data; @@ -685,7 +694,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, free_space = ICE_DESC_UNUSED(xdp_ring); if (free_space < ICE_RING_QUARTER(xdp_ring)) - free_space += ice_clean_xdp_irq_zc(xdp_ring); + free_space += ice_clean_xdp_irq_zc(xdp_ring, xsk_pool); if (unlikely(!free_space)) goto busy; @@ -705,7 +714,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, dma_addr_t dma; dma = xsk_buff_xdp_get_dma(xdp); - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, size); + xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, size); tx_buf->xdp = xdp; tx_buf->type = ICE_TX_BUF_XSK_TX; @@ -747,12 +756,14 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, * @xdp: xdp_buff used as input to the XDP program * @xdp_prog: XDP program to run * @xdp_ring: ring to be used for XDP_TX action + * @xsk_pool: AF_XDP buffer pool pointer * * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} */ static int ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring) + struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool) { int err, result = ICE_XDP_PASS; u32 act; @@ -763,7 +774,7 @@ ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); if (!err) return ICE_XDP_REDIR; - if (xsk_uses_need_wakeup(rx_ring->xsk_pool) && err == -ENOBUFS) + if (xsk_uses_need_wakeup(xsk_pool) && err == -ENOBUFS) result = ICE_XDP_EXIT; else result = ICE_XDP_CONSUMED; @@ -774,7 +785,7 @@ ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, case XDP_PASS: break; case XDP_TX: - result = ice_xmit_xdp_tx_zc(xdp, xdp_ring); + result = ice_xmit_xdp_tx_zc(xdp, xdp_ring, xsk_pool); if (result == ICE_XDP_CONSUMED) goto out_failure; break; @@ -826,14 +837,16 @@ ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first, /** * ice_clean_rx_irq_zc - consumes packets from the hardware ring * @rx_ring: AF_XDP Rx ring + * @xsk_pool: AF_XDP buffer pool pointer * @budget: NAPI budget * * Returns number of processed packets on success, 
remaining budget on failure. */ -int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) +int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, + int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; - struct xsk_buff_pool *xsk_pool = rx_ring->xsk_pool; u32 ntc = rx_ring->next_to_clean; u32 ntu = rx_ring->next_to_use; struct xdp_buff *first = NULL; @@ -896,7 +909,8 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) if (ice_is_non_eop(rx_ring, rx_desc)) continue; - xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring); + xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring, + xsk_pool); if (likely(xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))) { xdp_xmit |= xdp_res; } else if (xdp_res == ICE_XDP_EXIT) { @@ -945,7 +959,8 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) rx_ring->next_to_clean = ntc; entries_to_alloc = ICE_RX_DESC_UNUSED(rx_ring); if (entries_to_alloc > ICE_RING_QUARTER(rx_ring)) - failure |= !ice_alloc_rx_bufs_zc(rx_ring, entries_to_alloc); + failure |= !ice_alloc_rx_bufs_zc(rx_ring, xsk_pool, + entries_to_alloc); ice_finalize_xdp_rx(xdp_ring, xdp_xmit, 0); ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes); @@ -968,17 +983,19 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) /** * ice_xmit_pkt - produce a single HW Tx descriptor out of AF_XDP descriptor * @xdp_ring: XDP ring to produce the HW Tx descriptor on + * @xsk_pool: XSK buffer pool to pick buffers to be consumed by HW * @desc: AF_XDP descriptor to pull the DMA address and length from * @total_bytes: bytes accumulator that will be used for stats update */ -static void ice_xmit_pkt(struct ice_tx_ring *xdp_ring, struct xdp_desc *desc, +static void ice_xmit_pkt(struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool, struct xdp_desc *desc, unsigned int *total_bytes) { struct ice_tx_desc *tx_desc; dma_addr_t dma; - dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr); - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len); + dma = xsk_buff_raw_get_dma(xsk_pool, desc->addr); + xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, desc->len); tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_to_use++); tx_desc->buf_addr = cpu_to_le64(dma); @@ -991,10 +1008,13 @@ static void ice_xmit_pkt(struct ice_tx_ring *xdp_ring, struct xdp_desc *desc, /** * ice_xmit_pkt_batch - produce a batch of HW Tx descriptors out of AF_XDP descriptors * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * @xsk_pool: XSK buffer pool to pick buffers to be consumed by HW * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from * @total_bytes: bytes accumulator that will be used for stats update */ -static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct xdp_desc *descs, +static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool, + struct xdp_desc *descs, unsigned int *total_bytes) { u16 ntu = xdp_ring->next_to_use; @@ -1004,8 +1024,8 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct xdp_desc *de loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { dma_addr_t dma; - dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, descs[i].addr); - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, descs[i].len); + dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr); + xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, descs[i].len); tx_desc = ICE_TX_DESC(xdp_ring, ntu++); tx_desc->buf_addr = cpu_to_le64(dma); @@ -1021,37 
+1041,41 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct xdp_desc *de /** * ice_fill_tx_hw_ring - produce the number of Tx descriptors onto ring * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * @xsk_pool: XSK buffer pool to pick buffers to be consumed by HW * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from * @nb_pkts: count of packets to be send * @total_bytes: bytes accumulator that will be used for stats update */ -static void ice_fill_tx_hw_ring(struct ice_tx_ring *xdp_ring, struct xdp_desc *descs, - u32 nb_pkts, unsigned int *total_bytes) +static void ice_fill_tx_hw_ring(struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool, + struct xdp_desc *descs, u32 nb_pkts, + unsigned int *total_bytes) { u32 batched, leftover, i; batched = ALIGN_DOWN(nb_pkts, PKTS_PER_BATCH); leftover = nb_pkts & (PKTS_PER_BATCH - 1); for (i = 0; i < batched; i += PKTS_PER_BATCH) - ice_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes); + ice_xmit_pkt_batch(xdp_ring, xsk_pool, &descs[i], total_bytes); for (; i < batched + leftover; i++) - ice_xmit_pkt(xdp_ring, &descs[i], total_bytes); + ice_xmit_pkt(xdp_ring, xsk_pool, &descs[i], total_bytes); } /** * ice_xmit_zc - take entries from XSK Tx ring and place them onto HW Tx ring * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * @xsk_pool: AF_XDP buffer pool pointer * * Returns true if there is no more work that needs to be done, false otherwise */ -bool ice_xmit_zc(struct ice_tx_ring *xdp_ring) +bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, struct xsk_buff_pool *xsk_pool) { - struct xdp_desc *descs = xdp_ring->xsk_pool->tx_descs; + struct xdp_desc *descs = xsk_pool->tx_descs; u32 nb_pkts, nb_processed = 0; unsigned int total_bytes = 0; int budget; - ice_clean_xdp_irq_zc(xdp_ring); + ice_clean_xdp_irq_zc(xdp_ring, xsk_pool); if (!netif_carrier_ok(xdp_ring->vsi->netdev) || !netif_running(xdp_ring->vsi->netdev)) @@ -1060,25 +1084,26 @@ bool ice_xmit_zc(struct ice_tx_ring *xdp_ring) budget = ICE_DESC_UNUSED(xdp_ring); budget = min_t(u16, budget, ICE_RING_QUARTER(xdp_ring)); - nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, budget); + nb_pkts = xsk_tx_peek_release_desc_batch(xsk_pool, budget); if (!nb_pkts) return true; if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) { nb_processed = xdp_ring->count - xdp_ring->next_to_use; - ice_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes); + ice_fill_tx_hw_ring(xdp_ring, xsk_pool, descs, nb_processed, + &total_bytes); xdp_ring->next_to_use = 0; } - ice_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed, - &total_bytes); + ice_fill_tx_hw_ring(xdp_ring, xsk_pool, &descs[nb_processed], + nb_pkts - nb_processed, &total_bytes); ice_set_rs_bit(xdp_ring); ice_xdp_ring_update_tail(xdp_ring); ice_update_tx_ring_stats(xdp_ring, nb_pkts, total_bytes); - if (xsk_uses_need_wakeup(xdp_ring->xsk_pool)) - xsk_set_tx_need_wakeup(xdp_ring->xsk_pool); + if (xsk_uses_need_wakeup(xsk_pool)) + xsk_set_tx_need_wakeup(xsk_pool); return nb_pkts < budget; } @@ -1111,7 +1136,7 @@ ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, ring = vsi->rx_rings[queue_id]->xdp_ring; - if (!ring->xsk_pool) + if (!READ_ONCE(ring->xsk_pool)) return -EINVAL; /* The idea here is that if NAPI is running, mark a miss, so diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h index 6fa181f080ef..45adeb513253 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.h +++ 
b/drivers/net/ethernet/intel/ice/ice_xsk.h @@ -20,16 +20,20 @@ struct ice_vsi; #ifdef CONFIG_XDP_SOCKETS int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid); -int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget); +int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, + int budget); int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); -bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count); +bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, u16 count); bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi); void ice_xsk_clean_rx_ring(struct ice_rx_ring *rx_ring); void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring); -bool ice_xmit_zc(struct ice_tx_ring *xdp_ring); +bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, struct xsk_buff_pool *xsk_pool); int ice_realloc_zc_buf(struct ice_vsi *vsi, bool zc); #else -static inline bool ice_xmit_zc(struct ice_tx_ring __always_unused *xdp_ring) +static inline bool ice_xmit_zc(struct ice_tx_ring __always_unused *xdp_ring, + struct xsk_buff_pool __always_unused *xsk_pool) { return false; } @@ -44,6 +48,7 @@ ice_xsk_pool_setup(struct ice_vsi __always_unused *vsi, static inline int ice_clean_rx_irq_zc(struct ice_rx_ring __always_unused *rx_ring, + struct xsk_buff_pool __always_unused *xsk_pool, int __always_unused budget) { return 0; @@ -51,6 +56,7 @@ ice_clean_rx_irq_zc(struct ice_rx_ring __always_unused *rx_ring, static inline bool ice_alloc_rx_bufs_zc(struct ice_rx_ring __always_unused *rx_ring, + struct xsk_buff_pool __always_unused *xsk_pool, u16 __always_unused count) { return false; From 6044ca26210ba72b3dcc649fae1cbedd9e6ab018 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:15 +0200 Subject: [PATCH 0185/1052] ice: add missing WRITE_ONCE when clearing ice_rx_ring::xdp_prog It is read by data path and modified from process context on remote cpu so it is needed to use WRITE_ONCE to clear the pointer. Fixes: efc2214b6047 ("ice: Add support for XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 0f91e9167427..8d25b6981269 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -456,7 +456,7 @@ void ice_free_rx_ring(struct ice_rx_ring *rx_ring) if (rx_ring->vsi->type == ICE_VSI_PF) if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); - rx_ring->xdp_prog = NULL; + WRITE_ONCE(rx_ring->xdp_prog, NULL); if (rx_ring->xsk_pool) { kfree(rx_ring->xdp_buf); rx_ring->xdp_buf = NULL; From 963fb4612295a5c35b1b89c8bff3bdd4f9127af6 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:16 +0200 Subject: [PATCH 0186/1052] ice: xsk: fix txq interrupt mapping ice_cfg_txq_interrupt() internally handles XDP Tx ring. Do not use ice_for_each_tx_ring() in ice_qvec_cfg_msix() as this causing us to treat XDP ring that belongs to queue vector as Tx ring and therefore misconfiguring the interrupts. 
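To see why walking the ring list over-configures the Tx interrupt, consider a toy model in plain C. The queue vector below is hypothetical (one regular Tx ring plus one XDP ring per vector) and cfg_txq_interrupt()/cfg_rxq_interrupt() are just printfs, not the driver calls; looping over queue ids, as the fix does, touches each queue index exactly once.

   #include <stdio.h>

   struct toy_q_vector {
   	int num_ring_tx;	/* regular Tx rings only; the XDP ring is handled internally */
   	int num_ring_rx;
   };

   static void cfg_txq_interrupt(int qid) { printf("txq irq -> qid %d\n", qid); }
   static void cfg_rxq_interrupt(int qid) { printf("rxq irq -> qid %d\n", qid); }

   static void cfg_msix_by_qid(const struct toy_q_vector *qv, int qid)
   {
   	int q, cur = qid;

   	for (q = 0; q < qv->num_ring_tx; q++)
   		cfg_txq_interrupt(cur++);

   	cur = qid;
   	for (q = 0; q < qv->num_ring_rx; q++)
   		cfg_rxq_interrupt(cur++);
   }

   int main(void)
   {
   	struct toy_q_vector qv = { .num_ring_tx = 1, .num_ring_rx = 1 };

   	cfg_msix_by_qid(&qv, 3);	/* queue id 3: one txq and one rxq interrupt */
   	return 0;
   }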
Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index ee084ad80a61..240a7bec242b 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -110,25 +110,29 @@ ice_qvec_dis_irq(struct ice_vsi *vsi, struct ice_rx_ring *rx_ring, * ice_qvec_cfg_msix - Enable IRQ for given queue vector * @vsi: the VSI that contains queue vector * @q_vector: queue vector + * @qid: queue index */ static void -ice_qvec_cfg_msix(struct ice_vsi *vsi, struct ice_q_vector *q_vector) +ice_qvec_cfg_msix(struct ice_vsi *vsi, struct ice_q_vector *q_vector, u16 qid) { u16 reg_idx = q_vector->reg_idx; struct ice_pf *pf = vsi->back; struct ice_hw *hw = &pf->hw; - struct ice_tx_ring *tx_ring; - struct ice_rx_ring *rx_ring; + int q, _qid = qid; ice_cfg_itr(hw, q_vector); - ice_for_each_tx_ring(tx_ring, q_vector->tx) - ice_cfg_txq_interrupt(vsi, tx_ring->reg_idx, reg_idx, - q_vector->tx.itr_idx); + for (q = 0; q < q_vector->num_ring_tx; q++) { + ice_cfg_txq_interrupt(vsi, _qid, reg_idx, q_vector->tx.itr_idx); + _qid++; + } - ice_for_each_rx_ring(rx_ring, q_vector->rx) - ice_cfg_rxq_interrupt(vsi, rx_ring->reg_idx, reg_idx, - q_vector->rx.itr_idx); + _qid = qid; + + for (q = 0; q < q_vector->num_ring_rx; q++) { + ice_cfg_rxq_interrupt(vsi, _qid, reg_idx, q_vector->rx.itr_idx); + _qid++; + } ice_flush(hw); } @@ -241,7 +245,7 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) fail = err; q_vector = vsi->rx_rings[q_idx]->q_vector; - ice_qvec_cfg_msix(vsi, q_vector); + ice_qvec_cfg_msix(vsi, q_vector, q_idx); err = ice_vsi_ctrl_one_rx_ring(vsi, true, q_idx, true); if (!fail) From 478574370bef7951fbd9ef5155537d6cbed49472 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 22 Jul 2024 16:49:45 -0700 Subject: [PATCH 0187/1052] btrfs: make cow_file_range_inline() honor locked_page on error The btrfs buffered write path runs through __extent_writepage() which has some tricky return value handling for writepage_delalloc(). Specifically, when that returns 1, we exit, but for other return values we continue and end up calling btrfs_folio_end_all_writers(). If the folio has been unlocked (note that we check the PageLocked bit at the start of __extent_writepage()), this results in an assert panic like this one from syzbot: BTRFS: error (device loop0 state EAL) in free_log_tree:3267: errno=-5 IO failure BTRFS warning (device loop0 state EAL): Skipping commit of aborted transaction. BTRFS: error (device loop0 state EAL) in cleanup_transaction:2018: errno=-5 IO failure assertion failed: folio_test_locked(folio), in fs/btrfs/subpage.c:871 ------------[ cut here ]------------ kernel BUG at fs/btrfs/subpage.c:871! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 PID: 5090 Comm: syz-executor225 Not tainted 6.10.0-syzkaller-05505-gb1bc554e009e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/27/2024 RIP: 0010:btrfs_folio_end_all_writers+0x55b/0x610 fs/btrfs/subpage.c:871 Code: e9 d3 fb ff ff e8 25 22 c2 fd 48 c7 c7 c0 3c 0e 8c 48 c7 c6 80 3d 0e 8c 48 c7 c2 60 3c 0e 8c b9 67 03 00 00 e8 66 47 ad 07 90 <0f> 0b e8 6e 45 b0 07 4c 89 ff be 08 00 00 00 e8 21 12 25 fe 4c 89 RSP: 0018:ffffc900033d72e0 EFLAGS: 00010246 RAX: 0000000000000045 RBX: 00fff0000000402c RCX: 663b7a08c50a0a00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffffc900033d73b0 R08: ffffffff8176b98c R09: 1ffff9200067adfc R10: dffffc0000000000 R11: fffff5200067adfd R12: 0000000000000001 R13: dffffc0000000000 R14: 0000000000000000 R15: ffffea0001cbee80 FS: 0000000000000000(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f5f076012f8 CR3: 000000000e134000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __extent_writepage fs/btrfs/extent_io.c:1597 [inline] extent_write_cache_pages fs/btrfs/extent_io.c:2251 [inline] btrfs_writepages+0x14d7/0x2760 fs/btrfs/extent_io.c:2373 do_writepages+0x359/0x870 mm/page-writeback.c:2656 filemap_fdatawrite_wbc+0x125/0x180 mm/filemap.c:397 __filemap_fdatawrite_range mm/filemap.c:430 [inline] __filemap_fdatawrite mm/filemap.c:436 [inline] filemap_flush+0xdf/0x130 mm/filemap.c:463 btrfs_release_file+0x117/0x130 fs/btrfs/file.c:1547 __fput+0x24a/0x8a0 fs/file_table.c:422 task_work_run+0x24f/0x310 kernel/task_work.c:222 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0xa2f/0x27f0 kernel/exit.c:877 do_group_exit+0x207/0x2c0 kernel/exit.c:1026 __do_sys_exit_group kernel/exit.c:1037 [inline] __se_sys_exit_group kernel/exit.c:1035 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1035 x64_sys_call+0x2634/0x2640 arch/x86/include/generated/asm/syscalls_64.h:232 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f075b70c9 Code: Unable to access opcode bytes at 0x7f5f075b709f. I was hitting the same issue by doing hundreds of accelerated runs of generic/475, which also hits IO errors by design. I instrumented that reproducer with bpftrace and found that the undesirable folio_unlock was coming from the following callstack: folio_unlock+5 __process_pages_contig+475 cow_file_range_inline.constprop.0+230 cow_file_range+803 btrfs_run_delalloc_range+566 writepage_delalloc+332 __extent_writepage # inlined in my stacktrace, but I added it here extent_write_cache_pages+622 Looking at the bisected-to patch in the syzbot report, Josef realized that the logic of the cow_file_range_inline error path subtly changing. In the past, on error, it jumped to out_unlock in cow_file_range(), which honors the locked_page, so when we ultimately call folio_end_all_writers(), the folio of interest is still locked. After the change, we always unlocked ignoring the locked_page, on both success and error. On the success path, this all results in returning 1 to __extent_writepage(), which skips the folio_end_all_writers() call, which makes it OK to have unlocked. Fix the bug by wiring the locked_page into cow_file_range_inline() and only setting locked_page to NULL on success. 
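The essence of the fix, only handing the caller's locked page to the unlock helper on success, can be sketched outside btrfs. Everything below (struct toy_page, unlock_range(), try_inline()) is a simplified stand-in, not the filesystem code:

   #include <stddef.h>
   #include <stdio.h>

   struct toy_page { int locked; };

   /* Stands in for extent_clear_unlock_delalloc(): unlocks the range but
    * skips 'locked_page' when it is non-NULL. */
   static void unlock_range(struct toy_page *locked_page)
   {
   	if (locked_page == NULL)
   		printf("unlocking every page, including the caller's\n");
   	else
   		printf("unlocking the range, caller's page stays locked\n");
   }

   static int try_inline(struct toy_page *locked_page, int inline_ok)
   {
   	int ret = inline_ok ? 0 : -28;	/* -ENOSPC-like failure */

   	if (ret == 0)
   		locked_page = NULL;	/* success: safe to unlock the caller's page too */

   	unlock_range(locked_page);
   	return ret;
   }

   int main(void)
   {
   	struct toy_page p = { .locked = 1 };

   	try_inline(&p, 1);	/* success path */
   	try_inline(&p, 0);	/* error path keeps the page locked for the caller */
   	return 0;
   }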
Reported-by: syzbot+a14d8ac9af3a2a4fd0c8@syzkaller.appspotmail.com Fixes: 0586d0a89e77 ("btrfs: move extent bit and page cleanup into cow_file_range_inline") CC: stable@vger.kernel.org # 6.10+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8f38eefc8acd..8ca3878348ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -714,8 +714,9 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offse return ret; } -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset, - u64 end, +static noinline int cow_file_range_inline(struct btrfs_inode *inode, + struct page *locked_page, + u64 offset, u64 end, size_t compressed_size, int compress_type, struct folio *compressed_folio, @@ -739,7 +740,10 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset, return ret; } - extent_clear_unlock_delalloc(inode, offset, end, NULL, &cached, + if (ret == 0) + locked_page = NULL; + + extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached, clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); @@ -1043,10 +1047,10 @@ static void compress_file_range(struct btrfs_work *work) * extent for the subpage case. */ if (total_in < actual_end) - ret = cow_file_range_inline(inode, start, end, 0, + ret = cow_file_range_inline(inode, NULL, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); else - ret = cow_file_range_inline(inode, start, end, total_compressed, + ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, compress_type, folios[0], false); if (ret <= 0) { if (ret < 0) @@ -1359,7 +1363,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, start, end, 0, + ret = cow_file_range_inline(inode, locked_page, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* From d89c285d28491d8f10534c262ac9e6bdcbe1b4d2 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 11 Jul 2024 23:50:58 +0900 Subject: [PATCH 0188/1052] btrfs: do not subtract delalloc from avail bytes The block group's avail bytes printed when dumping a space info subtract the delalloc_bytes. However, as shown in btrfs_add_reserved_bytes() and btrfs_free_reserved_bytes(), it is added or subtracted along with "reserved" for the delalloc case, which means the "delalloc_bytes" is a part of the "reserved" bytes. So, excluding it to calculate the avail space counts delalloc_bytes twice, which can lead to an invalid result. 
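The double subtraction is easiest to see with numbers. The figures in the snippet below are invented purely for illustration; the point is only that delalloc bytes already counted inside "reserved" must not be subtracted a second time.

   #include <stdio.h>

   int main(void)
   {
   	unsigned long long length = 1024, used = 300, pinned = 100;
   	unsigned long long reserved = 200;	/* already includes 50 bytes of delalloc */
   	unsigned long long delalloc = 50, super = 16, zone_unusable = 8;

   	unsigned long long wrong = length - used - pinned - reserved -
   				   delalloc - super - zone_unusable;	/* 350 */
   	unsigned long long fixed = length - used - pinned - reserved -
   				   super - zone_unusable;		/* 400 */

   	printf("avail with double subtraction: %llu\n", wrong);
   	printf("avail without it:              %llu\n", fixed);
   	return 0;
   }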
Fixes: e50b122b832b ("btrfs: print available space for a block group when dumping a space info") CC: stable@vger.kernel.org # 6.6+ Signed-off-by: Naohiro Aota Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9ac94d3119e8..c1d9d3664400 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -583,8 +583,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, spin_lock(&cache->lock); avail = cache->length - cache->used - cache->pinned - - cache->reserved - cache->delalloc_bytes - - cache->bytes_super - cache->zone_unusable; + cache->reserved - cache->bytes_super - cache->zone_unusable; btrfs_info(fs_info, "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", cache->start, cache->length, cache->used, cache->pinned, From 8cd44dd1d17a23d5cc8c443c659ca57aa76e2fa5 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 15 Feb 2023 09:18:02 +0900 Subject: [PATCH 0189/1052] btrfs: zoned: fix zone_unusable accounting on making block group read-write again When btrfs makes a block group read-only, it adds all free regions in the block group to space_info->bytes_readonly. That free space excludes reserved and pinned regions. OTOH, when btrfs makes the block group read-write again, it moves all the unused regions into the block group's zone_unusable. That unused region includes reserved and pinned regions. As a result, it counts too much zone_unusable bytes. Fortunately (or unfortunately), having erroneous zone_unusable does not affect the calculation of space_info->bytes_readonly, because free space (num_bytes in btrfs_dec_block_group_ro) calculation is done based on the erroneous zone_unusable and it reduces the num_bytes just to cancel the error. This behavior can be easily discovered by adding a WARN_ON to check e.g, "bg->pinned > 0" in btrfs_dec_block_group_ro(), and running fstests test case like btrfs/282. Fix it by properly considering pinned and reserved in btrfs_dec_block_group_ro(). Also, add a WARN_ON and introduce btrfs_space_info_update_bytes_zone_unusable() to catch a similar mistake. 
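Again with invented numbers, a short snippet shows how including pinned and reserved bytes in the migrated amount inflates zone_unusable; only the arithmetic is meant to match the description above.

   #include <stdio.h>

   int main(void)
   {
   	unsigned long long alloc_offset = 600, used = 400;
   	unsigned long long pinned = 80, reserved = 20;
   	unsigned long long length = 1024, zone_capacity = 1000;

   	unsigned long long before = (alloc_offset - used) +
   				    (length - zone_capacity);		/* 224 */
   	unsigned long long after  = (alloc_offset - used - pinned - reserved) +
   				    (length - zone_capacity);		/* 124 */

   	printf("zone_unusable, old formula: %llu\n", before);
   	printf("zone_unusable, fixed:       %llu\n", after);
   	return 0;
   }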
Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones") CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Naohiro Aota Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 13 ++++++++----- fs/btrfs/extent-tree.c | 3 ++- fs/btrfs/free-space-cache.c | 4 +++- fs/btrfs/space-info.c | 2 +- fs/btrfs/space-info.h | 1 + include/trace/events/btrfs.h | 8 ++++++++ 6 files changed, 23 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 498442d0c216..2e49d978f504 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1223,8 +1223,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->space_info->total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); - block_group->space_info->bytes_zone_unusable -= - block_group->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info, + -block_group->zone_unusable); block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); @@ -1396,7 +1396,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */ sinfo->bytes_readonly += cache->zone_unusable; - sinfo->bytes_zone_unusable -= cache->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, + -cache->zone_unusable); cache->zone_unusable = 0; } cache->ro++; @@ -3056,9 +3057,11 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes back */ cache->zone_unusable = - (cache->alloc_offset - cache->used) + + (cache->alloc_offset - cache->used - cache->pinned - + cache->reserved) + (cache->length - cache->zone_capacity); - sinfo->bytes_zone_unusable += cache->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, + cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } num_bytes = cache->length - cache->reserved - diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d77498e7671c..ff9f0d41987e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2793,7 +2793,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - space_info->bytes_zone_unusable += len; + btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info, + len); readonly = true; } spin_unlock(&cache->lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3f9b7507543a..f5996a43db24 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2723,8 +2723,10 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, * If the block group is read-only, we should account freed space into * bytes_readonly. 
*/ - if (!block_group->ro) + if (!block_group->ro) { block_group->zone_unusable += to_unusable; + WARN_ON(block_group->zone_unusable > block_group->length); + } spin_unlock(&ctl->tree_lock); if (!used) { spin_lock(&block_group->lock); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c1d9d3664400..68e14fd48638 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -316,7 +316,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, found->bytes_used += block_group->used; found->disk_used += block_group->used * factor; found->bytes_readonly += block_group->bytes_super; - found->bytes_zone_unusable += block_group->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable); if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 4db8a0267c16..88b44221ce97 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -249,6 +249,7 @@ btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); +DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index eeb56975bee7..de55a555d95b 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2383,6 +2383,14 @@ DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned, TP_ARGS(fs_info, sinfo, old, diff) ); +DEFINE_EVENT(btrfs__space_info_update, update_bytes_zone_unusable, + + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, u64 old, s64 diff), + + TP_ARGS(fs_info, sinfo, old, diff) +); + DECLARE_EVENT_CLASS(btrfs_raid56_bio, TP_PROTO(const struct btrfs_raid_bio *rbio, From 939b656bc8ab203fdbde26ccac22bcb7f0985be5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 26 Jul 2024 11:12:52 +0100 Subject: [PATCH 0190/1052] btrfs: fix corruption after buffer fault in during direct IO append write During an append (O_APPEND write flag) direct IO write if the input buffer was not previously faulted in, we can corrupt the file in a way that the final size is unexpected and it includes an unexpected hole. 
The problem happens like this:

1) We have an empty file, with size 0, for example;

2) We do an O_APPEND direct IO with a length of 4096 bytes and the
   input buffer is not currently faulted in;

3) We enter btrfs_direct_write(), lock the inode and call
   generic_write_checks(), which calls generic_write_checks_count(), and
   that function sets the iocb position to 0 with the following code:

	if (iocb->ki_flags & IOCB_APPEND)
		iocb->ki_pos = i_size_read(inode);

4) We call btrfs_dio_write() and enter into iomap, which will end up
   calling btrfs_dio_iomap_begin() and that calls
   btrfs_get_blocks_direct_write(), where we update the i_size of the
   inode to 4096 bytes;

5) After btrfs_dio_iomap_begin() returns, iomap will attempt to access
   the page of the write input buffer (at iomap_dio_bio_iter(), with a
   call to bio_iov_iter_get_pages()) and fail with -EFAULT, which gets
   returned to btrfs at btrfs_direct_write() via btrfs_dio_write();

6) At btrfs_direct_write() we get the -EFAULT error, unlock the inode,
   fault in the write buffer and then goto the label 'relock';

7) We lock again the inode, do all the necessary checks again and call
   again generic_write_checks(), which calls generic_write_checks_count()
   again, and there we set the iocb's position to 4K, which is the
   current i_size of the inode, with the following code pointed above:

	if (iocb->ki_flags & IOCB_APPEND)
		iocb->ki_pos = i_size_read(inode);

8) Then we go again to btrfs_dio_write() and enter iomap and the write
   succeeds, but it wrote to the file range [4K, 8K), leaving a hole in
   the [0, 4K) range and an i_size of 8K, which goes against the
   expectations of having the data written to the range [0, 4K) and get
   an i_size of 4K.

Fix this by not unlocking the inode before faulting in the input buffer,
in case we get -EFAULT or an incomplete write, and not jumping to the
'relock' label after faulting in the buffer - instead jump to a location
immediately before calling iomap, skipping all the write checks and
relocking. This solves this problem and it's fine even in case the input
buffer is memory mapped to the same file range, since only holding the
range locked in the inode's io tree can cause a deadlock, it's safe to
keep the inode lock (VFS lock), as was fixed and described in commit
51bd9563b678 ("btrfs: fix deadlock due to page faults during direct IO
reads and writes").

A sample reproducer provided by a reporter is the following:

   $ cat test.c
   #ifndef _GNU_SOURCE
   #define _GNU_SOURCE
   #endif

   #include <fcntl.h>
   #include <stdio.h>
   #include <sys/mman.h>
   #include <sys/stat.h>
   #include <unistd.h>

   int main(int argc, char *argv[])
   {
       if (argc < 2) {
           fprintf(stderr, "Usage: %s <file>\n", argv[0]);
           return 1;
       }

       int fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT |
                     O_APPEND, 0644);
       if (fd < 0) {
           perror("creating test file");
           return 1;
       }

       char *buf = mmap(NULL, 4096, PROT_READ,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
       ssize_t ret = write(fd, buf, 4096);
       if (ret < 0) {
           perror("pwritev2");
           return 1;
       }

       struct stat stbuf;
       ret = fstat(fd, &stbuf);
       if (ret < 0) {
           perror("stat");
           return 1;
       }

       printf("size: %llu\n", (unsigned long long)stbuf.st_size);
       return stbuf.st_size == 4096 ? 0 : 1;
   }

A test case for fstests will be sent soon.
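The offset arithmetic behind steps 3) and 7) can also be condensed into a toy model; i_size and ki_pos below are plain variables, not the VFS fields, and the comments only mirror the sequence described above.

   #include <stdio.h>

   int main(void)
   {
   	unsigned long long i_size = 0, ki_pos;

   	/* attempt 1: O_APPEND derives the write position from i_size */
   	ki_pos = i_size;		/* 0 */
   	i_size = ki_pos + 4096;		/* iomap_begin grows i_size ... */
   	/* ... then bio_iov_iter_get_pages() fails with -EFAULT, nothing written */

   	/* buggy retry: the write checks run again and re-derive the position */
   	ki_pos = i_size;		/* 4096: data lands in [4K, 8K), hole in [0, 4K) */
   	printf("buggy retry writes at offset %llu\n", ki_pos);

   	/* fixed retry: fault the buffer in and resume with the original position */
   	ki_pos = 0;
   	printf("fixed retry writes at offset %llu\n", ki_pos);
   	return 0;
   }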
Reported-by: Hanna Czenczek Link: https://lore.kernel.org/linux-btrfs/0b841d46-12fe-4e64-9abb-871d8d0de271@redhat.com/ Fixes: 8184620ae212 ("btrfs: fix lost file sync on direct IO write with nowait and dsync iocb") CC: stable@vger.kernel.org # 6.1+ Tested-by: Hanna Czenczek Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/direct-io.c | 38 ++++++++++++++++++++++++++++---------- fs/btrfs/file.c | 17 ++++++++++++++--- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c8568b1a61c4..75fa563e4cac 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -459,6 +459,7 @@ struct btrfs_file_private { void *filldir_buf; u64 last_index; struct extent_state *llseek_cached_state; + bool fsync_skip_inode_lock; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index f9fb2db6a1e4..67adbe9d294a 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -856,21 +856,37 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) * So here we disable page faults in the iov_iter and then retry if we * got -EFAULT, faulting in the pages before the retry. */ +again: from->nofault = true; dio = btrfs_dio_write(iocb, from, written); from->nofault = false; - /* - * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync - * iocb, and that needs to lock the inode. So unlock it before calling - * iomap_dio_complete() to avoid a deadlock. - */ - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - - if (IS_ERR_OR_NULL(dio)) + if (IS_ERR_OR_NULL(dio)) { ret = PTR_ERR_OR_ZERO(dio); - else + } else { + struct btrfs_file_private stack_private = { 0 }; + struct btrfs_file_private *private; + const bool have_private = (file->private_data != NULL); + + if (!have_private) + file->private_data = &stack_private; + + /* + * If we have a synchronous write, we must make sure the fsync + * triggered by the iomap_dio_complete() call below doesn't + * deadlock on the inode lock - we are already holding it and we + * can't call it after unlocking because we may need to complete + * partial writes due to the input buffer (or parts of it) not + * being already faulted in. + */ + private = file->private_data; + private->fsync_skip_inode_lock = true; ret = iomap_dio_complete(dio); + private->fsync_skip_inode_lock = false; + + if (!have_private) + file->private_data = NULL; + } /* No increment (+=) because iomap returns a cumulative value. */ if (ret > 0) @@ -897,10 +913,12 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) } else { fault_in_iov_iter_readable(from, left); prev_left = left; - goto relock; + goto again; } } + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + /* * If 'ret' is -ENOTBLK or we have not written all data, then it means * we must fallback to buffered IO. 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 21381de906f6..9f10a9f23fcc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1603,6 +1603,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { + struct btrfs_file_private *private = file->private_data; struct dentry *dentry = file_dentry(file); struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; @@ -1612,6 +1613,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; u64 len; bool full_sync; + const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false); trace_btrfs_sync_file(file, datasync); @@ -1639,7 +1641,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + if (skip_ilock) + down_write(&inode->i_mmap_lock); + else + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -1663,7 +1668,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -1788,7 +1796,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); From 99d3bf5f7377d42f8be60a6b9cb60fb0be34dceb Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 29 Jul 2024 21:51:30 +0900 Subject: [PATCH 0191/1052] Input: MT - limit max slots syzbot is reporting too large allocation at input_mt_init_slots(), for num_slots is supplied from userspace using ioctl(UI_DEV_CREATE). Since nobody knows possible max slots, this patch chose 1024. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=0122fa359a69694395d5 Suggested-by: Dmitry Torokhov Signed-off-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- drivers/input/input-mt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c index 14b53dac1253..6b04a674f832 100644 --- a/drivers/input/input-mt.c +++ b/drivers/input/input-mt.c @@ -46,6 +46,9 @@ int input_mt_init_slots(struct input_dev *dev, unsigned int num_slots, return 0; if (mt) return mt->num_slots != num_slots ? -EINVAL : 0; + /* Arbitrary limit for avoiding too large memory allocation. 
*/ + if (num_slots > 1024) + return -EINVAL; mt = kzalloc(struct_size(mt, slots, num_slots), GFP_KERNEL); if (!mt) From 7c51f7bbf057f82aeba3390c39ef61b244181c09 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 27 Jul 2024 19:59:57 +0900 Subject: [PATCH 0192/1052] profiling: remove prof_cpu_mask syzbot is reporting uninit-value at profile_hits(), for there is a race window between if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; cpumask_copy(prof_cpu_mask, cpu_possible_mask); in profile_init() and cpumask_available(prof_cpu_mask) && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) in profile_tick(); prof_cpu_mask remains uninitialzed until cpumask_copy() completes while cpumask_available(prof_cpu_mask) returns true as soon as alloc_cpumask_var(&prof_cpu_mask) completes. We could replace alloc_cpumask_var() with zalloc_cpumask_var() and call cpumask_copy() from create_proc_profile() on only UP kernels, for profile_online_cpu() calls cpumask_set_cpu() as needed via cpuhp_setup_state(CPUHP_AP_ONLINE_DYN) on SMP kernels. But this patch removes prof_cpu_mask because it seems unnecessary. The cpumask_test_cpu(smp_processor_id(), prof_cpu_mask) test in profile_tick() is likely always true due to a CPU cannot call profile_tick() if that CPU is offline and cpumask_set_cpu(cpu, prof_cpu_mask) is called when that CPU becomes online and cpumask_clear_cpu(cpu, prof_cpu_mask) is called when that CPU becomes offline . This test could be false during transition between online and offline. But according to include/linux/cpuhotplug.h , CPUHP_PROFILE_PREPARE belongs to PREPARE section, which means that the CPU subjected to profile_dead_cpu() cannot be inside profile_tick() (i.e. no risk of use-after-free bug) because interrupt for that CPU is disabled during PREPARE section. Therefore, this test is guaranteed to be true, and can be removed. (Since profile_hits() checks prof_buffer != NULL, we don't need to check prof_buffer != NULL here unless get_irq_regs() or user_mode() is such slow that we want to avoid when prof_buffer == NULL). do_profile_hits() is called from profile_tick() from timer interrupt only if cpumask_test_cpu(smp_processor_id(), prof_cpu_mask) is true and prof_buffer is not NULL. But syzbot is also reporting that sometimes do_profile_hits() is called while current thread is still doing vzalloc(), where prof_buffer must be NULL at this moment. This indicates that multiple threads concurrently tried to write to /sys/kernel/profiling interface, which caused that somebody else try to re-allocate prof_buffer despite somebody has already allocated prof_buffer. Fix this by using serialization. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=b1a83ab2a9eb9321fbdd Signed-off-by: Tetsuo Handa Tested-by: syzbot Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 7 +++++++ kernel/profile.c | 46 ++++++---------------------------------------- 2 files changed, 13 insertions(+), 40 deletions(-) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 07fb5987b42b..1bab21b4718f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -92,7 +92,14 @@ static ssize_t profiling_store(struct kobject *kobj, const char *buf, size_t count) { int ret; + static DEFINE_MUTEX(lock); + /* + * We need serialization, for profile_setup() initializes prof_on + * value and profile_init() must not reallocate prof_buffer after + * once allocated. 
+ */ + guard(mutex)(&lock); if (prof_on) return -EEXIST; /* diff --git a/kernel/profile.c b/kernel/profile.c index 2b775cc5c28f..4654c6cd984e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -47,7 +47,6 @@ static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -static cpumask_var_t prof_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -114,11 +113,6 @@ int __ref profile_init(void) buffer_bytes = prof_len*sizeof(atomic_t); - if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; @@ -132,7 +126,6 @@ int __ref profile_init(void) if (prof_buffer) return 0; - free_cpumask_var(prof_cpu_mask); return -ENOMEM; } @@ -267,9 +260,6 @@ static int profile_dead_cpu(unsigned int cpu) struct page *page; int i; - if (cpumask_available(prof_cpu_mask)) - cpumask_clear_cpu(cpu, prof_cpu_mask); - for (i = 0; i < 2; i++) { if (per_cpu(cpu_profile_hits, cpu)[i]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); @@ -302,14 +292,6 @@ static int profile_prepare_cpu(unsigned int cpu) return 0; } -static int profile_online_cpu(unsigned int cpu) -{ - if (cpumask_available(prof_cpu_mask)) - cpumask_set_cpu(cpu, prof_cpu_mask); - - return 0; -} - #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) @@ -334,8 +316,8 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && - cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) + /* This is the old kernel-only legacy profiling */ + if (!user_mode(regs)) profile_hit(type, (void *)profile_pc(regs)); } @@ -418,10 +400,6 @@ static const struct proc_ops profile_proc_ops = { int __ref create_proc_profile(void) { struct proc_dir_entry *entry; -#ifdef CONFIG_SMP - enum cpuhp_state online_state; -#endif - int err = 0; if (!prof_on) @@ -431,26 +409,14 @@ int __ref create_proc_profile(void) profile_prepare_cpu, profile_dead_cpu); if (err) return err; - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", - profile_online_cpu, NULL); - if (err < 0) - goto err_state_prep; - online_state = err; - err = 0; #endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); - if (!entry) - goto err_state_onl; - proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - - return err; -err_state_onl: + if (entry) + proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); #ifdef CONFIG_SMP - cpuhp_remove_state(online_state); -err_state_prep: - cpuhp_remove_state(CPUHP_PROFILE_PREPARE); + else + cpuhp_remove_state(CPUHP_PROFILE_PREPARE); #endif return err; } From 2accfdb7eff65f390c4308b0e9cb7c3fe48ad63c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 10:58:28 -0700 Subject: [PATCH 0193/1052] profiling: attempt to remove per-cpu profile flip buffer This is the really old legacy kernel profiling code, which has long since been obviated by "real profiling" (ie 'prof' and company), and mainly remains as a source of syzbot reports. 
There are anecdotal reports that people still use it for boot-time profiling, but it's unlikely that such use would care about the old NUMA optimizations in this code from 2004 (commit ad02973d42: "profile: 512x Altix timer interrupt livelock fix" in the BK import archive at [1]) So in order to head off future syzbot reports, let's try to simplify this code and get rid of the per-cpu profile buffers that are quite a large portion of the complexity footprint of this thing (including CPU hotplug callbacks etc). It's unlikely anybody will actually notice, or possibly, as Thomas put it: "Only people who indulge in nostalgia will notice :)". That said, if it turns out that this code is actually actively used by somebody, we can always revert this removal. Thus the "attempt" in the summary line. [ Note: in a small nod to "the profiling code can cause NUMA problems", this also removes the "increment the last entry in the profiling array on any unknown hits" logic. That would account any program counter in a module to that single counter location, and might exacerbate any NUMA cacheline bouncing issues ] Link: https://lore.kernel.org/all/CAHk-=wgs52BxT4Zjmjz8aNvHWKxf5_ThBY4bYL1Y6CTaNL2dTw@mail.gmail.com/ Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git [1] Cc: Thomas Gleixner Cc: Tetsuo Handa Signed-off-by: Linus Torvalds --- include/linux/cpuhotplug.h | 1 - kernel/profile.c | 183 +------------------------------------ 2 files changed, 2 insertions(+), 182 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 51ba681b915a..affdd890899e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -100,7 +100,6 @@ enum cpuhp_state { CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, - CPUHP_PROFILE_PREPARE, CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, diff --git a/kernel/profile.c b/kernel/profile.c index 4654c6cd984e..2dfb0f4e755c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -129,180 +129,13 @@ int __ref profile_init(void) return -ENOMEM; } -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -/* - * Each cpu has a pair of open-addressed hashtables for pending - * profile hits. read_profile() IPI's all cpus to request them - * to flip buffers and flushes their contents to prof_buffer itself. - * Flip requests are serialized by the profile_flip_mutex. The sole - * use of having a second hashtable is for avoiding cacheline - * contention that would otherwise happen during flushes of pending - * profile hits required for the accuracy of reported profile hits - * and so resurrect the interrupt livelock issue. - * - * The open-addressed hashtables are indexed by profile buffer slot - * and hold the number of pending hits to that profile buffer slot on - * a cpu in an entry. When the hashtable overflows, all pending hits - * are accounted to their corresponding profile buffer slots with - * atomic_add() and the hashtable emptied. As numerous pending hits - * may be accounted to a profile buffer slot in a hashtable entry, - * this amortizes a number of atomic profile buffer increments likely - * to be far larger than the number of entries in the hashtable, - * particularly given that the number of distinct profile buffer - * positions to which hits are accounted during short intervals (e.g. - * several seconds) is usually very small. 
Exclusion from buffer - * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from - * process context). - * The hash function is meant to be lightweight as opposed to strong, - * and was vaguely inspired by ppc64 firmware-supported inverted - * pagetable hash functions, but uses a full hashtable full of finite - * collision chains, not just pairs of them. - * - * -- nyc - */ -static void __profile_flip_buffers(void *unused) -{ - int cpu = smp_processor_id(); - - per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); -} - -static void profile_flip_buffers(void) -{ - int i, j, cpu; - - mutex_lock(&profile_flip_mutex); - j = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; - for (i = 0; i < NR_PROFILE_HIT; ++i) { - if (!hits[i].hits) { - if (hits[i].pc) - hits[i].pc = 0; - continue; - } - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].hits = hits[i].pc = 0; - } - } - mutex_unlock(&profile_flip_mutex); -} - -static void profile_discard_flip_buffers(void) -{ - int i, cpu; - - mutex_lock(&profile_flip_mutex); - i = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; - memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); - } - mutex_unlock(&profile_flip_mutex); -} - -static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - unsigned long primary, secondary, flags, pc = (unsigned long)__pc; - int i, j, cpu; - struct profile_hit *hits; - - pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); - i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - cpu = get_cpu(); - hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; - if (!hits) { - put_cpu(); - return; - } - /* - * We buffer the global profiler buffer into a per-CPU - * queue and thus reduce the number of global (and possibly - * NUMA-alien) accesses. 
The write-queue is self-coalescing: - */ - local_irq_save(flags); - do { - for (j = 0; j < PROFILE_GRPSZ; ++j) { - if (hits[i + j].pc == pc) { - hits[i + j].hits += nr_hits; - goto out; - } else if (!hits[i + j].hits) { - hits[i + j].pc = pc; - hits[i + j].hits = nr_hits; - goto out; - } - } - i = (i + secondary) & (NR_PROFILE_HIT - 1); - } while (i != primary); - - /* - * Add the current hit(s) and flush the write-queue out - * to the global buffer: - */ - atomic_add(nr_hits, &prof_buffer[pc]); - for (i = 0; i < NR_PROFILE_HIT; ++i) { - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].pc = hits[i].hits = 0; - } -out: - local_irq_restore(flags); - put_cpu(); -} - -static int profile_dead_cpu(unsigned int cpu) -{ - struct page *page; - int i; - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); - per_cpu(cpu_profile_hits, cpu)[i] = NULL; - __free_page(page); - } - } - return 0; -} - -static int profile_prepare_cpu(unsigned int cpu) -{ - int i, node = cpu_to_mem(cpu); - struct page *page; - - per_cpu(cpu_profile_flip, cpu) = 0; - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) - continue; - - page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) { - profile_dead_cpu(cpu); - return -ENOMEM; - } - per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); - - } - return 0; -} - -#else /* !CONFIG_SMP */ -#define profile_flip_buffers() do { } while (0) -#define profile_discard_flip_buffers() do { } while (0) - static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; - atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); + if (pc < prof_len) + atomic_add(nr_hits, &prof_buffer[pc]); } -#endif /* !CONFIG_SMP */ void profile_hits(int type, void *__pc, unsigned int nr_hits) { @@ -340,7 +173,6 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) char *pnt; unsigned long sample_step = 1UL << prof_shift; - profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) return 0; if (count > (prof_len+1)*sizeof(unsigned int) - p) @@ -386,7 +218,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, return -EINVAL; } #endif - profile_discard_flip_buffers(); memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); return count; } @@ -404,20 +235,10 @@ int __ref create_proc_profile(void) if (!prof_on) return 0; -#ifdef CONFIG_SMP - err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", - profile_prepare_cpu, profile_dead_cpu); - if (err) - return err; -#endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); if (entry) proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); -#ifdef CONFIG_SMP - else - cpuhp_remove_state(CPUHP_PROFILE_PREPARE); -#endif return err; } subsys_initcall(create_proc_profile); From 950aeefb34923fe3c28ade35fe05f24e2c5b1d55 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 17 Jul 2024 22:01:30 -0700 Subject: [PATCH 0194/1052] iommufd/device: Fix hwpt at err_unresv in iommufd_device_do_replace() The rewind routine should remove the reserved iovas added to the new hwpt. 
Fixes: 89db31635c87 ("iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/r/20240718050130.1956804-1-nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 9a7ec5997c61..3214a4c17c6b 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -526,7 +526,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, err_unresv: if (hwpt_is_paging(hwpt)) iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(old_hwpt)); + to_hwpt_paging(hwpt)); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); From cec6937dd1aae1b38d147bd190cb895d06cf96d0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 12:05:06 -0700 Subject: [PATCH 0195/1052] task_work: make TWA_NMI_CURRENT handling conditional on IRQ_WORK The TWA_NMI_CURRENT handling very much depends on IRQ_WORK, but that isn't universally enabled everywhere. Maybe the IRQ_WORK infrastructure should just be unconditional - x86 ends up indirectly enabling it through unconditionally enabling PERF_EVENTS, for example. But it also gets enabled by having SMP support, or even if you just have PRINTK enabled. But in the meantime TWA_NMI_CURRENT causes tons of build failures on various odd minimal configs. Which did show up in linux-next, but despite that nobody bothered to fix it or even inform me until -rc1 was out. Fixes: 466e4d801cd4 ("task_work: Add TWA_NMI_CURRENT as an additional notify mode") Reported-by: Naresh Kamboju Reported-by: kernelci.org bot Reported-by: Guenter Roeck Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Signed-off-by: Linus Torvalds --- kernel/task_work.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c2daa7ad3f9..5d14d639ac71 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -6,12 +6,14 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +#ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); +#endif /** * task_work_add - ask the @task to execute @work->func() @@ -57,6 +59,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work, if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; + if (!IS_ENABLED(CONFIG_IRQ_WORK)) + return -EINVAL; } else { /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); @@ -81,9 +85,11 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; +#ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; +#endif default: WARN_ON_ONCE(1); break; From 0710c3d304f67f9b68f5082214e311ec8f82bd82 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 26 Jul 2024 13:18:25 +0200 Subject: [PATCH 0196/1052] dt-bindings: Batch-update Konrad Dybcio's email Use my @kernel.org address everywhere. 
Signed-off-by: Konrad Dybcio Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240726-topic-konrad_email-v1-3-f94665da2919@kernel.org Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml | 2 +- .../devicetree/bindings/clock/qcom,sm8350-videocc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml | 2 +- .../devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml | 2 +- .../bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml | 2 +- .../devicetree/bindings/display/panel/sony,td4353-jdi.yaml | 2 +- .../devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml | 2 +- .../devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml | 2 +- .../devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml | 2 +- Documentation/devicetree/bindings/iommu/qcom,iommu.yaml | 2 +- .../devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml | 2 +- Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml | 2 +- Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml | 2 +- Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml | 2 +- .../devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml | 2 +- 24 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml index a584b4953e68..46403b98411f 100644 --- a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Display Clock & Reset Controller on SM6350 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm display clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml index 6b9c1d198b14..10afe984e2fb 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Global Clock & Reset Controller on MSM8994 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm global clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml index a5a29dc75ae1..1fe68e07a2b2 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Global Clock & Reset Controller on SM6125 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm 
global clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml index 2280b859b2ad..78e232fa95dc 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Global Clock & Reset Controller on SM6350 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm global clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml index cf19f44af774..4ff17a91344b 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Graphics Clock & Reset Controller on SM6115 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm graphics clock control module provides clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml index 374a1844a159..10a9c96a97b6 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Graphics Clock & Reset Controller on SM6125 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm graphics clock control module provides clocks and power domains on diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml index fd6658cb793d..c03b30f64f35 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Camera Clock & Reset Controller on SM6350 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm camera clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml index 183b1c75dbdf..3cd422a645fd 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Display Clock & Reset Controller on SM6375 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm display clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml index 147b75a21508..de4e9066eeb8 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Global Clock & Reset Controller on SM6375 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm global clock control module provides 
the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml index cf4cad76f6c9..d9dd479c17bd 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Graphics Clock & Reset Controller on SM6375 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm graphics clock control module provides clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8350-videocc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8350-videocc.yaml index 46d1d91e3a01..5c2ecec0624e 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm8350-videocc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm8350-videocc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm SM8350 Video Clock & Reset Controller maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm video clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml index 3c2cac14e6c3..d10bb002906e 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Graphics Clock & Reset Controller on SM8450 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm graphics clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml index 8e8a288d318c..e22b4c433fd0 100644 --- a/Documentation/devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml +++ b/Documentation/devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm SM6375 Display MDSS maintainers: - - Konrad Dybcio + - Konrad Dybcio description: SM6375 MSM Mobile Display Subsystem (MDSS), which encapsulates sub-blocks diff --git a/Documentation/devicetree/bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml b/Documentation/devicetree/bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml index 2399cabf044c..dd614e077bbf 100644 --- a/Documentation/devicetree/bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml +++ b/Documentation/devicetree/bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: ASUS Z00T TM5P5 NT35596 5.5" 1080×1920 LCD Panel maintainers: - - Konrad Dybcio + - Konrad Dybcio description: |+ This panel seems to only be found in the Asus Z00T diff --git a/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml b/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml index 191b692125e1..032a989184ff 100644 --- a/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml +++ b/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Sony TD4353 JDI 5 / 5.7" 2160x1080 MIPI-DSI Panel maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | The Sony TD4353 JDI is 
a 5 (XZ2c) / 5.7 (XZ2) inch 2160x1080 diff --git a/Documentation/devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml index 9fce7203bd42..78210791496f 100644 --- a/Documentation/devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml +++ b/Documentation/devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml @@ -8,7 +8,7 @@ title: Qualcomm RPMh Network-On-Chip Interconnect on SC7280 maintainers: - Bjorn Andersson - - Konrad Dybcio + - Konrad Dybcio description: | RPMh interconnect providers support system bandwidth requirements through diff --git a/Documentation/devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml index 6c2da03f0cd2..100c68636909 100644 --- a/Documentation/devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml +++ b/Documentation/devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml @@ -8,7 +8,7 @@ title: Qualcomm RPMh Network-On-Chip Interconnect on SC8280XP maintainers: - Bjorn Andersson - - Konrad Dybcio + - Konrad Dybcio description: | RPMh interconnect providers support system bandwidth requirements through diff --git a/Documentation/devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml index 3cff7e662255..300640a533dd 100644 --- a/Documentation/devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml +++ b/Documentation/devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml @@ -8,7 +8,7 @@ title: Qualcomm RPMh Network-On-Chip Interconnect on SM8450 maintainers: - Bjorn Andersson - - Konrad Dybcio + - Konrad Dybcio description: | RPMh interconnect providers support system bandwidth requirements through diff --git a/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml b/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml index 571e5746d177..f8cebc9e8cd9 100644 --- a/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml +++ b/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies legacy IOMMU implementations maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm "B" family devices which are not compatible with arm-smmu have diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml index bd3cbb44c99a..e75393b3d196 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies, Inc. MDM9607 TLMM block maintainers: - - Konrad Dybcio + - Konrad Dybcio description: Top Level Mode Multiplexer pin controller in Qualcomm MDM9607 SoC. diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml index a4771f87d936..b262af6be97d 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies, Inc. SM6350 TLMM block maintainers: - - Konrad Dybcio + - Konrad Dybcio description: Top Level Mode Multiplexer pin controller in Qualcomm SM6350 SoC. 
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml index 047f82863f9b..c11af09c3f5b 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies, Inc. SM6375 TLMM block maintainers: - - Konrad Dybcio + - Konrad Dybcio description: Top Level Mode Multiplexer pin controller in Qualcomm SM6375 SoC. diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml b/Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml index 7afafde17a38..61cf4fe19ca5 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml +++ b/Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml @@ -8,7 +8,7 @@ title: Qualcomm Resource Power Manager (RPM) Processor/Subsystem maintainers: - Bjorn Andersson - - Konrad Dybcio + - Konrad Dybcio - Stephan Gerhold description: | diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml b/Documentation/devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml index 9410404f87f1..ad2dcc39a5f5 100644 --- a/Documentation/devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies, Inc. (QTI) RPM Master Stats maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | The Qualcomm RPM (Resource Power Manager) architecture includes a concept From a63507f3b11de30fd7711ec244fe354fb4a01a09 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Jul 2024 14:22:29 +0800 Subject: [PATCH 0197/1052] selftests/bpf: Drop type of connect_to_fd_opts The "type" parameter of connect_to_fd_opts() is redundant of "server_fd". Since the "type" can be obtained inside by invoking getsockopt(SO_TYPE), without passing it in as a parameter. This patch drops the "type" parameter of connect_to_fd_opts() and updates its callers. 
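For illustration only: the point is that the socket type is already encoded in the server fd and can be queried at run time, so callers no longer need to thread it through. A minimal user-space sketch of that lookup (the helper name sock_type_of() is made up here and is not part of the selftest sources):

    #include <sys/socket.h>

    /* Return the SO_TYPE (SOCK_STREAM, SOCK_DGRAM, ...) of an open socket,
     * or -1 with errno set by getsockopt() on failure.
     */
    static int sock_type_of(int fd)
    {
            int type;
            socklen_t optlen = sizeof(type);

            if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &type, &optlen))
                    return -1;
            return type;
    }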
Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/50d8ce7ab7ab0c0f4d211fc7cc4ebe3d3f63424c.1721282219.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 21 ++++++++++--------- tools/testing/selftests/bpf/network_helpers.h | 2 +- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- .../selftests/bpf/prog_tests/cgroup_v1v2.c | 4 ++-- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index e0cba4178e41..15e0e0bb7553 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -328,14 +328,21 @@ int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t add return -1; } -int connect_to_fd_opts(int server_fd, int type, const struct network_helper_opts *opts) +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; - socklen_t addrlen; + socklen_t addrlen, optlen; + int type; if (!opts) opts = &default_opts; + optlen = sizeof(type); + if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) { + log_err("getsockopt(SOL_TYPE)"); + return -1; + } + addrlen = sizeof(addr); if (getsockname(server_fd, (struct sockaddr *)&addr, &addrlen)) { log_err("Failed to get server addr"); @@ -350,14 +357,8 @@ int connect_to_fd(int server_fd, int timeout_ms) struct network_helper_opts opts = { .timeout_ms = timeout_ms, }; - int type, protocol; socklen_t optlen; - - optlen = sizeof(type); - if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) { - log_err("getsockopt(SOL_TYPE)"); - return -1; - } + int protocol; optlen = sizeof(protocol); if (getsockopt(server_fd, SOL_SOCKET, SO_PROTOCOL, &protocol, &optlen)) { @@ -366,7 +367,7 @@ int connect_to_fd(int server_fd, int timeout_ms) } opts.proto = protocol; - return connect_to_fd_opts(server_fd, type, &opts); + return connect_to_fd_opts(server_fd, &opts); } int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms) diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index aac5b94d6379..5b548c0c60de 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -71,7 +71,7 @@ int client_socket(int family, int type, int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); -int connect_to_fd_opts(int server_fd, int type, const struct network_helper_opts *opts); +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); int fastopen_connect(int server_fd, const char *data, unsigned int data_len, int timeout_ms); diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 63422f4f3896..1d494b4453f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -49,7 +49,7 @@ static bool start_test(char *addr_str, goto err; /* connect to server */ - *cli_fd = connect_to_fd_opts(*srv_fd, SOCK_STREAM, cli_opts); + *cli_fd = connect_to_fd_opts(*srv_fd, cli_opts); if (!ASSERT_NEQ(*cli_fd, -1, "connect_to_fd_opts")) goto err; diff --git 
a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c index 9709c8db7275..addf720428f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c @@ -32,7 +32,7 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) goto out; } - fd = connect_to_fd_opts(server_fd, SOCK_STREAM, &opts); + fd = connect_to_fd_opts(server_fd, &opts); if (fd < 0) err = -1; else @@ -52,7 +52,7 @@ void test_cgroup_v1v2(void) server_fd = start_server(AF_INET, SOCK_STREAM, NULL, port, 0); if (!ASSERT_GE(server_fd, 0, "server_fd")) return; - client_fd = connect_to_fd_opts(server_fd, SOCK_STREAM, &opts); + client_fd = connect_to_fd_opts(server_fd, &opts); if (!ASSERT_GE(client_fd, 0, "client_fd")) { close(server_fd); return; From e1ee5a48b5b27e3e7bb294f80f7429c3d0466d19 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Jul 2024 14:22:30 +0800 Subject: [PATCH 0198/1052] selftests/bpf: Drop must_fail from network_helper_opts The struct member "must_fail" of network_helper_opts() is only used in cgroup_v1v2 tests, it makes sense to drop it from network_helper_opts. Return value (fd) of connect_to_fd_opts() and the expect errno (EPERM) can be checked in cgroup_v1v2.c directly, no need to check them in connect_fd_to_addr() in network_helpers.c. This also makes connect_fd_to_addr() function useless. It can be replaced by connect(). Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/3faf336019a9a48e2e8951f4cdebf19e3ac6e441.1721282219.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 42 ++++--------------- tools/testing/selftests/bpf/network_helpers.h | 1 - .../selftests/bpf/prog_tests/cgroup_v1v2.c | 14 ++++--- 3 files changed, 16 insertions(+), 41 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 15e0e0bb7553..bc4947afa0b4 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -277,33 +277,6 @@ int client_socket(int family, int type, return -1; } -static int connect_fd_to_addr(int fd, - const struct sockaddr_storage *addr, - socklen_t addrlen, const bool must_fail) -{ - int ret; - - errno = 0; - ret = connect(fd, (const struct sockaddr *)addr, addrlen); - if (must_fail) { - if (!ret) { - log_err("Unexpected success to connect to server"); - return -1; - } - if (errno != EPERM) { - log_err("Unexpected error from connect to server"); - return -1; - } - } else { - if (ret) { - log_err("Failed to connect to server"); - return -1; - } - } - - return 0; -} - int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, const struct network_helper_opts *opts) { @@ -318,14 +291,13 @@ int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t add return -1; } - if (connect_fd_to_addr(fd, addr, addrlen, opts->must_fail)) - goto error_close; + if (connect(fd, (const struct sockaddr *)addr, addrlen)) { + log_err("Failed to connect to server"); + save_errno_close(fd); + return -1; + } return fd; - -error_close: - save_errno_close(fd); - return -1; } int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) @@ -383,8 +355,10 @@ int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms) return -1; } - if (connect_fd_to_addr(client_fd, &addr, len, false)) + if 
(connect(client_fd, (const struct sockaddr *)&addr, len)) { + log_err("Failed to connect to server"); return -1; + } return 0; } diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 5b548c0c60de..f39eeb5a4594 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -23,7 +23,6 @@ typedef __u16 __sum16; struct network_helper_opts { int timeout_ms; - bool must_fail; int proto; /* +ve: Passed to listen() as-is. * 0: Default when the test does not set diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c index addf720428f7..64abba72ac10 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c @@ -9,9 +9,6 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) { - struct network_helper_opts opts = { - .must_fail = true, - }; struct connect4_dropper *skel; int fd, err = 0; @@ -32,11 +29,16 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) goto out; } - fd = connect_to_fd_opts(server_fd, &opts); - if (fd < 0) + errno = 0; + fd = connect_to_fd_opts(server_fd, NULL); + if (fd >= 0) { + log_err("Unexpected success to connect to server"); err = -1; - else close(fd); + } else if (errno != EPERM) { + log_err("Unexpected errno from connect to server"); + err = -1; + } out: connect4_dropper__destroy(skel); return err; From c70b2d9027ca39e84d4ee01da78a70308ce1fd4f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Jul 2024 14:22:31 +0800 Subject: [PATCH 0199/1052] selftests/bpf: Add connect_to_addr_str helper Similar to connect_to_addr() helper for connecting to a server with the given sockaddr_storage type address, this patch adds a new helper named connect_to_addr_str() for connecting to a server with the given string type address "addr_str", together with its "family" and "port" as other parameters of connect_to_addr_str(). In connect_to_addr_str(), the parameters "family", "addr_str" and "port" are used to create a sockaddr_storage type address "addr" by invoking make_sockaddr(). Then pass this "addr" together with "addrlen", "type" and "opts" to connect_to_addr(). 
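As a usage sketch only (assuming network_helpers.h and the test_progs ASSERT_* macros are available; the loopback address and port 8080 below are placeholders, and passing NULL for opts falls back to the helper's defaults):

    void example_connect(void)
    {
            int fd;

            fd = connect_to_addr_str(AF_INET, SOCK_STREAM, "127.0.0.1", 8080, NULL);
            if (!ASSERT_GE(fd, 0, "connect_to_addr_str"))
                    return;
            /* ... exercise the connection ... */
            close(fd);
    }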
Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/647e82170831558dbde132a7a3d86df660dba2c4.1721282219.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 15 +++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 2 ++ 2 files changed, 17 insertions(+) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index bc4947afa0b4..9c98a60cf1e2 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -300,6 +300,21 @@ int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t add return fd; } +int connect_to_addr_str(int family, int type, const char *addr_str, __u16 port, + const struct network_helper_opts *opts) +{ + struct sockaddr_storage addr; + socklen_t addrlen; + + if (!opts) + opts = &default_opts; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return -1; + + return connect_to_addr(type, &addr, addrlen, opts); +} + int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index f39eeb5a4594..cce56955371f 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -69,6 +69,8 @@ int client_socket(int family, int type, const struct network_helper_opts *opts); int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, const struct network_helper_opts *opts); +int connect_to_addr_str(int family, int type, const char *addr_str, __u16 port, + const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); From df862de41fcde6a0a4906647b0cacec2a8db5cf3 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 14 Jul 2024 16:15:34 +0200 Subject: [PATCH 0200/1052] bpf: Replace 8 seq_puts() calls by seq_putc() calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single line breaks should occasionally be put into a sequence. Thus use the corresponding function “seq_putc”. This issue was transformed by using the Coccinelle software. 
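The transformation itself is mechanical; a generic before/after illustration (not one of the eight call sites touched by this patch):

    /* before: emits a single newline through the string helper */
    seq_puts(m, "\n");

    /* after: emits the same byte through the single-character helper */
    seq_putc(m, '\n');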
Signed-off-by: Markus Elfring Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e26b7df9-cd63-491f-85e8-8cabe60a85e5@web.de --- kernel/bpf/arraymap.c | 6 +++--- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/hashtab.c | 4 ++-- kernel/bpf/local_storage.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index feabc0193852..188e3c2effb2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -494,7 +494,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, if (map->btf_key_type_id) seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); rcu_read_unlock(); } @@ -515,7 +515,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } seq_puts(m, "}\n"); @@ -993,7 +993,7 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, prog_id = prog_fd_array_sys_lookup_elem(ptr); btf_type_seq_show(map->btf, map->btf_value_type_id, &prog_id, m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } } diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 0d515ec57aa5..bb3eabc0dc76 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -837,7 +837,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, btf_type_seq_show(st_map->btf, map->btf_vmlinux_value_type_id, value, m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } kfree(value); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 06115f8728e8..be1f64c20125 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1586,7 +1586,7 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); seq_puts(m, ": "); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); rcu_read_unlock(); } @@ -2450,7 +2450,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } seq_puts(m, "}\n"); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index a04f505aefe9..3969eb0382af 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -431,7 +431,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, seq_puts(m, ": "); btf_type_seq_show(map->btf, map->btf_value_type_id, &READ_ONCE(storage->buf)->data[0], m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } else { seq_puts(m, ": {\n"); for_each_possible_cpu(cpu) { @@ -439,7 +439,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(storage->percpu_buf, cpu), m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } seq_puts(m, "}\n"); } From f157f9cb85b49745754f8c301c59c6ca110e865f Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Mon, 15 Jul 2024 11:12:30 +0200 Subject: [PATCH 0201/1052] bpf: Simplify character output in seq_print_delegate_opts() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single characters should be put into a sequence. 
Thus use the corresponding function “seq_putc” for two selected calls. This issue was transformed by using the Coccinelle software. Suggested-by: Christophe Jaillet Signed-off-by: Markus Elfring Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/abde0992-3d71-44d2-ab27-75b382933a22@web.de --- kernel/bpf/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index af5d2ffadd70..d8fc5eba529d 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -709,10 +709,10 @@ static void seq_print_delegate_opts(struct seq_file *m, msk = 1ULL << e->val; if (delegate_msk & msk) { /* emit lower-case name without prefix */ - seq_printf(m, "%c", first ? '=' : ':'); + seq_putc(m, first ? '=' : ':'); name += pfx_len; while (*name) { - seq_printf(m, "%c", tolower(*name)); + seq_putc(m, tolower(*name)); name++; } From 844f7315e77a017fe90652b411e88b119c782a1f Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Thu, 18 Jul 2024 22:57:43 +0000 Subject: [PATCH 0202/1052] selftests/bpf: Use auto-dependencies for test objects Make use of -M compiler options when building .test.o objects to generate .d files and avoid re-building all tests every time. Previously, if a single test bpf program under selftests/bpf/progs/*.c has changed, make would rebuild all the *.bpf.o, *.skel.h and *.test.o objects, which is a lot of unnecessary work. A typical dependency chain is: progs/x.c -> x.bpf.o -> x.skel.h -> x.test.o -> trunner_binary However for many tests it's not a 1:1 mapping by name, and so far %.test.o have been simply dependent on all %.skel.h files, and %.skel.h files on all %.bpf.o objects. Avoid full rebuilds by instructing the compiler (via -MMD) to produce *.d files with real dependencies, and appropriately including them. Exploit make feature that rebuilds included makefiles if they were changed by setting %.test.d as prerequisite for %.test.o files. A couple of examples of compilation time speedup (after the first clean build): $ touch progs/verifier_and.c && time make -j8 Before: real 0m16.651s After: real 0m2.245s $ touch progs/read_vsyscall.c && time make -j8 Before: real 0m15.743s After: real 0m1.575s A drawback of this change is that now there is an overhead due to make processing lots of .d files, which potentially may slow down unrelated targets. 
However a time to make all from scratch hasn't changed significantly: $ make clean && time make -j8 Before: real 1m31.148s After: real 1m30.309s Suggested-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/VJihUTnvtwEgv_mOnpfy7EgD9D2MPNoHO-MlANeLIzLJPGhDeyOuGKIYyKgk0O6KPjfM-MuhtvPwZcngN8WFqbTnTRyCSMc2aMZ1ODm1T_g=@pm.me --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 43 +++++++++++++++++++------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 5025401323af..4e4aae8aa7ec 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -31,6 +31,7 @@ test_tcp_check_syncookie_user test_sysctl xdping test_cpp +*.d *.subskel.h *.skel.h *.lskel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dd49c1d23a60..05b234248b38 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -477,7 +477,8 @@ xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o xdp_features.skel.h-deps := xdp_features.bpf.o -LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) +LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) +LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. @@ -556,7 +557,11 @@ $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) -$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) +$(LINKED_BPF_OBJS): %: $(TRUNNER_OUTPUT)/% + +# .SECONDEXPANSION here allows to correctly expand %-deps variables as prerequisites +.SECONDEXPANSION: +$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_OUTPUT)/%: $$$$(%-deps) $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.bpf.o)) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) @@ -566,6 +571,14 @@ $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ $(Q)$$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h) $(Q)rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) + +# When the compiler generates a %.d file, only skel basenames (not +# full paths) are specified as prerequisites for corresponding %.o +# file. This target makes %.skel.h basename dependent on full paths, +# linking generated %.d dependency with actual %.skel.h files. 
+$(notdir %.skel.h): $(TRUNNER_OUTPUT)/%.skel.h + @true + endif # ensure we set up tests.h header generation rule just once @@ -583,14 +596,19 @@ endif # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ - $(TRUNNER_EXTRA_HDRS) \ - $(TRUNNER_BPF_OBJS) \ - $(TRUNNER_BPF_SKELS) \ - $(TRUNNER_BPF_LSKELS) \ - $(TRUNNER_BPF_SKELS_LINKED) \ - $$(BPFOBJ) | $(TRUNNER_OUTPUT) + $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) - $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + +$(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ + $(TRUNNER_TESTS_DIR)/%.c \ + $(TRUNNER_EXTRA_HDRS) \ + $(TRUNNER_BPF_SKELS) \ + $(TRUNNER_BPF_LSKELS) \ + $(TRUNNER_BPF_SKELS_LINKED) \ + $$(BPFOBJ) | $(TRUNNER_OUTPUT) + +include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ @@ -608,6 +626,9 @@ ifneq ($2:$(OUTPUT),:$(shell pwd)) $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif +# some X.test.o files have runtime dependencies on Y.bpf.o files +$(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) + $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ $(RESOLVE_BTFIDS) \ @@ -768,8 +789,8 @@ $(OUTPUT)/uprobe_multi: uprobe_multi.c EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ - feature bpftool \ - $(addprefix $(OUTPUT)/,*.o *.skel.h *.lskel.h *.subskel.h \ + feature bpftool \ + $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ no_alu32 cpuv4 bpf_gcc bpf_testmod.ko \ bpf_test_no_cfi.ko \ liburandom_read.so) From 4bf79f9be434e000c8e12fe83b2f4402480f1460 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:53 -0700 Subject: [PATCH 0203/1052] bpf: Track equal scalars history on per-instruction level Use bpf_verifier_state->jmp_history to track which registers were updated by find_equal_scalars() (renamed to collect_linked_regs()) when conditional jump was verified. Use recorded information in backtrack_insn() to propagate precision. E.g. for the following program: while verifying instructions 1: r1 = r0 | 2: if r1 < 8 goto ... | push r0,r1 as linked registers in jmp_history 3: if r0 > 16 goto ... | push r0,r1 as linked registers in jmp_history 4: r2 = r10 | 5: r2 += r0 v mark_chain_precision(r0) while doing mark_chain_precision(r0) 5: r2 += r0 | mark r0 precise 4: r2 = r10 | 3: if r0 > 16 goto ... | mark r0,r1 as precise 2: if r1 < 8 goto ... | mark r0,r1 as precise 1: r1 = r0 v Technically, do this as follows: - Use 10 bits to identify each register that gains range because of sync_linked_regs(): - 3 bits for frame number; - 6 bits for register or stack slot number; - 1 bit to indicate if register is spilled. - Use u64 as a vector of 6 such records + 4 bits for vector length. - Augment struct bpf_jmp_history_entry with a field 'linked_regs' representing such vector. - When doing check_cond_jmp_op() remember up to 6 registers that gain range because of sync_linked_regs() in such a vector. - Don't propagate range information and reset IDs for registers that don't fit in 6-value vector. - Push a pair {instruction index, linked registers vector} to bpf_verifier_state->jmp_history. 
- When doing backtrack_insn() check if any of recorded linked registers is currently marked precise, if so mark all linked registers as precise. This also requires fixes for two test_verifier tests: - precise: test 1 - precise: test 2 Both tests contain the following instruction sequence: 19: (bf) r2 = r9 ; R2=scalar(id=3) R9=scalar(id=3) 20: (a5) if r2 < 0x8 goto pc+1 ; R2=scalar(id=3,umin=8) 21: (95) exit 22: (07) r2 += 1 ; R2_w=scalar(id=3+1,...) 23: (bf) r1 = r10 ; R1_w=fp0 R10=fp0 24: (07) r1 += -8 ; R1_w=fp-8 25: (b7) r3 = 0 ; R3_w=0 26: (85) call bpf_probe_read_kernel#113 The call to bpf_probe_read_kernel() at (26) forces r2 to be precise. Previously, this forced all registers with same id to become precise immediately when mark_chain_precision() is called. After this change, the precision is propagated to registers sharing same id only when 'if' instruction is backtracked. Hence verification log for both tests is changed: regs=r2,r9 -> regs=r2 for instructions 25..20. Fixes: 904e6ddf4133 ("bpf: Use scalar ids in mark_chain_precision()") Reported-by: Hao Sun Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-2-eddyz87@gmail.com Closes: https://lore.kernel.org/bpf/CAEf4BzZ0xidVCqB47XnkXcNhkPWF6_nTV7yt+_Lf0kcFEut2Mg@mail.gmail.com/ --- include/linux/bpf_verifier.h | 4 + kernel/bpf/verifier.c | 245 ++++++++++++++++-- .../bpf/progs/verifier_subprog_precision.c | 2 +- .../testing/selftests/bpf/verifier/precise.c | 20 +- 4 files changed, 239 insertions(+), 32 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6503c85b10a3..731a0a4ac13c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -371,6 +371,10 @@ struct bpf_jmp_history_entry { u32 prev_idx : 22; /* special flags, e.g., whether insn is doing register stack spill/load */ u32 flags : 10; + /* additional registers that need precision tracking when this + * jump is backtracked, vector of six 10-bit records + */ + u64 linked_regs; }; /* Maximum number of register states that can exist at once */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cb5441ad75f..8dade7b51430 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3335,9 +3335,87 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].jmp_point; } +#define LR_FRAMENO_BITS 3 +#define LR_SPI_BITS 6 +#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) +#define LR_SIZE_BITS 4 +#define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1) +#define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1) +#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1) +#define LR_SPI_OFF LR_FRAMENO_BITS +#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS) +#define LINKED_REGS_MAX 6 + +struct linked_reg { + u8 frameno; + union { + u8 spi; + u8 regno; + }; + bool is_reg; +}; + +struct linked_regs { + int cnt; + struct linked_reg entries[LINKED_REGS_MAX]; +}; + +static struct linked_reg *linked_regs_push(struct linked_regs *s) +{ + if (s->cnt < LINKED_REGS_MAX) + return &s->entries[s->cnt++]; + + return NULL; +} + +/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track + * number of elements currently in stack. 
+ * Pack one history entry for linked registers as 10 bits in the following format: + * - 3-bits frameno + * - 6-bits spi_or_reg + * - 1-bit is_reg + */ +static u64 linked_regs_pack(struct linked_regs *s) +{ + u64 val = 0; + int i; + + for (i = 0; i < s->cnt; ++i) { + struct linked_reg *e = &s->entries[i]; + u64 tmp = 0; + + tmp |= e->frameno; + tmp |= e->spi << LR_SPI_OFF; + tmp |= (e->is_reg ? 1 : 0) << LR_IS_REG_OFF; + + val <<= LR_ENTRY_BITS; + val |= tmp; + } + val <<= LR_SIZE_BITS; + val |= s->cnt; + return val; +} + +static void linked_regs_unpack(u64 val, struct linked_regs *s) +{ + int i; + + s->cnt = val & LR_SIZE_MASK; + val >>= LR_SIZE_BITS; + + for (i = 0; i < s->cnt; ++i) { + struct linked_reg *e = &s->entries[i]; + + e->frameno = val & LR_FRAMENO_MASK; + e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK; + e->is_reg = (val >> LR_IS_REG_OFF) & 0x1; + val >>= LR_ENTRY_BITS; + } +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags) + int insn_flags, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; @@ -3353,6 +3431,10 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; + WARN_ONCE(env->cur_hist_ent->linked_regs != 0, + "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n", + env->insn_idx, env->cur_hist_ent->linked_regs); + env->cur_hist_ent->linked_regs = linked_regs; return 0; } @@ -3367,6 +3449,7 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; + p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; @@ -3532,6 +3615,11 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) return bt->reg_masks[bt->frame] & (1 << reg); } +static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) +{ + return bt->reg_masks[frame] & (1 << reg); +} + static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) { return bt->stack_masks[frame] & (1ull << slot); @@ -3576,6 +3664,42 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) } } +/* If any register R in hist->linked_regs is marked as precise in bt, + * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. 
+ */ +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) +{ + struct linked_regs linked_regs; + bool some_precise = false; + int i; + + if (!hist || hist->linked_regs == 0) + return; + + linked_regs_unpack(hist->linked_regs, &linked_regs); + for (i = 0; i < linked_regs.cnt; ++i) { + struct linked_reg *e = &linked_regs.entries[i]; + + if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) || + (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) { + some_precise = true; + break; + } + } + + if (!some_precise) + return; + + for (i = 0; i < linked_regs.cnt; ++i) { + struct linked_reg *e = &linked_regs.entries[i]; + + if (e->is_reg) + bt_set_frame_reg(bt, e->frameno, e->regno); + else + bt_set_frame_slot(bt, e->frameno, e->spi); + } +} + static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); /* For given verifier state backtrack_insn() is called from the last insn to @@ -3615,6 +3739,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } + /* If there is a history record that some registers gained range at this insn, + * propagate precision marks to those registers, so that bt_is_reg_set() + * accounts for these registers. + */ + bt_sync_linked_regs(bt, hist); + if (class == BPF_ALU || class == BPF_ALU64) { if (!bt_is_reg_set(bt, dreg)) return 0; @@ -3844,7 +3974,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, */ bt_set_reg(bt, dreg); bt_set_reg(bt, sreg); - /* else dreg K + } else if (BPF_SRC(insn->code) == BPF_K) { + /* dreg K * Only dreg still needs precision before * this insn, so for the K-based conditional * there is nothing new to be marked. @@ -3862,6 +3993,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* to be analyzed */ return -ENOTSUPP; } + /* Propagate precision marks to linked registers, to account for + * registers marked as precise in this function. + */ + bt_sync_linked_regs(bt, hist); return 0; } @@ -4456,7 +4591,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, if (!src_reg->id && !tnum_is_const(src_reg->var_off)) /* Ensure that src_reg has a valid ID that will be copied to - * dst_reg and then will be used by find_equal_scalars() to + * dst_reg and then will be used by sync_linked_regs() to * propagate min/max range. 
*/ src_reg->id = ++env->id_gen; @@ -4625,7 +4760,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -4930,7 +5065,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -14099,7 +14234,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, u64 val = reg_const_value(src_reg, alu32); if ((dst_reg->id & BPF_ADD_CONST) || - /* prevent overflow in find_equal_scalars() later */ + /* prevent overflow in sync_linked_regs() later */ val > (u32)S32_MAX) { /* * If the register already went through rX += val @@ -14114,7 +14249,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, } else { /* * Make sure ID is cleared otherwise dst_reg min/max could be - * incorrectly propagated into other registers by find_equal_scalars() + * incorrectly propagated into other registers by sync_linked_regs() */ dst_reg->id = 0; } @@ -14264,7 +14399,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly - * propagated into src_reg by find_equal_scalars() + * propagated into src_reg by sync_linked_regs() */ if (!is_src_reg_u32) dst_reg->id = 0; @@ -15087,14 +15222,66 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, return true; } -static void find_equal_scalars(struct bpf_verifier_state *vstate, - struct bpf_reg_state *known_reg) +static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg, + u32 id, u32 frameno, u32 spi_or_reg, bool is_reg) +{ + struct linked_reg *e; + + if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id) + return; + + e = linked_regs_push(reg_set); + if (e) { + e->frameno = frameno; + e->is_reg = is_reg; + e->regno = spi_or_reg; + } else { + reg->id = 0; + } +} + +/* For all R being scalar registers or spilled scalar registers + * in verifier state, save R in linked_regs if R->id == id. + * If there are too many Rs sharing same id, reset id for leftover Rs. + */ +static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, + struct linked_regs *linked_regs) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + id = id & ~BPF_ADD_CONST; + for (i = vstate->curframe; i >= 0; i--) { + func = vstate->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + __collect_linked_regs(linked_regs, reg, id, i, j, true); + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + __collect_linked_regs(linked_regs, reg, id, i, j, false); + } + } +} + +/* For all R in linked_regs, copy known_reg range into R + * if R->id == known_reg->id. 
+ */ +static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, + struct linked_regs *linked_regs) { struct bpf_reg_state fake_reg; - struct bpf_func_state *state; struct bpf_reg_state *reg; + struct linked_reg *e; + int i; - bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + for (i = 0; i < linked_regs->cnt; ++i) { + e = &linked_regs->entries[i]; + reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno] + : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr; if (reg->type != SCALAR_VALUE || reg == known_reg) continue; if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) @@ -15112,7 +15299,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, copy_register_state(reg, known_reg); /* * Must preserve off, id and add_const flag, - * otherwise another find_equal_scalars() will be incorrect. + * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; @@ -15120,7 +15307,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); } - })); + } } static int check_cond_jmp_op(struct bpf_verifier_env *env, @@ -15131,6 +15318,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; + struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -15245,6 +15433,21 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return 0; } + /* Push scalar registers sharing same ID to jump history, + * do this before creating 'other_branch', so that both + * 'this_branch' and 'other_branch' share this history + * if parent state is created. 
+ */ + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id) + collect_linked_regs(this_branch, src_reg->id, &linked_regs); + if (dst_reg->type == SCALAR_VALUE && dst_reg->id) + collect_linked_regs(this_branch, dst_reg->id, &linked_regs); + if (linked_regs.cnt > 1) { + err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + if (err) + return err; + } + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); if (!other_branch) @@ -15275,13 +15478,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - find_equal_scalars(this_branch, src_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + sync_linked_regs(this_branch, src_reg, &linked_regs); + sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { - find_equal_scalars(this_branch, dst_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); + sync_linked_regs(this_branch, dst_reg, &linked_regs); + sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs); } /* if one pointer register is compared to another pointer @@ -16770,7 +16973,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * * First verification path is [1-6]: * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; - * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark + * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark * r7 <= X, because r6 and r7 share same id. * Next verification path is [1-4, 6]. * @@ -17563,7 +17766,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0); + err = err ? : push_jmp_history(env, cur, 0, 0); err = err ? 
: propagate_precision(env, &sl->state); if (err) return err; @@ -17831,7 +18034,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state, 0); + err = push_jmp_history(env, state, 0, 0); if (err) return err; } diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 6a6fad625f7e..9d415f7ce599 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -278,7 +278,7 @@ __msg("mark_precise: frame0: last_idx 14 first_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4") -__msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0") +__msg("mark_precise: frame0: regs=r0,r6 stack= before 10: (bf) r6 = r0") __msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop") /* State entering callback body popped from states stack */ __msg("from 9 to 17: frame1:") diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 90643ccc221d..64d722199e8f 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -39,11 +39,11 @@ .result = VERBOSE_ACCEPT, .errstr = "mark_precise: frame0: last_idx 26 first_idx 20\ - mark_precise: frame0: regs=r2,r9 stack= before 25\ - mark_precise: frame0: regs=r2,r9 stack= before 24\ - mark_precise: frame0: regs=r2,r9 stack= before 23\ - mark_precise: frame0: regs=r2,r9 stack= before 22\ - mark_precise: frame0: regs=r2,r9 stack= before 20\ + mark_precise: frame0: regs=r2 stack= before 25\ + mark_precise: frame0: regs=r2 stack= before 24\ + mark_precise: frame0: regs=r2 stack= before 23\ + mark_precise: frame0: regs=r2 stack= before 22\ + mark_precise: frame0: regs=r2 stack= before 20\ mark_precise: frame0: parent state regs=r2,r9 stack=:\ mark_precise: frame0: last_idx 19 first_idx 10\ mark_precise: frame0: regs=r2,r9 stack= before 19\ @@ -100,11 +100,11 @@ .errstr = "26: (85) call bpf_probe_read_kernel#113\ mark_precise: frame0: last_idx 26 first_idx 22\ - mark_precise: frame0: regs=r2,r9 stack= before 25\ - mark_precise: frame0: regs=r2,r9 stack= before 24\ - mark_precise: frame0: regs=r2,r9 stack= before 23\ - mark_precise: frame0: regs=r2,r9 stack= before 22\ - mark_precise: frame0: parent state regs=r2,r9 stack=:\ + mark_precise: frame0: regs=r2 stack= before 25\ + mark_precise: frame0: regs=r2 stack= before 24\ + mark_precise: frame0: regs=r2 stack= before 23\ + mark_precise: frame0: regs=r2 stack= before 22\ + mark_precise: frame0: parent state regs=r2 stack=:\ mark_precise: frame0: last_idx 20 first_idx 20\ mark_precise: frame0: regs=r2,r9 stack= before 20\ mark_precise: frame0: parent state regs=r2,r9 stack=:\ From 842edb5507a1038e009d27e69d13b94b6f085763 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:54 -0700 Subject: [PATCH 0204/1052] bpf: Remove mark_precise_scalar_ids() Function mark_precise_scalar_ids() is superseded by bt_sync_linked_regs() and equal scalars tracking in jump history. 
mark_precise_scalar_ids() propagates precision over registers sharing same ID on parent/child state boundaries, while jump history records allow bt_sync_linked_regs() to propagate same information with instruction level granularity, which is strictly more precise. This commit removes mark_precise_scalar_ids() and updates test cases in progs/verifier_scalar_ids to reflect new verifier behavior. The tests are updated in the following manner: - mark_precise_scalar_ids() propagated precision regardless of presence of conditional jumps, while new jump history based logic only kicks in when conditional jumps are present. Hence test cases are augmented with conditional jumps to still trigger precision propagation. - As equal scalars tracking no longer relies on parent/child state boundaries some test cases are no longer interesting, such test cases are removed, namely: - precision_same_state and precision_cross_state are superseded by linked_regs_bpf_k; - precision_same_state_broken_link and equal_scalars_broken_link are superseded by linked_regs_broken_link. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-3-eddyz87@gmail.com --- kernel/bpf/verifier.c | 115 ------------ .../selftests/bpf/progs/verifier_scalar_ids.c | 171 ++++++------------ .../testing/selftests/bpf/verifier/precise.c | 8 +- 3 files changed, 59 insertions(+), 235 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8dade7b51430..f8c474e8e597 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4124,96 +4124,6 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ } } -static bool idset_contains(struct bpf_idset *s, u32 id) -{ - u32 i; - - for (i = 0; i < s->count; ++i) - if (s->ids[i] == (id & ~BPF_ADD_CONST)) - return true; - - return false; -} - -static int idset_push(struct bpf_idset *s, u32 id) -{ - if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids))) - return -EFAULT; - s->ids[s->count++] = id & ~BPF_ADD_CONST; - return 0; -} - -static void idset_reset(struct bpf_idset *s) -{ - s->count = 0; -} - -/* Collect a set of IDs for all registers currently marked as precise in env->bt. - * Mark all registers with these IDs as precise. 
- */ -static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - struct bpf_idset *precise_ids = &env->idset_scratch; - struct backtrack_state *bt = &env->bt; - struct bpf_func_state *func; - struct bpf_reg_state *reg; - DECLARE_BITMAP(mask, 64); - int i, fr; - - idset_reset(precise_ids); - - for (fr = bt->frame; fr >= 0; fr--) { - func = st->frame[fr]; - - bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); - for_each_set_bit(i, mask, 32) { - reg = &func->regs[i]; - if (!reg->id || reg->type != SCALAR_VALUE) - continue; - if (idset_push(precise_ids, reg->id)) - return -EFAULT; - } - - bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); - for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) - break; - if (!is_spilled_scalar_reg(&func->stack[i])) - continue; - reg = &func->stack[i].spilled_ptr; - if (!reg->id) - continue; - if (idset_push(precise_ids, reg->id)) - return -EFAULT; - } - } - - for (fr = 0; fr <= st->curframe; ++fr) { - func = st->frame[fr]; - - for (i = BPF_REG_0; i < BPF_REG_10; ++i) { - reg = &func->regs[i]; - if (!reg->id) - continue; - if (!idset_contains(precise_ids, reg->id)) - continue; - bt_set_frame_reg(bt, fr, i); - } - for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) { - if (!is_spilled_scalar_reg(&func->stack[i])) - continue; - reg = &func->stack[i].spilled_ptr; - if (!reg->id) - continue; - if (!idset_contains(precise_ids, reg->id)) - continue; - bt_set_frame_slot(bt, fr, i); - } - } - - return 0; -} - /* * __mark_chain_precision() backtracks BPF program instruction sequence and * chain of verifier states making sure that register *regno* (if regno >= 0) @@ -4346,31 +4256,6 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bt->frame, last_idx, first_idx, subseq_idx); } - /* If some register with scalar ID is marked as precise, - * make sure that all registers sharing this ID are also precise. - * This is needed to estimate effect of find_equal_scalars(). - * Do this at the last instruction of each state, - * bpf_reg_state::id fields are valid for these instructions. - * - * Allows to track precision in situation like below: - * - * r2 = unknown value - * ... - * --- state #0 --- - * ... - * r1 = r2 // r1 and r2 now share the same ID - * ... - * --- state #1 {r1.id = A, r2.id = A} --- - * ... - * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1 - * ... - * --- state #2 {r1.id = A, r2.id = A} --- - * r3 = r10 - * r3 += r1 // need to mark both r1 and r2 - */ - if (mark_precise_scalar_ids(env, st)) - return -EFAULT; - if (last_idx < 0) { /* we are at the entry into subprog, which * is expected for global funcs, but only if diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index 13b29a7faa71..b642d5c24154 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -5,54 +5,27 @@ #include "bpf_misc.h" /* Check that precision marks propagate through scalar IDs. - * Registers r{0,1,2} have the same scalar ID at the moment when r0 is - * marked to be precise, this mark is immediately propagated to r{1,2}. + * Registers r{0,1,2} have the same scalar ID. + * Range information is propagated for scalars sharing same ID. + * Check that precision mark for r0 causes precision marks for r{1,2} + * when range information is propagated for 'if ' insn. 
*/ SEC("socket") __success __log_level(2) -__msg("frame0: regs=r0,r1,r2 stack= before 4: (bf) r3 = r10") -__msg("frame0: regs=r0,r1,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") -__flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_same_state(void) -{ - asm volatile ( - /* r0 = random number up to 0xff */ - "call %[bpf_ktime_get_ns];" - "r0 &= 0xff;" - /* tie r0.id == r1.id == r2.id */ - "r1 = r0;" - "r2 = r0;" - /* force r0 to be precise, this immediately marks r1 and r2 as - * precise as well because of shared IDs - */ - "r3 = r10;" - "r3 += r0;" - "r0 = 0;" - "exit;" - : - : __imm(bpf_ktime_get_ns) - : __clobber_all); -} - -/* Same as precision_same_state, but mark propagates through state / - * parent state boundary. - */ -SEC("socket") -__success __log_level(2) -__msg("frame0: last_idx 6 first_idx 5 subseq_idx -1") -__msg("frame0: regs=r0,r1,r2 stack= before 5: (bf) r3 = r10") +/* first 'if' branch */ +__msg("6: (0f) r3 += r0") +__msg("frame0: regs=r0 stack= before 4: (25) if r1 > 0x7 goto pc+0") __msg("frame0: parent state regs=r0,r1,r2 stack=:") -__msg("frame0: regs=r0,r1,r2 stack= before 4: (05) goto pc+0") __msg("frame0: regs=r0,r1,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: parent state regs=r0 stack=:") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") +/* second 'if' branch */ +__msg("from 4 to 5: ") +__msg("6: (0f) r3 += r0") +__msg("frame0: regs=r0 stack= before 5: (bf) r3 = r10") +__msg("frame0: regs=r0 stack= before 4: (25) if r1 > 0x7 goto pc+0") +/* parent state already has r{0,1,2} as precise */ +__msg("frame0: parent state regs= stack=:") __flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_cross_state(void) +__naked void linked_regs_bpf_k(void) { asm volatile ( /* r0 = random number up to 0xff */ @@ -61,9 +34,8 @@ __naked void precision_cross_state(void) /* tie r0.id == r1.id == r2.id */ "r1 = r0;" "r2 = r0;" - /* force checkpoint */ - "goto +0;" - /* force r0 to be precise, this immediately marks r1 and r2 as + "if r1 > 7 goto +0;" + /* force r0 to be precise, this eventually marks r1 and r2 as * precise as well because of shared IDs */ "r3 = r10;" @@ -75,19 +47,18 @@ __naked void precision_cross_state(void) : __clobber_all); } -/* Same as precision_same_state, but break one of the +/* Same as linked_regs_bpf_k, but break one of the * links, note that r1 is absent from regs=... in __msg below. 
*/ SEC("socket") __success __log_level(2) -__msg("frame0: regs=r0,r2 stack= before 5: (bf) r3 = r10") -__msg("frame0: regs=r0,r2 stack= before 4: (b7) r1 = 0") -__msg("frame0: regs=r0,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") +__msg("7: (0f) r3 += r0") +__msg("frame0: regs=r0 stack= before 6: (bf) r3 = r10") +__msg("frame0: parent state regs=r0 stack=:") +__msg("frame0: regs=r0 stack= before 5: (25) if r0 > 0x7 goto pc+0") +__msg("frame0: parent state regs=r0,r2 stack=:") __flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_same_state_broken_link(void) +__naked void linked_regs_broken_link(void) { asm volatile ( /* r0 = random number up to 0xff */ @@ -100,54 +71,9 @@ __naked void precision_same_state_broken_link(void) * compared to the previous test */ "r1 = 0;" - /* force r0 to be precise, this immediately marks r1 and r2 as - * precise as well because of shared IDs - */ - "r3 = r10;" - "r3 += r0;" - "r0 = 0;" - "exit;" - : - : __imm(bpf_ktime_get_ns) - : __clobber_all); -} - -/* Same as precision_same_state_broken_link, but with state / - * parent state boundary. - */ -SEC("socket") -__success __log_level(2) -__msg("frame0: regs=r0,r2 stack= before 6: (bf) r3 = r10") -__msg("frame0: regs=r0,r2 stack= before 5: (b7) r1 = 0") -__msg("frame0: parent state regs=r0,r2 stack=:") -__msg("frame0: regs=r0,r1,r2 stack= before 4: (05) goto pc+0") -__msg("frame0: regs=r0,r1,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: parent state regs=r0 stack=:") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") -__flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_cross_state_broken_link(void) -{ - asm volatile ( - /* r0 = random number up to 0xff */ - "call %[bpf_ktime_get_ns];" - "r0 &= 0xff;" - /* tie r0.id == r1.id == r2.id */ - "r1 = r0;" - "r2 = r0;" - /* force checkpoint, although link between r1 and r{0,2} is - * broken by the next statement current precision tracking - * algorithm can't react to it and propagates mark for r1 to - * the parent state. 
- */ - "goto +0;" - /* break link for r1, this is the only line that differs - * compared to precision_cross_state() - */ - "r1 = 0;" - /* force r0 to be precise, this immediately marks r1 and r2 as - * precise as well because of shared IDs + "if r0 > 7 goto +0;" + /* force r0 to be precise, + * this eventually marks r2 as precise because of shared IDs */ "r3 = r10;" "r3 += r0;" @@ -164,10 +90,16 @@ __naked void precision_cross_state_broken_link(void) */ SEC("socket") __success __log_level(2) -__msg("11: (0f) r2 += r1") +__msg("12: (0f) r2 += r1") /* Current state */ -__msg("frame2: last_idx 11 first_idx 10 subseq_idx -1") -__msg("frame2: regs=r1 stack= before 10: (bf) r2 = r10") +__msg("frame2: last_idx 12 first_idx 11 subseq_idx -1 ") +__msg("frame2: regs=r1 stack= before 11: (bf) r2 = r10") +__msg("frame2: parent state regs=r1 stack=") +__msg("frame1: parent state regs= stack=") +__msg("frame0: parent state regs= stack=") +/* Parent state */ +__msg("frame2: last_idx 10 first_idx 10 subseq_idx 11 ") +__msg("frame2: regs=r1 stack= before 10: (25) if r1 > 0x7 goto pc+0") __msg("frame2: parent state regs=r1 stack=") /* frame1.r{6,7} are marked because mark_precise_scalar_ids() * looks for all registers with frame2.r1.id in the current state @@ -192,7 +124,7 @@ __msg("frame1: regs=r1 stack= before 4: (85) call pc+1") __msg("frame0: parent state regs=r1,r6 stack=") /* Parent state */ __msg("frame0: last_idx 3 first_idx 1 subseq_idx 4") -__msg("frame0: regs=r0,r1,r6 stack= before 3: (bf) r6 = r0") +__msg("frame0: regs=r1,r6 stack= before 3: (bf) r6 = r0") __msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") __msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") __flag(BPF_F_TEST_STATE_FREQ) @@ -230,7 +162,8 @@ static __naked __noinline __used void precision_many_frames__bar(void) { asm volatile ( - /* force r1 to be precise, this immediately marks: + "if r1 > 7 goto +0;" + /* force r1 to be precise, this eventually marks: * - bar frame r1 * - foo frame r{1,6,7} * - main frame r{1,6} @@ -247,14 +180,16 @@ void precision_many_frames__bar(void) */ SEC("socket") __success __log_level(2) +__msg("11: (0f) r2 += r1") /* foo frame */ -__msg("frame1: regs=r1 stack=-8,-16 before 9: (bf) r2 = r10") +__msg("frame1: regs=r1 stack= before 10: (bf) r2 = r10") +__msg("frame1: regs=r1 stack= before 9: (25) if r1 > 0x7 goto pc+0") __msg("frame1: regs=r1 stack=-8,-16 before 8: (7b) *(u64 *)(r10 -16) = r1") __msg("frame1: regs=r1 stack=-8 before 7: (7b) *(u64 *)(r10 -8) = r1") __msg("frame1: regs=r1 stack= before 4: (85) call pc+2") /* main frame */ -__msg("frame0: regs=r0,r1 stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r1") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") +__msg("frame0: regs=r1 stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r1") +__msg("frame0: regs=r1 stack= before 2: (bf) r1 = r0") __msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") __flag(BPF_F_TEST_STATE_FREQ) __naked void precision_stack(void) @@ -283,7 +218,8 @@ void precision_stack__foo(void) */ "*(u64*)(r10 - 8) = r1;" "*(u64*)(r10 - 16) = r1;" - /* force r1 to be precise, this immediately marks: + "if r1 > 7 goto +0;" + /* force r1 to be precise, this eventually marks: * - foo frame r1,fp{-8,-16} * - main frame r1,fp{-8} */ @@ -299,15 +235,17 @@ void precision_stack__foo(void) SEC("socket") __success __log_level(2) /* r{6,7} */ -__msg("11: (0f) r3 += r7") -__msg("frame0: regs=r6,r7 stack= before 10: (bf) r3 = r10") +__msg("12: (0f) r3 += r7") +__msg("frame0: regs=r7 stack= before 11: (bf) r3 = r10") +__msg("frame0: 
regs=r7 stack= before 9: (25) if r7 > 0x7 goto pc+0")
 /* ... skip some insns ... */
 __msg("frame0: regs=r6,r7 stack= before 3: (bf) r7 = r0")
 __msg("frame0: regs=r0,r6 stack= before 2: (bf) r6 = r0")
 /* r{8,9} */
-__msg("12: (0f) r3 += r9")
-__msg("frame0: regs=r8,r9 stack= before 11: (0f) r3 += r7")
+__msg("13: (0f) r3 += r9")
+__msg("frame0: regs=r9 stack= before 12: (0f) r3 += r7")
 /* ... skip some insns ... */
+__msg("frame0: regs=r9 stack= before 10: (25) if r9 > 0x7 goto pc+0")
 __msg("frame0: regs=r8,r9 stack= before 7: (bf) r9 = r0")
 __msg("frame0: regs=r0,r8 stack= before 6: (bf) r8 = r0")
 __flag(BPF_F_TEST_STATE_FREQ)
@@ -328,8 +266,9 @@ __naked void precision_two_ids(void)
 "r9 = r0;"
 /* clear r0 id */
 "r0 = 0;"
- /* force checkpoint */
- "goto +0;"
+ /* propagate equal scalars precision */
+ "if r7 > 7 goto +0;"
+ "if r9 > 7 goto +0;"
 "r3 = r10;"
 /* force r7 to be precise, this also marks r6 */
 "r3 += r7;"
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
index 64d722199e8f..59a020c35647 100644
--- a/tools/testing/selftests/bpf/verifier/precise.c
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -106,7 +106,7 @@
 mark_precise: frame0: regs=r2 stack= before 22\
 mark_precise: frame0: parent state regs=r2 stack=:\
 mark_precise: frame0: last_idx 20 first_idx 20\
- mark_precise: frame0: regs=r2,r9 stack= before 20\
+ mark_precise: frame0: regs=r2 stack= before 20\
 mark_precise: frame0: parent state regs=r2,r9 stack=:\
 mark_precise: frame0: last_idx 19 first_idx 17\
 mark_precise: frame0: regs=r2,r9 stack= before 19\
@@ -183,10 +183,10 @@
 .prog_type = BPF_PROG_TYPE_XDP,
 .flags = BPF_F_TEST_STATE_FREQ,
 .errstr = "mark_precise: frame0: last_idx 7 first_idx 7\
- mark_precise: frame0: parent state regs=r4 stack=-8:\
+ mark_precise: frame0: parent state regs=r4 stack=:\
 mark_precise: frame0: last_idx 6 first_idx 4\
- mark_precise: frame0: regs=r4 stack=-8 before 6: (b7) r0 = -1\
- mark_precise: frame0: regs=r4 stack=-8 before 5: (79) r4 = *(u64 *)(r10 -8)\
+ mark_precise: frame0: regs=r4 stack= before 6: (b7) r0 = -1\
+ mark_precise: frame0: regs=r4 stack= before 5: (79) r4 = *(u64 *)(r10 -8)\
 mark_precise: frame0: regs= stack=-8 before 4: (7b) *(u64 *)(r3 -8) = r0\
 mark_precise: frame0: parent state regs=r0 stack=:\
 mark_precise: frame0: last_idx 3 first_idx 3\

From bebc17b1c03b224a0b4aec6a171815e39f8ba9bc Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Thu, 18 Jul 2024 13:23:55 -0700
Subject: [PATCH 0205/1052] selftests/bpf: Tests for per-insn sync_linked_regs() precision tracking

Add a few test cases to verify precision tracking for scalars gaining
range because of sync_linked_regs():
- check what happens when more than 6 registers might gain range in
  sync_linked_regs();
- check if precision is propagated correctly when an operand of a
  conditional jump gained range in sync_linked_regs() and one of the
  linked registers is marked precise;
- check if precision is propagated correctly when an operand of a
  conditional jump gained range in sync_linked_regs() and another
  linked operand of the conditional jump is marked precise;
- add a minimized reproducer for the precision tracking bug reported
  in [0];
- check that mark_chain_precision() for one of the conditional jump
  operands does not trigger equal scalars precision propagation.
[0] https://lore.kernel.org/bpf/CAEf4BzZ0xidVCqB47XnkXcNhkPWF6_nTV7yt+_Lf0kcFEut2Mg@mail.gmail.com/ Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-4-eddyz87@gmail.com --- .../selftests/bpf/progs/verifier_scalar_ids.c | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index b642d5c24154..2ecf77b623e0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -47,6 +47,72 @@ __naked void linked_regs_bpf_k(void) : __clobber_all); } +/* Registers r{0,1,2} share same ID when 'if r1 > ...' insn is processed, + * check that verifier marks r{1,2} as precise while backtracking + * 'if r1 > ...' with r0 already marked. + */ +SEC("socket") +__success __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("frame0: regs=r0 stack= before 5: (2d) if r1 > r3 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3 stack=:") +__msg("frame0: regs=r0,r1,r2,r3 stack= before 4: (b7) r3 = 7") +__naked void linked_regs_bpf_x_src(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r0.id == r1.id == r2.id */ + "r1 = r0;" + "r2 = r0;" + "r3 = 7;" + "if r1 > r3 goto +0;" + /* force r0 to be precise, this eventually marks r1 and r2 as + * precise as well because of shared IDs + */ + "r4 = r10;" + "r4 += r0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +/* Registers r{0,1,2} share same ID when 'if r1 > r3' insn is processed, + * check that verifier marks r{0,1,2} as precise while backtracking + * 'if r1 > r3' with r3 already marked. + */ +SEC("socket") +__success __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("frame0: regs=r3 stack= before 5: (2d) if r1 > r3 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3 stack=:") +__msg("frame0: regs=r0,r1,r2,r3 stack= before 4: (b7) r3 = 7") +__naked void linked_regs_bpf_x_dst(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r0.id == r1.id == r2.id */ + "r1 = r0;" + "r2 = r0;" + "r3 = 7;" + "if r1 > r3 goto +0;" + /* force r0 to be precise, this eventually marks r1 and r2 as + * precise as well because of shared IDs + */ + "r4 = r10;" + "r4 += r3;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + /* Same as linked_regs_bpf_k, but break one of the * links, note that r1 is absent from regs=... in __msg below. */ @@ -280,6 +346,105 @@ __naked void precision_two_ids(void) : __clobber_all); } +SEC("socket") +__success __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +/* check thar r0 and r6 have different IDs after 'if', + * collect_linked_regs() can't tie more than 6 registers for a single insn. 
+ */ +__msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") +__msg("9: (bf) r6 = r6 ; R6_w=scalar(id=2") +/* check that r{0-5} are marked precise after 'if' */ +__msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") +__naked void linked_regs_too_many_regs(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r{0-6} IDs */ + "r1 = r0;" + "r2 = r0;" + "r3 = r0;" + "r4 = r0;" + "r5 = r0;" + "r6 = r0;" + /* propagate range for r{0-6} */ + "if r0 > 7 goto +0;" + /* make r6 appear in the log */ + "r6 = r6;" + /* force r0 to be precise, + * this would cause r{0-4} to be precise because of shared IDs + */ + "r7 = r10;" + "r7 += r0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__failure __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("regs=r7 stack= before 5: (3d) if r8 >= r0") +__msg("parent state regs=r0,r7,r8") +__msg("regs=r0,r7,r8 stack= before 4: (25) if r0 > 0x1") +__msg("div by zero") +__naked void linked_regs_broken_link_2(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r8 = r0;" + "call %[bpf_get_prandom_u32];" + "if r0 > 1 goto +0;" + /* r7.id == r8.id, + * thus r7 precision implies r8 precision, + * which implies r0 precision because of the conditional below. + */ + "if r8 >= r0 goto 1f;" + /* break id relation between r7 and r8 */ + "r8 += r8;" + /* make r7 precise */ + "if r7 == 0 goto 1f;" + "r0 /= 0;" +"1:" + "r0 = 42;" + "exit;" + : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Check that mark_chain_precision() for one of the conditional jump + * operands does not trigger equal scalars precision propagation. + */ +SEC("socket") +__success __log_level(2) +__msg("3: (25) if r1 > 0x100 goto pc+0") +__msg("frame0: regs=r1 stack= before 2: (bf) r1 = r0") +__naked void cjmp_no_linked_regs_trigger(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r0.id == r1.id */ + "r1 = r0;" + /* the jump below would be predicted, thus r1 would be marked precise, + * this should not imply precision mark for r0 + */ + "if r1 > 256 goto +0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + /* Verify that check_ids() is used by regsafe() for scalars. * * r9 = ... some pointer with range X ... From cfbf25481d6dec0089c99c9d33a2ea634fe8f008 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:56 -0700 Subject: [PATCH 0206/1052] selftests/bpf: Update comments find_equal_scalars->sync_linked_regs find_equal_scalars() is renamed to sync_linked_regs(), this commit updates existing references in the selftests comments. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-5-eddyz87@gmail.com --- .../selftests/bpf/progs/verifier_spill_fill.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 85e48069c9e6..9d288ec7a168 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -402,7 +402,7 @@ __naked void spill_32bit_of_64bit_fail(void) *(u32*)(r10 - 8) = r1; \ /* 32-bit fill r2 from stack. 
*/ \ r2 = *(u32*)(r10 - 8); \ - /* Compare r2 with another register to trigger find_equal_scalars.\ + /* Compare r2 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. If the ID was mistakenly preserved on spill, this would\ * cause the verifier to think that r1 is also equal to zero in one of\ @@ -441,7 +441,7 @@ __naked void spill_16bit_of_32bit_fail(void) *(u16*)(r10 - 8) = r1; \ /* 16-bit fill r2 from stack. */ \ r2 = *(u16*)(r10 - 8); \ - /* Compare r2 with another register to trigger find_equal_scalars.\ + /* Compare r2 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. If the ID was mistakenly preserved on spill, this would\ * cause the verifier to think that r1 is also equal to zero in one of\ @@ -833,7 +833,7 @@ __naked void spill_64bit_of_64bit_ok(void) *(u64*)(r10 - 8) = r0; \ /* 64-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u64*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -866,7 +866,7 @@ __naked void spill_32bit_of_32bit_ok(void) *(u32*)(r10 - 8) = r0; \ /* 32-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u32*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -899,7 +899,7 @@ __naked void spill_16bit_of_16bit_ok(void) *(u16*)(r10 - 8) = r0; \ /* 16-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u16*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -932,7 +932,7 @@ __naked void spill_8bit_of_8bit_ok(void) *(u8*)(r10 - 8) = r0; \ /* 8-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u8*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -1029,7 +1029,7 @@ __naked void fill_32bit_after_spill_64bit_preserve_id(void) "r1 = *(u32*)(r10 - 4);" #endif " \ - /* Compare r1 with another register to trigger find_equal_scalars. */\ + /* Compare r1 with another register to trigger sync_linked_regs. */\ r2 = 0; \ if r1 != r2 goto l0_%=; \ /* The result of this comparison is predefined. */\ @@ -1070,7 +1070,7 @@ __naked void fill_32bit_after_spill_64bit_clear_id(void) "r2 = *(u32*)(r10 - 4);" #endif " \ - /* Compare r2 with another register to trigger find_equal_scalars.\ + /* Compare r2 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. 
If the ID was mistakenly preserved on fill, this would\
 * cause the verifier to think that r1 is also equal to zero in one of\

From 116e04ba1459fc08f80cf27b8c9f9f188be0fcb2 Mon Sep 17 00:00:00 2001
From: Leon Hwang
Date: Sun, 14 Jul 2024 20:39:00 +0800
Subject: [PATCH 0207/1052] bpf, x64: Fix tailcall hierarchy

This patch fixes a tailcall issue caused by abusing the tailcall in
bpf2bpf feature.

As we know, tail_call_cnt propagates by rax from caller to callee when
calling a subprog in a tailcall context. But, like the following
example, MAX_TAIL_CALL_CNT won't work because of missing tail_call_cnt
back-propagation from callee to caller.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"

struct {
 __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
 __uint(max_entries, 1);
 __uint(key_size, sizeof(__u32));
 __uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

int count = 0;

static __noinline
int subprog_tail1(struct __sk_buff *skb)
{
 bpf_tail_call_static(skb, &jmp_table, 0);
 return 0;
}

static __noinline
int subprog_tail2(struct __sk_buff *skb)
{
 bpf_tail_call_static(skb, &jmp_table, 0);
 return 0;
}

SEC("tc")
int entry(struct __sk_buff *skb)
{
 volatile int ret = 1;

 count++;
 subprog_tail1(skb);
 subprog_tail2(skb);

 return ret;
}

char __license[] SEC("license") = "GPL";

At run time, the tail_call_cnt in entry() will be propagated to
subprog_tail1() and subprog_tail2(). But when the tail_call_cnt in
subprog_tail1() is updated by bpf_tail_call_static(), the tail_call_cnt
in entry() won't be updated at the same time. As a result, in entry(),
when the tail_call_cnt in entry() is less than MAX_TAIL_CALL_CNT and
subprog_tail1() returns because of the MAX_TAIL_CALL_CNT limit,
bpf_tail_call_static() in subprog_tail2() is still able to run, because
the tail_call_cnt in subprog_tail2() propagated from entry() is less
than MAX_TAIL_CALL_CNT.

So, how many tailcalls are there in this case if no error happens?
Viewed top-down, the calls form a hierarchy, layer by layer: there will
be 2+4+8+...+2^33 = 2^34 - 2 = 17,179,869,182 tailcalls for this case.
And with N subprog_tail() calls in entry(), there will be almost N^34
tailcalls.

This patch resolves the case on x86_64. Instead of propagating
tail_call_cnt from caller to callee, it propagates a pointer to it,
tail_call_cnt_ptr (tcc_ptr for short).

Where is tail_call_cnt stored, then? It is stored on the stack of the
main prog. When a tail call happens in a subprog, tail_call_cnt is
incremented through tcc_ptr. Meanwhile, tail_call_cnt_ptr is stored on
the stack of the main prog, too. And, before jumping to the tail
callee, both tail_call_cnt and tail_call_cnt_ptr have to be popped.

Then, the prologue of a subprog must not turn rax into
tail_call_cnt_ptr again; it has to reuse the tail_call_cnt_ptr from its
caller. As a result, at run time the prologue has to recognize whether
rax holds tail_call_cnt or tail_call_cnt_ptr:

1. rax is tail_call_cnt if rax is <= MAX_TAIL_CALL_CNT.
2. rax is tail_call_cnt_ptr if rax is > MAX_TAIL_CALL_CNT, because a
   pointer won't be <= MAX_TAIL_CALL_CNT.

Here's an example and its JITed dump.
struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); __uint(max_entries, 1); __uint(key_size, sizeof(__u32)); __uint(value_size, sizeof(__u32)); } jmp_table SEC(".maps"); int count = 0; static __noinline int subprog_tail(struct __sk_buff *skb) { bpf_tail_call_static(skb, &jmp_table, 0); return 0; } SEC("tc") int entry(struct __sk_buff *skb) { int ret = 1; count++; subprog_tail(skb); subprog_tail(skb); return ret; } When bpftool p d j id 42: int entry(struct __sk_buff * skb): bpf_prog_0c0f4c2413ef19b1_entry: ; int entry(struct __sk_buff *skb) 0: endbr64 4: nopl (%rax,%rax) 9: xorq %rax, %rax ;; rax = 0 (tail_call_cnt) c: pushq %rbp d: movq %rsp, %rbp 10: endbr64 14: cmpq $33, %rax ;; if rax > 33, rax = tcc_ptr 18: ja 0x20 ;; if rax > 33 goto 0x20 ---+ 1a: pushq %rax ;; [rbp - 8] = rax = 0 | 1b: movq %rsp, %rax ;; rax = rbp - 8 | 1e: jmp 0x21 ;; ---------+ | 20: pushq %rax ;; <--------|---------------+ 21: pushq %rax ;; <--------+ [rbp - 16] = rax 22: pushq %rbx ;; callee saved 23: movq %rdi, %rbx ;; rbx = skb (callee saved) ; count++; 26: movabsq $-82417199407104, %rdi 30: movl (%rdi), %esi 33: addl $1, %esi 36: movl %esi, (%rdi) ; subprog_tail(skb); 39: movq %rbx, %rdi ;; rdi = skb 3c: movq -16(%rbp), %rax ;; rax = tcc_ptr 43: callq 0x80 ;; call subprog_tail() ; subprog_tail(skb); 48: movq %rbx, %rdi ;; rdi = skb 4b: movq -16(%rbp), %rax ;; rax = tcc_ptr 52: callq 0x80 ;; call subprog_tail() ; return ret; 57: movl $1, %eax 5c: popq %rbx 5d: leave 5e: retq int subprog_tail(struct __sk_buff * skb): bpf_prog_3a140cef239a4b4f_subprog_tail: ; int subprog_tail(struct __sk_buff *skb) 0: endbr64 4: nopl (%rax,%rax) 9: nopl (%rax) ;; do not touch tail_call_cnt c: pushq %rbp d: movq %rsp, %rbp 10: endbr64 14: pushq %rax ;; [rbp - 8] = rax (tcc_ptr) 15: pushq %rax ;; [rbp - 16] = rax (tcc_ptr) 16: pushq %rbx ;; callee saved 17: pushq %r13 ;; callee saved 19: movq %rdi, %rbx ;; rbx = skb ; asm volatile("r1 = %[ctx]\n\t" 1c: movabsq $-105487587488768, %r13 ;; r13 = jmp_table 26: movq %rbx, %rdi ;; 1st arg, skb 29: movq %r13, %rsi ;; 2nd arg, jmp_table 2c: xorl %edx, %edx ;; 3rd arg, index = 0 2e: movq -16(%rbp), %rax ;; rax = [rbp - 16] (tcc_ptr) 35: cmpq $33, (%rax) 39: jae 0x4e ;; if *tcc_ptr >= 33 goto 0x4e --------+ 3b: jmp 0x4e ;; jmp bypass, toggled by poking | 40: addq $1, (%rax) ;; (*tcc_ptr)++ | 44: popq %r13 ;; callee saved | 46: popq %rbx ;; callee saved | 47: popq %rax ;; undo rbp-16 push | 48: popq %rax ;; undo rbp-8 push | 49: nopl (%rax,%rax) ;; tail call target, toggled by poking | ; return 0; ;; | 4e: popq %r13 ;; restore callee saved <--------------+ 50: popq %rbx ;; restore callee saved 51: leave 52: retq Furthermore, when trampoline is the caller of bpf prog, which is tail_call_reachable, it is required to propagate rax through trampoline. 
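To make the count-vs-pointer trick concrete, below is a minimal user-space
sketch of the disambiguation rule described above. It is an illustration
only, not the kernel's JIT code: the helper name resolve_tcc_ptr is made up,
and the sketch merely assumes MAX_TAIL_CALL_CNT == 33 and that any real
stack address is far larger than 33.

#include <stdint.h>
#include <stdio.h>

#define MAX_TAIL_CALL_CNT 33

/* Mimic the prologue decision: a value <= MAX_TAIL_CALL_CNT is the count
 * itself (entry or tail callee of the main prog), so spill it and hand back
 * its address; anything larger must already be tail_call_cnt_ptr, so reuse it.
 */
static uint64_t *resolve_tcc_ptr(uint64_t rax, uint64_t *main_prog_slot)
{
 if (rax <= MAX_TAIL_CALL_CNT) {
  *main_prog_slot = rax;   /* "push rax" */
  return main_prog_slot;   /* "mov rax, rsp" */
 }
 return (uint64_t *)rax;   /* already a pointer, reuse it */
}

int main(void)
{
 uint64_t slot;
 /* the main prog enters with rax = 0 */
 uint64_t *tcc_ptr = resolve_tcc_ptr(0, &slot);
 /* a subprog enters with rax = tcc_ptr and keeps using the same slot */
 uint64_t *sub_ptr = resolve_tcc_ptr((uint64_t)tcc_ptr, &slot);

 (*sub_ptr)++;   /* a tail call in the subprog bumps the shared counter */
 printf("shared tail_call_cnt = %llu\n", (unsigned long long)*tcc_ptr);
 return 0;
}

Running this prints "shared tail_call_cnt = 1": the main prog and its
subprogs all increment the single counter slot that lives on the main
prog's stack, which is what keeps MAX_TAIL_CALL_CNT effective across the
whole hierarchy.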
Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT") Reviewed-by: Eduard Zingerman Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240714123902.32305-2-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- arch/x86/net/bpf_jit_comp.c | 107 ++++++++++++++++++++++++++---------- 1 file changed, 79 insertions(+), 28 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d25d81c8ecc0..074b41fafbe3 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -273,7 +273,7 @@ struct jit_context { /* Number of bytes emit_patch() needs to generate instructions */ #define X86_PATCH_SIZE 5 /* Number of bytes that will be skipped on tailcall */ -#define X86_TAIL_CALL_OFFSET (11 + ENDBR_INSN_SIZE) +#define X86_TAIL_CALL_OFFSET (12 + ENDBR_INSN_SIZE) static void push_r12(u8 **pprog) { @@ -403,6 +403,37 @@ static void emit_cfi(u8 **pprog, u32 hash) *pprog = prog; } +static void emit_prologue_tail_call(u8 **pprog, bool is_subprog) +{ + u8 *prog = *pprog; + + if (!is_subprog) { + /* cmp rax, MAX_TAIL_CALL_CNT */ + EMIT4(0x48, 0x83, 0xF8, MAX_TAIL_CALL_CNT); + EMIT2(X86_JA, 6); /* ja 6 */ + /* rax is tail_call_cnt if <= MAX_TAIL_CALL_CNT. + * case1: entry of main prog. + * case2: tail callee of main prog. + */ + EMIT1(0x50); /* push rax */ + /* Make rax as tail_call_cnt_ptr. */ + EMIT3(0x48, 0x89, 0xE0); /* mov rax, rsp */ + EMIT2(0xEB, 1); /* jmp 1 */ + /* rax is tail_call_cnt_ptr if > MAX_TAIL_CALL_CNT. + * case: tail callee of subprog. + */ + EMIT1(0x50); /* push rax */ + /* push tail_call_cnt_ptr */ + EMIT1(0x50); /* push rax */ + } else { /* is_subprog */ + /* rax is tail_call_cnt_ptr. */ + EMIT1(0x50); /* push rax */ + EMIT1(0x50); /* push rax */ + } + + *pprog = prog; +} + /* * Emit x86-64 prologue code for BPF program. * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes @@ -424,10 +455,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, /* When it's the entry of the whole tailcall context, * zeroing rax means initialising tail_call_cnt. */ - EMIT2(0x31, 0xC0); /* xor eax, eax */ + EMIT3(0x48, 0x31, 0xC0); /* xor rax, rax */ else /* Keep the same instruction layout. */ - EMIT2(0x66, 0x90); /* nop2 */ + emit_nops(&prog, 3); /* nop3 */ } /* Exception callback receives FP as third parameter */ if (is_exception_cb) { @@ -453,7 +484,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, if (stack_depth) EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); if (tail_call_reachable) - EMIT1(0x50); /* push rax */ + emit_prologue_tail_call(&prog, is_subprog); *pprog = prog; } @@ -589,13 +620,15 @@ static void emit_return(u8 **pprog, u8 *ip) *pprog = prog; } +#define BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack) (-16 - round_up(stack, 8)) + /* * Generate the following code: * * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... 
* if (index >= array->map.max_entries) * goto out; - * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) + * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT) * goto out; * prog = array->ptrs[index]; * if (prog == NULL) @@ -608,7 +641,7 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, u32 stack_depth, u8 *ip, struct jit_context *ctx) { - int tcc_off = -4 - round_up(stack_depth, 8); + int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth); u8 *prog = *pprog, *start = *pprog; int offset; @@ -630,16 +663,14 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, EMIT2(X86_JBE, offset); /* jbe out */ /* - * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) + * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ - EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ + EMIT3_off32(0x48, 0x8B, 0x85, tcc_ptr_off); /* mov rax, qword ptr [rbp - tcc_ptr_off] */ + EMIT4(0x48, 0x83, 0x38, MAX_TAIL_CALL_CNT); /* cmp qword ptr [rax], MAX_TAIL_CALL_CNT */ offset = ctx->tail_call_indirect_label - (prog + 2 - start); EMIT2(X86_JAE, offset); /* jae out */ - EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ /* prog = array->ptrs[index]; */ EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */ @@ -654,6 +685,9 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, offset = ctx->tail_call_indirect_label - (prog + 2 - start); EMIT2(X86_JE, offset); /* je out */ + /* Inc tail_call_cnt if the slot is populated. */ + EMIT4(0x48, 0x83, 0x00, 0x01); /* add qword ptr [rax], 1 */ + if (bpf_prog->aux->exception_boundary) { pop_callee_regs(&prog, all_callee_regs_used); pop_r12(&prog); @@ -663,6 +697,11 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, pop_r12(&prog); } + /* Pop tail_call_cnt_ptr. */ + EMIT1(0x58); /* pop rax */ + /* Pop tail_call_cnt, if it's main prog. + * Pop tail_call_cnt_ptr, if it's subprog. + */ EMIT1(0x58); /* pop rax */ if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */ @@ -691,21 +730,19 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog, bool *callee_regs_used, u32 stack_depth, struct jit_context *ctx) { - int tcc_off = -4 - round_up(stack_depth, 8); + int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth); u8 *prog = *pprog, *start = *pprog; int offset; /* - * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) + * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ - EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ + EMIT3_off32(0x48, 0x8B, 0x85, tcc_ptr_off); /* mov rax, qword ptr [rbp - tcc_ptr_off] */ + EMIT4(0x48, 0x83, 0x38, MAX_TAIL_CALL_CNT); /* cmp qword ptr [rax], MAX_TAIL_CALL_CNT */ offset = ctx->tail_call_direct_label - (prog + 2 - start); EMIT2(X86_JAE, offset); /* jae out */ - EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ poke->tailcall_bypass = ip + (prog - start); poke->adj_off = X86_TAIL_CALL_OFFSET; @@ -715,6 +752,9 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog, emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, poke->tailcall_bypass); + /* Inc tail_call_cnt if the slot is populated. 
*/ + EMIT4(0x48, 0x83, 0x00, 0x01); /* add qword ptr [rax], 1 */ + if (bpf_prog->aux->exception_boundary) { pop_callee_regs(&prog, all_callee_regs_used); pop_r12(&prog); @@ -724,6 +764,11 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog, pop_r12(&prog); } + /* Pop tail_call_cnt_ptr. */ + EMIT1(0x58); /* pop rax */ + /* Pop tail_call_cnt, if it's main prog. + * Pop tail_call_cnt_ptr, if it's subprog. + */ EMIT1(0x58); /* pop rax */ if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); @@ -1311,9 +1356,11 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op) #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) -/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ -#define RESTORE_TAIL_CALL_CNT(stack) \ - EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8) +#define __LOAD_TCC_PTR(off) \ + EMIT3_off32(0x48, 0x8B, 0x85, off) +/* mov rax, qword ptr [rbp - rounded_stack_depth - 16] */ +#define LOAD_TAIL_CALL_CNT_PTR(stack) \ + __LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack)) static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) @@ -2031,7 +2078,7 @@ st: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { - RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth); + LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth); ip += 7; } if (!imm32) @@ -2706,6 +2753,10 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, return 0; } +/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ +#define LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack) \ + __LOAD_TCC_PTR(-round_up(stack, 8) - 8) + /* Example: * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); * its 'struct btf_func_model' will be nr_args=2 @@ -2826,7 +2877,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im * [ ... ] * [ stack_arg2 ] * RBP - arg_stack_off [ stack_arg1 ] - * RSP [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX + * RSP [ tail_call_cnt_ptr ] BPF_TRAMP_F_TAIL_CALL_CTX */ /* room for return value of orig_call or fentry prog */ @@ -2955,10 +3006,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im save_args(m, &prog, arg_stack_off, true); if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) { - /* Before calling the original function, restore the - * tail_call_cnt from stack to rax. + /* Before calling the original function, load the + * tail_call_cnt_ptr from stack to rax. */ - RESTORE_TAIL_CALL_CNT(stack_size); + LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack_size); } if (flags & BPF_TRAMP_F_ORIG_STACK) { @@ -3017,10 +3068,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im goto cleanup; } } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) { - /* Before running the original function, restore the - * tail_call_cnt from stack to rax. + /* Before running the original function, load the + * tail_call_cnt_ptr from stack to rax. */ - RESTORE_TAIL_CALL_CNT(stack_size); + LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack_size); } /* restore return value of orig_call or fentry prog back into RAX */ From 66ff4d61dc124eafe9efaeaef696a09b7f236da2 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 14 Jul 2024 20:39:01 +0800 Subject: [PATCH 0208/1052] bpf, arm64: Fix tailcall hierarchy This patch fixes a tailcall issue caused by abusing the tailcall in bpf2bpf feature on arm64 like the way of "bpf, x64: Fix tailcall hierarchy". 
On arm64, when a tail call happens, it uses tail_call_cnt_ptr to increment tail_call_cnt, too. At the prologue of main prog, it has to initialize tail_call_cnt and prepare tail_call_cnt_ptr. At the prologue of subprog, it pushes x26 register twice, and does not initialize tail_call_cnt. At the epilogue, it pops x26 twice, no matter whether it is main prog or subprog. Fixes: d4609a5d8c70 ("bpf, arm64: Keep tail call count across bpf2bpf calls") Acked-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240714123902.32305-3-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- arch/arm64/net/bpf_jit_comp.c | 57 +++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index dd0bb069df4b..59e05a7aea56 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -26,7 +26,7 @@ #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) -#define TCALL_CNT (MAX_BPF_JIT_REG + 2) +#define TCCNT_PTR (MAX_BPF_JIT_REG + 2) #define TMP_REG_3 (MAX_BPF_JIT_REG + 3) #define FP_BOTTOM (MAX_BPF_JIT_REG + 4) #define ARENA_VM_START (MAX_BPF_JIT_REG + 5) @@ -63,8 +63,8 @@ static const int bpf2a64[] = { [TMP_REG_1] = A64_R(10), [TMP_REG_2] = A64_R(11), [TMP_REG_3] = A64_R(12), - /* tail_call_cnt */ - [TCALL_CNT] = A64_R(26), + /* tail_call_cnt_ptr */ + [TCCNT_PTR] = A64_R(26), /* temporary register for blinding constants */ [BPF_REG_AX] = A64_R(9), [FP_BOTTOM] = A64_R(27), @@ -282,13 +282,35 @@ static bool is_lsi_offset(int offset, int scale) * mov x29, sp * stp x19, x20, [sp, #-16]! * stp x21, x22, [sp, #-16]! - * stp x25, x26, [sp, #-16]! + * stp x26, x25, [sp, #-16]! + * stp x26, x25, [sp, #-16]! * stp x27, x28, [sp, #-16]! * mov x25, sp * mov tcc, #0 * // PROLOGUE_OFFSET */ +static void prepare_bpf_tail_call_cnt(struct jit_ctx *ctx) +{ + const struct bpf_prog *prog = ctx->prog; + const bool is_main_prog = !bpf_is_subprog(prog); + const u8 ptr = bpf2a64[TCCNT_PTR]; + const u8 fp = bpf2a64[BPF_REG_FP]; + const u8 tcc = ptr; + + emit(A64_PUSH(ptr, fp, A64_SP), ctx); + if (is_main_prog) { + /* Initialize tail_call_cnt. */ + emit(A64_MOVZ(1, tcc, 0, 0), ctx); + emit(A64_PUSH(tcc, fp, A64_SP), ctx); + emit(A64_MOV(1, ptr, A64_SP), ctx); + } else { + emit(A64_PUSH(ptr, fp, A64_SP), ctx); + emit(A64_NOP, ctx); + emit(A64_NOP, ctx); + } +} + #define BTI_INSNS (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) ? 1 : 0) #define PAC_INSNS (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) ? 
1 : 0) @@ -296,7 +318,7 @@ static bool is_lsi_offset(int offset, int scale) #define POKE_OFFSET (BTI_INSNS + 1) /* Tail call offset to jump into */ -#define PROLOGUE_OFFSET (BTI_INSNS + 2 + PAC_INSNS + 8) +#define PROLOGUE_OFFSET (BTI_INSNS + 2 + PAC_INSNS + 10) static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, bool is_exception_cb, u64 arena_vm_start) @@ -308,7 +330,6 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, const u8 r8 = bpf2a64[BPF_REG_8]; const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; - const u8 tcc = bpf2a64[TCALL_CNT]; const u8 fpb = bpf2a64[FP_BOTTOM]; const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const int idx0 = ctx->idx; @@ -359,7 +380,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, /* Save callee-saved registers */ emit(A64_PUSH(r6, r7, A64_SP), ctx); emit(A64_PUSH(r8, r9, A64_SP), ctx); - emit(A64_PUSH(fp, tcc, A64_SP), ctx); + prepare_bpf_tail_call_cnt(ctx); emit(A64_PUSH(fpb, A64_R(28), A64_SP), ctx); } else { /* @@ -372,18 +393,15 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, * callee-saved registers. The exception callback will not push * anything and re-use the main program's stack. * - * 10 registers are on the stack + * 12 registers are on the stack */ - emit(A64_SUB_I(1, A64_SP, A64_FP, 80), ctx); + emit(A64_SUB_I(1, A64_SP, A64_FP, 96), ctx); } /* Set up BPF prog stack base register */ emit(A64_MOV(1, fp, A64_SP), ctx); if (!ebpf_from_cbpf && is_main_prog) { - /* Initialize tail_call_cnt */ - emit(A64_MOVZ(1, tcc, 0, 0), ctx); - cur_offset = ctx->idx - idx0; if (cur_offset != PROLOGUE_OFFSET) { pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n", @@ -432,7 +450,8 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) const u8 tmp = bpf2a64[TMP_REG_1]; const u8 prg = bpf2a64[TMP_REG_2]; - const u8 tcc = bpf2a64[TCALL_CNT]; + const u8 tcc = bpf2a64[TMP_REG_3]; + const u8 ptr = bpf2a64[TCCNT_PTR]; const int idx0 = ctx->idx; #define cur_offset (ctx->idx - idx0) #define jmp_offset (out_offset - (cur_offset)) @@ -449,11 +468,12 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit(A64_B_(A64_COND_CS, jmp_offset), ctx); /* - * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) + * if ((*tail_call_cnt_ptr) >= MAX_TAIL_CALL_CNT) * goto out; - * tail_call_cnt++; + * (*tail_call_cnt_ptr)++; */ emit_a64_mov_i64(tmp, MAX_TAIL_CALL_CNT, ctx); + emit(A64_LDR64I(tcc, ptr, 0), ctx); emit(A64_CMP(1, tcc, tmp), ctx); emit(A64_B_(A64_COND_CS, jmp_offset), ctx); emit(A64_ADD_I(1, tcc, tcc, 1), ctx); @@ -469,6 +489,9 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit(A64_LDR64(prg, tmp, prg), ctx); emit(A64_CBZ(1, prg, jmp_offset), ctx); + /* Update tail_call_cnt if the slot is populated. 
*/ + emit(A64_STR64I(tcc, ptr, 0), ctx); + /* goto *(prog->bpf_func + prologue_offset); */ off = offsetof(struct bpf_prog, bpf_func); emit_a64_mov_i64(tmp, off, ctx); @@ -721,6 +744,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool is_exception_cb) const u8 r8 = bpf2a64[BPF_REG_8]; const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; + const u8 ptr = bpf2a64[TCCNT_PTR]; const u8 fpb = bpf2a64[FP_BOTTOM]; /* We're done with BPF stack */ @@ -738,7 +762,8 @@ static void build_epilogue(struct jit_ctx *ctx, bool is_exception_cb) /* Restore x27 and x28 */ emit(A64_POP(fpb, A64_R(28), A64_SP), ctx); /* Restore fs (x25) and x26 */ - emit(A64_POP(fp, A64_R(26), A64_SP), ctx); + emit(A64_POP(ptr, fp, A64_SP), ctx); + emit(A64_POP(ptr, fp, A64_SP), ctx); /* Restore callee-saved register */ emit(A64_POP(r8, r9, A64_SP), ctx); From b83b936f3e9a3c63896852198a1814e90e68eef5 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 14 Jul 2024 20:39:02 +0800 Subject: [PATCH 0209/1052] selftests/bpf: Add testcases for tailcall hierarchy fixing Add some test cases to confirm the tailcall hierarchy issue has been fixed. On x64, the selftests result is: cd tools/testing/selftests/bpf && ./test_progs -t tailcalls 327/18 tailcalls/tailcall_bpf2bpf_hierarchy_1:OK 327/19 tailcalls/tailcall_bpf2bpf_hierarchy_fentry:OK 327/20 tailcalls/tailcall_bpf2bpf_hierarchy_fexit:OK 327/21 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_fexit:OK 327/22 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_entry:OK 327/23 tailcalls/tailcall_bpf2bpf_hierarchy_2:OK 327/24 tailcalls/tailcall_bpf2bpf_hierarchy_3:OK 327 tailcalls:OK Summary: 1/24 PASSED, 0 SKIPPED, 0 FAILED On arm64, the selftests result is: cd tools/testing/selftests/bpf && ./test_progs -t tailcalls 327/18 tailcalls/tailcall_bpf2bpf_hierarchy_1:OK 327/19 tailcalls/tailcall_bpf2bpf_hierarchy_fentry:OK 327/20 tailcalls/tailcall_bpf2bpf_hierarchy_fexit:OK 327/21 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_fexit:OK 327/22 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_entry:OK 327/23 tailcalls/tailcall_bpf2bpf_hierarchy_2:OK 327/24 tailcalls/tailcall_bpf2bpf_hierarchy_3:OK 327 tailcalls:OK Summary: 1/24 PASSED, 0 SKIPPED, 0 FAILED Acked-by: Eduard Zingerman Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240714123902.32305-4-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/prog_tests/tailcalls.c | 320 ++++++++++++++++++ .../bpf/progs/tailcall_bpf2bpf_hierarchy1.c | 34 ++ .../bpf/progs/tailcall_bpf2bpf_hierarchy2.c | 70 ++++ .../bpf/progs/tailcall_bpf2bpf_hierarchy3.c | 62 ++++ .../progs/tailcall_bpf2bpf_hierarchy_fentry.c | 35 ++ tools/testing/selftests/bpf/progs/tc_dummy.c | 12 + 6 files changed, 533 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c create mode 100644 tools/testing/selftests/bpf/progs/tc_dummy.c diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 59993fc9c0d7..e01fabb8cc41 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -3,6 +3,8 @@ #include #include #include "tailcall_poke.skel.h" +#include "tailcall_bpf2bpf_hierarchy2.skel.h" +#include 
"tailcall_bpf2bpf_hierarchy3.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations @@ -1187,6 +1189,312 @@ static void test_tailcall_poke(void) tailcall_poke__destroy(call); } +static void test_tailcall_hierarchy_count(const char *which, bool test_fentry, + bool test_fexit, + bool test_fentry_entry) +{ + int err, map_fd, prog_fd, main_data_fd, fentry_data_fd, fexit_data_fd, i, val; + struct bpf_object *obj = NULL, *fentry_obj = NULL, *fexit_obj = NULL; + struct bpf_link *fentry_link = NULL, *fexit_link = NULL; + struct bpf_program *prog, *fentry_prog; + struct bpf_map *prog_array, *data_map; + int fentry_prog_fd; + char buff[128] = {}; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = buff, + .data_size_in = sizeof(buff), + .repeat = 1, + ); + + err = bpf_prog_test_load(which, BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (!ASSERT_OK(err, "load obj")) + return; + + prog = bpf_object__find_program_by_name(obj, "entry"); + if (!ASSERT_OK_PTR(prog, "find entry prog")) + goto out; + + prog_fd = bpf_program__fd(prog); + if (!ASSERT_GE(prog_fd, 0, "prog_fd")) + goto out; + + if (test_fentry_entry) { + fentry_obj = bpf_object__open_file("tailcall_bpf2bpf_hierarchy_fentry.bpf.o", + NULL); + if (!ASSERT_OK_PTR(fentry_obj, "open fentry_obj file")) + goto out; + + fentry_prog = bpf_object__find_program_by_name(fentry_obj, + "fentry"); + if (!ASSERT_OK_PTR(prog, "find fentry prog")) + goto out; + + err = bpf_program__set_attach_target(fentry_prog, prog_fd, + "entry"); + if (!ASSERT_OK(err, "set_attach_target entry")) + goto out; + + err = bpf_object__load(fentry_obj); + if (!ASSERT_OK(err, "load fentry_obj")) + goto out; + + fentry_link = bpf_program__attach_trace(fentry_prog); + if (!ASSERT_OK_PTR(fentry_link, "attach_trace")) + goto out; + + fentry_prog_fd = bpf_program__fd(fentry_prog); + if (!ASSERT_GE(fentry_prog_fd, 0, "fentry_prog_fd")) + goto out; + + prog_array = bpf_object__find_map_by_name(fentry_obj, "jmp_table"); + if (!ASSERT_OK_PTR(prog_array, "find jmp_table")) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (!ASSERT_GE(map_fd, 0, "map_fd")) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &fentry_prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + data_map = bpf_object__find_map_by_name(fentry_obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find data_map")) + goto out; + + } else { + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (!ASSERT_OK_PTR(prog_array, "find jmp_table")) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (!ASSERT_GE(map_fd, 0, "map_fd")) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + data_map = bpf_object__find_map_by_name(obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find data_map")) + goto out; + } + + if (test_fentry) { + fentry_obj = bpf_object__open_file("tailcall_bpf2bpf_fentry.bpf.o", + NULL); + if (!ASSERT_OK_PTR(fentry_obj, "open fentry_obj file")) + goto out; + + prog = bpf_object__find_program_by_name(fentry_obj, "fentry"); + if (!ASSERT_OK_PTR(prog, "find fentry prog")) + goto out; + + err = bpf_program__set_attach_target(prog, prog_fd, + "subprog_tail"); + if (!ASSERT_OK(err, "set_attach_target subprog_tail")) + goto out; + + err = bpf_object__load(fentry_obj); + if (!ASSERT_OK(err, "load fentry_obj")) + goto out; + + fentry_link = bpf_program__attach_trace(prog); + if 
(!ASSERT_OK_PTR(fentry_link, "attach_trace")) + goto out; + } + + if (test_fexit) { + fexit_obj = bpf_object__open_file("tailcall_bpf2bpf_fexit.bpf.o", + NULL); + if (!ASSERT_OK_PTR(fexit_obj, "open fexit_obj file")) + goto out; + + prog = bpf_object__find_program_by_name(fexit_obj, "fexit"); + if (!ASSERT_OK_PTR(prog, "find fexit prog")) + goto out; + + err = bpf_program__set_attach_target(prog, prog_fd, + "subprog_tail"); + if (!ASSERT_OK(err, "set_attach_target subprog_tail")) + goto out; + + err = bpf_object__load(fexit_obj); + if (!ASSERT_OK(err, "load fexit_obj")) + goto out; + + fexit_link = bpf_program__attach_trace(prog); + if (!ASSERT_OK_PTR(fexit_link, "attach_trace")) + goto out; + } + + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "tailcall"); + ASSERT_EQ(topts.retval, 1, "tailcall retval"); + + main_data_fd = bpf_map__fd(data_map); + if (!ASSERT_GE(main_data_fd, 0, "main_data_fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(main_data_fd, &i, &val); + ASSERT_OK(err, "tailcall count"); + ASSERT_EQ(val, 34, "tailcall count"); + + if (test_fentry) { + data_map = bpf_object__find_map_by_name(fentry_obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find tailcall_bpf2bpf_fentry.bss map")) + goto out; + + fentry_data_fd = bpf_map__fd(data_map); + if (!ASSERT_GE(fentry_data_fd, 0, + "find tailcall_bpf2bpf_fentry.bss map fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(fentry_data_fd, &i, &val); + ASSERT_OK(err, "fentry count"); + ASSERT_EQ(val, 68, "fentry count"); + } + + if (test_fexit) { + data_map = bpf_object__find_map_by_name(fexit_obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find tailcall_bpf2bpf_fexit.bss map")) + goto out; + + fexit_data_fd = bpf_map__fd(data_map); + if (!ASSERT_GE(fexit_data_fd, 0, + "find tailcall_bpf2bpf_fexit.bss map fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(fexit_data_fd, &i, &val); + ASSERT_OK(err, "fexit count"); + ASSERT_EQ(val, 68, "fexit count"); + } + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (!ASSERT_OK(err, "delete_elem from jmp_table")) + goto out; + + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "tailcall"); + ASSERT_EQ(topts.retval, 1, "tailcall retval"); + + i = 0; + err = bpf_map_lookup_elem(main_data_fd, &i, &val); + ASSERT_OK(err, "tailcall count"); + ASSERT_EQ(val, 35, "tailcall count"); + + if (test_fentry) { + i = 0; + err = bpf_map_lookup_elem(fentry_data_fd, &i, &val); + ASSERT_OK(err, "fentry count"); + ASSERT_EQ(val, 70, "fentry count"); + } + + if (test_fexit) { + i = 0; + err = bpf_map_lookup_elem(fexit_data_fd, &i, &val); + ASSERT_OK(err, "fexit count"); + ASSERT_EQ(val, 70, "fexit count"); + } + +out: + bpf_link__destroy(fentry_link); + bpf_link__destroy(fexit_link); + bpf_object__close(fentry_obj); + bpf_object__close(fexit_obj); + bpf_object__close(obj); +} + +/* test_tailcall_bpf2bpf_hierarchy_1 checks that the count value of the tail + * call limit enforcement matches with expectations when tailcalls are preceded + * with two bpf2bpf calls. 
+ * + * subprog --tailcall-> entry + * entry < + * subprog --tailcall-> entry + */ +static void test_tailcall_bpf2bpf_hierarchy_1(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + false, false, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fentry checks that the count value of the + * tail call limit enforcement matches with expectations when tailcalls are + * preceded with two bpf2bpf calls, and the two subprogs are traced by fentry. + */ +static void test_tailcall_bpf2bpf_hierarchy_fentry(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + true, false, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fexit checks that the count value of the tail + * call limit enforcement matches with expectations when tailcalls are preceded + * with two bpf2bpf calls, and the two subprogs are traced by fexit. + */ +static void test_tailcall_bpf2bpf_hierarchy_fexit(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + false, true, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fentry_fexit checks that the count value of + * the tail call limit enforcement matches with expectations when tailcalls are + * preceded with two bpf2bpf calls, and the two subprogs are traced by both + * fentry and fexit. + */ +static void test_tailcall_bpf2bpf_hierarchy_fentry_fexit(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + true, true, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fentry_entry checks that the count value of + * the tail call limit enforcement matches with expectations when tailcalls are + * preceded with two bpf2bpf calls in fentry. + */ +static void test_tailcall_bpf2bpf_hierarchy_fentry_entry(void) +{ + test_tailcall_hierarchy_count("tc_dummy.bpf.o", false, false, true); +} + +/* test_tailcall_bpf2bpf_hierarchy_2 checks that the count value of the tail + * call limit enforcement matches with expectations: + * + * subprog_tail0 --tailcall-> classifier_0 -> subprog_tail0 + * entry < + * subprog_tail1 --tailcall-> classifier_1 -> subprog_tail1 + */ +static void test_tailcall_bpf2bpf_hierarchy_2(void) +{ + RUN_TESTS(tailcall_bpf2bpf_hierarchy2); +} + +/* test_tailcall_bpf2bpf_hierarchy_3 checks that the count value of the tail + * call limit enforcement matches with expectations: + * + * subprog with jmp_table0 to classifier_0 + * entry --tailcall-> classifier_0 < + * subprog with jmp_table1 to classifier_0 + */ +static void test_tailcall_bpf2bpf_hierarchy_3(void) +{ + RUN_TESTS(tailcall_bpf2bpf_hierarchy3); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1223,4 +1531,16 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_fentry_entry(); if (test__start_subtest("tailcall_poke")) test_tailcall_poke(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_1")) + test_tailcall_bpf2bpf_hierarchy_1(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fentry")) + test_tailcall_bpf2bpf_hierarchy_fentry(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fexit")) + test_tailcall_bpf2bpf_hierarchy_fexit(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fentry_fexit")) + test_tailcall_bpf2bpf_hierarchy_fentry_fexit(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fentry_entry")) + test_tailcall_bpf2bpf_hierarchy_fentry_entry(); + test_tailcall_bpf2bpf_hierarchy_2(); + test_tailcall_bpf2bpf_hierarchy_3(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c 
new file mode 100644 index 000000000000..327ca395e860 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_legacy.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int count = 0; + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + bpf_tail_call_static(skb, &jmp_table, 0); + return 0; +} + +SEC("tc") +int entry(struct __sk_buff *skb) +{ + int ret = 1; + + count++; + subprog_tail(skb); + subprog_tail(skb); + + return ret; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c new file mode 100644 index 000000000000..37604b0b97af --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" + +int classifier_0(struct __sk_buff *skb); +int classifier_1(struct __sk_buff *skb); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 2); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps") = { + .values = { + [0] = (void *) &classifier_0, + [1] = (void *) &classifier_1, + }, +}; + +int count0 = 0; +int count1 = 0; + +static __noinline +int subprog_tail0(struct __sk_buff *skb) +{ + bpf_tail_call_static(skb, &jmp_table, 0); + return 0; +} + +__auxiliary +SEC("tc") +int classifier_0(struct __sk_buff *skb) +{ + count0++; + subprog_tail0(skb); + return 0; +} + +static __noinline +int subprog_tail1(struct __sk_buff *skb) +{ + bpf_tail_call_static(skb, &jmp_table, 1); + return 0; +} + +__auxiliary +SEC("tc") +int classifier_1(struct __sk_buff *skb) +{ + count1++; + subprog_tail1(skb); + return 0; +} + +__success +__retval(33) +SEC("tc") +int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) +{ + volatile int ret = 0; + + subprog_tail0(skb); + subprog_tail1(skb); + + asm volatile (""::"r+"(ret)); + return (count1 << 16) | count0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c new file mode 100644 index 000000000000..0cdbb781fcbc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" + +int classifier_0(struct __sk_buff *skb); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table0 SEC(".maps") = { + .values = { + [0] = (void *) &classifier_0, + }, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table1 SEC(".maps") = { + .values = { + [0] = (void *) &classifier_0, + }, +}; + +int count = 0; + +static __noinline +int subprog_tail(struct __sk_buff *skb, void *jmp_table) +{ + bpf_tail_call_static(skb, jmp_table, 0); + return 0; +} + +__auxiliary +SEC("tc") +int classifier_0(struct __sk_buff *skb) +{ + count++; + subprog_tail(skb, &jmp_table0); + subprog_tail(skb, &jmp_table1); + return count; +} + +__success +__retval(33) +SEC("tc") +int 
tailcall_bpf2bpf_hierarchy_3(struct __sk_buff *skb) +{ + volatile int ret = 0; + + bpf_tail_call_static(skb, &jmp_table0, 0); + + asm volatile (""::"r+"(ret)); + return ret; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c new file mode 100644 index 000000000000..c87f9ca982d3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Leon Hwang */ + +#include "vmlinux.h" +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int count = 0; + +static __noinline +int subprog_tail(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +SEC("fentry/dummy") +int BPF_PROG(fentry, struct sk_buff *skb) +{ + count++; + subprog_tail(ctx); + subprog_tail(ctx); + + return 0; +} + + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tc_dummy.c b/tools/testing/selftests/bpf/progs/tc_dummy.c new file mode 100644 index 000000000000..69a3d0dc8787 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tc_dummy.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_legacy.h" + +SEC("tc") +int entry(struct __sk_buff *skb) +{ + return 1; +} + +char __license[] SEC("license") = "GPL"; From b7264f87f76c59698c7fd019a8797378fa17e7e3 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:33:50 +0800 Subject: [PATCH 0210/1052] bpftool: Refactor xdp attach/detach type judgment This commit no logical changed, just increases code readability and facilitates TCX prog expansion, which will be implemented in the next patch. 
Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721143353.95980-2-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/net.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 968714b4c3d4..1b9f4225b394 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -684,10 +684,18 @@ static int do_attach(int argc, char **argv) } } + switch (attach_type) { /* attach xdp prog */ - if (is_prefix("xdp", attach_type_strings[attach_type])) - err = do_attach_detach_xdp(progfd, attach_type, ifindex, - overwrite); + case NET_ATTACH_TYPE_XDP: + case NET_ATTACH_TYPE_XDP_GENERIC: + case NET_ATTACH_TYPE_XDP_DRIVER: + case NET_ATTACH_TYPE_XDP_OFFLOAD: + err = do_attach_detach_xdp(progfd, attach_type, ifindex, overwrite); + break; + default: + break; + } + if (err) { p_err("interface %s attach failed: %s", attach_type_strings[attach_type], strerror(-err)); @@ -721,10 +729,18 @@ static int do_detach(int argc, char **argv) if (ifindex < 1) return -EINVAL; + switch (attach_type) { /* detach xdp prog */ - progfd = -1; - if (is_prefix("xdp", attach_type_strings[attach_type])) + case NET_ATTACH_TYPE_XDP: + case NET_ATTACH_TYPE_XDP_GENERIC: + case NET_ATTACH_TYPE_XDP_DRIVER: + case NET_ATTACH_TYPE_XDP_OFFLOAD: + progfd = -1; err = do_attach_detach_xdp(progfd, attach_type, ifindex, NULL); + break; + default: + break; + } if (err < 0) { p_err("interface %s detach failed: %s", From 3b9d4fee8ad32599ba20ab36da7694feef022b48 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:42:21 +0800 Subject: [PATCH 0211/1052] bpftool: Add net attach/detach command to tcx prog Now, attach/detach tcx prog supported in libbpf, so we can add new command 'bpftool attach/detach tcx' to attach tcx prog with bpftool for user. # bpftool prog load tc_prog.bpf.o /sys/fs/bpf/tc_prog # bpftool prog show ... 192: sched_cls name tc_prog tag 187aeb611ad00cfc gpl loaded_at 2024-07-11T15:58:16+0800 uid 0 xlated 152B jited 97B memlock 4096B map_ids 100,99,97 btf_id 260 # bpftool net attach tcx_ingress name tc_prog dev lo # bpftool net ... tc: lo(1) tcx/ingress tc_prog prog_id 29 # bpftool net detach tcx_ingress dev lo # bpftool net ... 
tc: # bpftool net attach tcx_ingress name tc_prog dev lo # bpftool net tc: lo(1) tcx/ingress tc_prog prog_id 29 Test environment: ubuntu_22_04, 6.7.0-060700-generic Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721144221.96228-1-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/net.c | 43 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 1b9f4225b394..2a51f1c25732 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -67,6 +67,8 @@ enum net_attach_type { NET_ATTACH_TYPE_XDP_GENERIC, NET_ATTACH_TYPE_XDP_DRIVER, NET_ATTACH_TYPE_XDP_OFFLOAD, + NET_ATTACH_TYPE_TCX_INGRESS, + NET_ATTACH_TYPE_TCX_EGRESS, }; static const char * const attach_type_strings[] = { @@ -74,6 +76,8 @@ static const char * const attach_type_strings[] = { [NET_ATTACH_TYPE_XDP_GENERIC] = "xdpgeneric", [NET_ATTACH_TYPE_XDP_DRIVER] = "xdpdrv", [NET_ATTACH_TYPE_XDP_OFFLOAD] = "xdpoffload", + [NET_ATTACH_TYPE_TCX_INGRESS] = "tcx_ingress", + [NET_ATTACH_TYPE_TCX_EGRESS] = "tcx_egress", }; static const char * const attach_loc_strings[] = { @@ -647,6 +651,32 @@ static int do_attach_detach_xdp(int progfd, enum net_attach_type attach_type, return bpf_xdp_attach(ifindex, progfd, flags, NULL); } +static int get_tcx_type(enum net_attach_type attach_type) +{ + switch (attach_type) { + case NET_ATTACH_TYPE_TCX_INGRESS: + return BPF_TCX_INGRESS; + case NET_ATTACH_TYPE_TCX_EGRESS: + return BPF_TCX_EGRESS; + default: + return -1; + } +} + +static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex) +{ + int type = get_tcx_type(attach_type); + + return bpf_prog_attach(progfd, ifindex, type, 0); +} + +static int do_detach_tcx(int targetfd, enum net_attach_type attach_type) +{ + int type = get_tcx_type(attach_type); + + return bpf_prog_detach(targetfd, type); +} + static int do_attach(int argc, char **argv) { enum net_attach_type attach_type; @@ -692,6 +722,11 @@ static int do_attach(int argc, char **argv) case NET_ATTACH_TYPE_XDP_OFFLOAD: err = do_attach_detach_xdp(progfd, attach_type, ifindex, overwrite); break; + /* attach tcx prog */ + case NET_ATTACH_TYPE_TCX_INGRESS: + case NET_ATTACH_TYPE_TCX_EGRESS: + err = do_attach_tcx(progfd, attach_type, ifindex); + break; default: break; } @@ -738,6 +773,11 @@ static int do_detach(int argc, char **argv) progfd = -1; err = do_attach_detach_xdp(progfd, attach_type, ifindex, NULL); break; + /* detach tcx prog */ + case NET_ATTACH_TYPE_TCX_INGRESS: + case NET_ATTACH_TYPE_TCX_EGRESS: + err = do_detach_tcx(ifindex, attach_type); + break; default: break; } @@ -944,7 +984,8 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_PROGRAM "\n" - " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n" + " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload | tcx_ingress\n" + " | tcx_egress }\n" " " HELP_SPEC_OPTIONS " }\n" "\n" "Note: Only xdp, tcx, tc, netkit, flow_dissector and netfilter attachments\n" From 4f88dde0e1525ee69d29ba235c3965685439d0e3 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:42:38 +0800 Subject: [PATCH 0212/1052] bpftool: Add bash-completion for tcx subcommand This commit adds bash-completion for attaching tcx program on interface. 
Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721144238.96246-1-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/bash-completion/bpftool | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index be99d49b8714..0c541498c301 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1079,7 +1079,7 @@ _bpftool() esac ;; net) - local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' + local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload tcx_ingress tcx_egress' case $command in show|list) [[ $prev != "$command" ]] && return 0 From 0d7c06125cea53b3d86d685d790b03b9ae9d6336 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:42:52 +0800 Subject: [PATCH 0213/1052] bpftool: Add document for net attach/detach on tcx subcommand This commit adds sample output for net attach/detach on tcx subcommand. Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721144252.96264-1-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- .../bpf/bpftool/Documentation/bpftool-net.rst | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 348812881297..4a8cb5e0d94b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -29,7 +29,7 @@ NET COMMANDS | **bpftool** **net help** | | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } +| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** | **tcx_ingress** | **tcx_egress** } DESCRIPTION =========== @@ -69,6 +69,8 @@ bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; **xdpdrv** - Native XDP. runs earliest point in driver's receive path; **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; + **tcx_ingress** - Ingress TCX. runs on ingress net traffic; + **tcx_egress** - Egress TCX. runs on egress net traffic; bpftool net detach *ATTACH_TYPE* dev *NAME* Detach bpf program attached to network interface *NAME* with type specified @@ -178,3 +180,21 @@ EXAMPLES :: xdp: + +| +| **# bpftool net attach tcx_ingress name tc_prog dev lo** +| **# bpftool net** +| + +:: + tc: + lo(1) tcx/ingress tc_prog prog_id 29 + +| +| **# bpftool net attach tcx_ingress name tc_prog dev lo** +| **# bpftool net detach tcx_ingress dev lo** +| **# bpftool net** +| + +:: + tc: From e42ac14180554fa23a3312d4f921dc4ea7972fb7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 22 Jul 2024 11:30:45 -0700 Subject: [PATCH 0214/1052] bpf: Check unsupported ops from the bpf_struct_ops's cfi_stubs The bpf_tcp_ca struct_ops currently uses a "u32 unsupported_ops[]" array to track which ops is not supported. After cfi_stubs had been added, the function pointer in cfi_stubs is also NULL for the unsupported ops. Thus, the "u32 unsupported_ops[]" becomes redundant. 
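Condensed from the hunks further down (no new logic, just gathered in one place because the diff is split across several files), the replacement check reads:

  /* kernel/bpf/bpf_struct_ops.c */
  int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
  {
          void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);

          return func_ptr ? 0 : -ENOTSUPP;
  }

  /* kernel/bpf/verifier.c, in check_struct_ops_btf_id() */
  err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8);
  if (err) {
          verbose(env, "attach to unsupported member %s of struct %s\n",
                  mname, st_ops->name);
          return err;
  }

With this in place a subsystem marks an op as unsupported simply by leaving its cfi_stubs slot NULL, instead of maintaining a separate list.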
This observation was originally brought up in the bpf/cfi discussion: https://lore.kernel.org/bpf/CAADnVQJoEkdjyCEJRPASjBw1QGsKYrF33QdMGc1RZa9b88bAEA@mail.gmail.com/ The recent bpf qdisc patch (https://lore.kernel.org/bpf/20240714175130.4051012-6-amery.hung@bytedance.com/) also needs to specify quite many unsupported ops. It is a good time to clean it up. This patch removes the need of "u32 unsupported_ops[]" and tests for null-ness in the cfi_stubs instead. Testing the cfi_stubs is done in a new function bpf_struct_ops_supported(). The verifier will call bpf_struct_ops_supported() when loading the struct_ops program. The ".check_member" is removed from the bpf_tcp_ca in this patch. ".check_member" could still be useful for other subsytems to enforce other restrictions (e.g. sched_ext checks for prog->sleepable). To keep the same error return, ENOTSUPP is used. Cc: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240722183049.2254692-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 5 +++++ kernel/bpf/bpf_struct_ops.c | 7 +++++++ kernel/bpf/verifier.c | 10 +++++++++- net/ipv4/bpf_tcp_ca.c | 26 -------------------------- 4 files changed, 21 insertions(+), 27 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b94ec161e8c..4c54864316ee 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1795,6 +1795,7 @@ struct bpf_struct_ops_common_value { #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, @@ -1851,6 +1852,10 @@ static inline void bpf_module_put(const void *data, struct module *owner) { module_put(owner); } +static inline int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + return -ENOTSUPP; +} static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index bb3eabc0dc76..fda3dd2ee984 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1040,6 +1040,13 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); + + return func_ptr ? 
0 : -ENOTSUPP; +} + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f8c474e8e597..247a038d3a14 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21132,6 +21132,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) u32 btf_id, member_idx; struct btf *btf; const char *mname; + int err; if (!prog->gpl_compatible) { verbose(env, "struct ops programs must have a GPL compatible license\n"); @@ -21179,8 +21180,15 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EINVAL; } + err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8); + if (err) { + verbose(env, "attach to unsupported member %s of struct %s\n", + mname, st_ops->name); + return err; + } + if (st_ops->check_member) { - int err = st_ops->check_member(t, member, prog); + err = st_ops->check_member(t, member, prog); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 3f88d0961e5b..554804774628 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -14,10 +14,6 @@ /* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ static struct bpf_struct_ops bpf_tcp_congestion_ops; -static u32 unsupported_ops[] = { - offsetof(struct tcp_congestion_ops, get_info), -}; - static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; static const struct btf_type *tcp_congestion_ops_type; @@ -45,18 +41,6 @@ static int bpf_tcp_ca_init(struct btf *btf) return 0; } -static bool is_unsupported(u32 member_offset) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { - if (member_offset == unsupported_ops[i]) - return true; - } - - return false; -} - static bool bpf_tcp_ca_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -251,15 +235,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, return 0; } -static int bpf_tcp_ca_check_member(const struct btf_type *t, - const struct btf_member *member, - const struct bpf_prog *prog) -{ - if (is_unsupported(__btf_member_bit_offset(t, member) / 8)) - return -ENOTSUPP; - return 0; -} - static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link) { return tcp_register_congestion_control(kdata); @@ -354,7 +329,6 @@ static struct bpf_struct_ops bpf_tcp_congestion_ops = { .reg = bpf_tcp_ca_reg, .unreg = bpf_tcp_ca_unreg, .update = bpf_tcp_ca_update, - .check_member = bpf_tcp_ca_check_member, .init_member = bpf_tcp_ca_init_member, .init = bpf_tcp_ca_init, .validate = bpf_tcp_ca_validate, From e44b4fc40cb429f6dedc4518a16221115035f654 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 22 Jul 2024 11:30:46 -0700 Subject: [PATCH 0215/1052] selftests/bpf: Fix the missing tramp_1 to tramp_40 ops in cfi_stubs The tramp_1 to tramp_40 ops is not set in the cfi_stubs in the bpf_testmod_ops. It fails the struct_ops_multi_pages test after retiring the unsupported_ops in the earlier patch. This patch initializes them in a loop during the bpf_testmod_init(). 
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240722183049.2254692-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index fd28c1157bd3..3687a40b61c6 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -1024,6 +1024,11 @@ static void bpf_testmod_test_2(int a, int b) { } +static int bpf_testmod_tramp(int value) +{ + return 0; +} + static int bpf_testmod_ops__test_maybe_null(int dummy, struct task_struct *task__nullable) { @@ -1080,6 +1085,7 @@ static int bpf_testmod_init(void) .kfunc_btf_id = bpf_testmod_dtor_ids[1] }, }; + void **tramp; int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_testmod_common_kfunc_set); @@ -1103,6 +1109,14 @@ static int bpf_testmod_init(void) ret = register_bpf_testmod_uprobe(); if (ret < 0) return ret; + + /* Ensure nothing is between tramp_1..tramp_40 */ + BUILD_BUG_ON(offsetof(struct bpf_testmod_ops, tramp_1) + 40 * sizeof(long) != + offsetofend(struct bpf_testmod_ops, tramp_40)); + tramp = (void **)&__bpf_testmod_ops.tramp_1; + while (tramp <= (void **)&__bpf_testmod_ops.tramp_40) + *tramp++ = bpf_testmod_tramp; + return 0; } From 4009c95fede6b783802ad01f264a7a0541f5ea60 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 22 Jul 2024 11:30:47 -0700 Subject: [PATCH 0216/1052] selftests/bpf: Ensure the unsupported struct_ops prog cannot be loaded There is an existing "bpf_tcp_ca/unsupp_cong_op" test to ensure the unsupported tcp-cc "get_info" struct_ops prog cannot be loaded. This patch adds a new test in the bpf_testmod such that the unsupported ops test does not depend on other kernel subsystem where its supporting ops may be changed in the future. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240722183049.2254692-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/bpf_testmod/bpf_testmod.h | 1 + .../bpf/prog_tests/test_struct_ops_module.c | 2 ++ .../selftests/bpf/progs/unsupported_ops.c | 22 +++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/unsupported_ops.c diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index 23fa1872ee67..fe0d402b0d65 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -35,6 +35,7 @@ struct bpf_testmod_ops { void (*test_2)(int a, int b); /* Used to test nullable arguments. */ int (*test_maybe_null)(int dummy, struct task_struct *task); + int (*unsupported_ops)(void); /* The following fields are used to test shadow copies. 
*/ char onebyte; diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index bbcf12696a6b..75a0dea511b3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -9,6 +9,7 @@ #include "struct_ops_nulled_out_cb.skel.h" #include "struct_ops_forgotten_cb.skel.h" #include "struct_ops_detach.skel.h" +#include "unsupported_ops.skel.h" static void check_map_info(struct bpf_map_info *info) { @@ -311,5 +312,6 @@ void serial_test_struct_ops_module(void) test_struct_ops_forgotten_cb(); if (test__start_subtest("test_detach_link")) test_detach_link(); + RUN_TESTS(unsupported_ops); } diff --git a/tools/testing/selftests/bpf/progs/unsupported_ops.c b/tools/testing/selftests/bpf/progs/unsupported_ops.c new file mode 100644 index 000000000000..9180365a3568 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/unsupported_ops.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/unsupported_ops") +__failure +__msg("attach to unsupported member unsupported_ops of struct bpf_testmod_ops") +int BPF_PROG(unsupported_ops) +{ + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops testmod = { + .unsupported_ops = (void *)unsupported_ops, +}; From 21c7063f6d08ab9afa088584939791bee0c177e5 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:51 +0800 Subject: [PATCH 0217/1052] bpf, lsm: Add disabled BPF LSM hook list Add a disabled hooks list for BPF LSM. progs being attached to the listed hooks will be rejected by the verifier. Suggested-by: KP Singh Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-2-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/bpf_lsm.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 08a338e1f231..1f596ad6257c 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -36,6 +36,24 @@ BTF_SET_START(bpf_lsm_hooks) #undef LSM_HOOK BTF_SET_END(bpf_lsm_hooks) +BTF_SET_START(bpf_lsm_disabled_hooks) +BTF_ID(func, bpf_lsm_vm_enough_memory) +BTF_ID(func, bpf_lsm_inode_need_killpriv) +BTF_ID(func, bpf_lsm_inode_getsecurity) +BTF_ID(func, bpf_lsm_inode_listsecurity) +BTF_ID(func, bpf_lsm_inode_copy_up_xattr) +BTF_ID(func, bpf_lsm_getselfattr) +BTF_ID(func, bpf_lsm_getprocattr) +BTF_ID(func, bpf_lsm_setprocattr) +#ifdef CONFIG_KEYS +BTF_ID(func, bpf_lsm_key_getsecurity) +#endif +#ifdef CONFIG_AUDIT +BTF_ID(func, bpf_lsm_audit_rule_match) +#endif +BTF_ID(func, bpf_lsm_ismaclabel) +BTF_SET_END(bpf_lsm_disabled_hooks) + /* List of LSM hooks that should operate on 'current' cgroup regardless * of function signature. 
*/ @@ -97,15 +115,24 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { + u32 btf_id = prog->aux->attach_btf_id; + const char *func_name = prog->aux->attach_func_name; + if (!prog->gpl_compatible) { bpf_log(vlog, "LSM programs must have a GPL compatible license\n"); return -EINVAL; } - if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) { + if (btf_id_set_contains(&bpf_lsm_disabled_hooks, btf_id)) { + bpf_log(vlog, "attach_btf_id %u points to disabled hook %s\n", + btf_id, func_name); + return -EINVAL; + } + + if (!btf_id_set_contains(&bpf_lsm_hooks, btf_id)) { bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", - prog->aux->attach_btf_id, prog->aux->attach_func_name); + btf_id, func_name); return -EINVAL; } From 5d99e198be279045e6ecefe220f5c52f8ce9bfd5 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:52 +0800 Subject: [PATCH 0218/1052] bpf, lsm: Add check for BPF LSM return value A bpf prog returning a positive number attached to file_alloc_security hook makes kernel panic. This happens because file system can not filter out the positive number returned by the LSM prog using IS_ERR, and misinterprets this positive number as a file pointer. Given that hook file_alloc_security never returned positive number before the introduction of BPF LSM, and other BPF LSM hooks may encounter similar issues, this patch adds LSM return value check in verifier, to ensure no unexpected value is returned. Fixes: 520b7aa00d8c ("bpf: lsm: Initialize the BPF LSM hooks") Reported-by: Xin Liu Signed-off-by: Xu Kuohai Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240719110059.797546-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 1 + include/linux/bpf_lsm.h | 8 ++++++ kernel/bpf/bpf_lsm.c | 34 ++++++++++++++++++++++- kernel/bpf/btf.c | 5 +++- kernel/bpf/verifier.c | 60 ++++++++++++++++++++++++++++++++++------- 5 files changed, 97 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c54864316ee..5739cc9986f8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -927,6 +927,7 @@ struct bpf_insn_access_aux { }; }; struct bpf_verifier_log *log; /* for verbose logs */ + bool is_retval; /* is accessing function return value ? 
*/ }; static inline void diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 1de7ece5d36d..aefcd6564251 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -9,6 +9,7 @@ #include #include +#include #include #ifdef CONFIG_BPF_LSM @@ -45,6 +46,8 @@ void bpf_inode_storage_free(struct inode *inode); void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); +int bpf_lsm_get_retval_range(const struct bpf_prog *prog, + struct bpf_retval_range *range); #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) @@ -78,6 +81,11 @@ static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, { } +static inline int bpf_lsm_get_retval_range(const struct bpf_prog *prog, + struct bpf_retval_range *range) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 1f596ad6257c..6292ac5f9bd1 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -417,3 +416,36 @@ const struct bpf_verifier_ops lsm_verifier_ops = { .get_func_proto = bpf_lsm_func_proto, .is_valid_access = btf_ctx_access, }; + +/* hooks return 0 or 1 */ +BTF_SET_START(bool_lsm_hooks) +#ifdef CONFIG_SECURITY_NETWORK_XFRM +BTF_ID(func, bpf_lsm_xfrm_state_pol_flow_match) +#endif +#ifdef CONFIG_AUDIT +BTF_ID(func, bpf_lsm_audit_rule_known) +#endif +BTF_ID(func, bpf_lsm_inode_xattr_skipcap) +BTF_SET_END(bool_lsm_hooks) + +int bpf_lsm_get_retval_range(const struct bpf_prog *prog, + struct bpf_retval_range *retval_range) +{ + /* no return value range for void hooks */ + if (!prog->aux->attach_func_proto->type) + return -EINVAL; + + if (btf_id_set_contains(&bool_lsm_hooks, prog->aux->attach_btf_id)) { + retval_range->minval = 0; + retval_range->maxval = 1; + } else { + /* All other available LSM hooks, except task_prctl, return 0 + * on success and negative error code on failure. + * To keep things simple, we only allow bpf progs to return 0 + * or negative errno for task_prctl too. 
+ */ + retval_range->minval = -MAX_ERRNO; + retval_range->maxval = 0; + } + return 0; +} diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 520f49f422fe..95426d5b634e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6416,8 +6416,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (arg == nr_args) { switch (prog->expected_attach_type) { - case BPF_LSM_CGROUP: case BPF_LSM_MAC: + /* mark we are accessing the return value */ + info->is_retval = true; + fallthrough; + case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 247a038d3a14..8caa7322f031 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2334,6 +2334,25 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, __mark_reg_unknown(env, regs + regno); } +static int __mark_reg_s32_range(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, + u32 regno, + s32 s32_min, + s32 s32_max) +{ + struct bpf_reg_state *reg = regs + regno; + + reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min); + reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max); + + reg->smin_value = max_t(s64, reg->smin_value, s32_min); + reg->smax_value = min_t(s64, reg->smax_value, s32_max); + + reg_bounds_sync(reg); + + return reg_bounds_sanity_check(env, reg, "s32_range"); +} + static void __mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { @@ -5607,11 +5626,12 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id) + struct btf **btf, u32 *btf_id, bool *is_retval) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, .log = &env->log, + .is_retval = false, }; if (env->ops->is_valid_access && @@ -5624,6 +5644,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * type of narrower access. 
*/ *reg_type = info.reg_type; + *is_retval = info.is_retval; if (base_type(*reg_type) == PTR_TO_BTF_ID) { *btf = info.btf; @@ -6792,6 +6813,17 @@ static int check_stack_access_within_bounds( return grow_stack_state(env, state, -min_off /* size */); } +static bool get_func_retval_range(struct bpf_prog *prog, + struct bpf_retval_range *range) +{ + if (prog->type == BPF_PROG_TYPE_LSM && + prog->expected_attach_type == BPF_LSM_MAC && + !bpf_lsm_get_retval_range(prog, range)) { + return true; + } + return false; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -6896,6 +6928,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { + bool is_retval = false; + struct bpf_retval_range range; enum bpf_reg_type reg_type = SCALAR_VALUE; struct btf *btf = NULL; u32 btf_id = 0; @@ -6911,7 +6945,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, - &btf_id); + &btf_id, &is_retval); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -6920,7 +6954,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); + if (is_retval && get_func_retval_range(env->prog, &range)) { + err = __mark_reg_s32_range(env, regs, value_regno, + range.minval, range.maxval); + if (err) + return err; + } else { + mark_reg_unknown(env, regs, value_regno); + } } else { mark_reg_known_zero(env, regs, value_regno); @@ -15762,12 +15803,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_LSM: if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { - /* Regular BPF_PROG_TYPE_LSM programs can return - * any value. - */ - return 0; - } - if (!env->prog->aux->attach_func_proto->type) { + /* no range found, any return value is allowed */ + if (!get_func_retval_range(env->prog, &range)) + return 0; + /* no restricted range, any return value is allowed */ + if (range.minval == S32_MIN && range.maxval == S32_MAX) + return 0; + } else if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value. */ From 28ead3eaabc16ecc907cfb71876da028080f6356 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:53 +0800 Subject: [PATCH 0219/1052] bpf: Prevent tail call between progs attached to different hooks bpf progs can be attached to kernel functions, and the attached functions can take different parameters or return different return values. If prog attached to one kernel function tail calls prog attached to another kernel function, the ctx access or return value verification could be bypassed. For example, if prog1 is attached to func1 which takes only 1 parameter and prog2 is attached to func2 which takes two parameters. Since verifier assumes the bpf ctx passed to prog2 is constructed based on func2's prototype, verifier allows prog2 to access the second parameter from the bpf ctx passed to it. 
The problem is that verifier does not prevent prog1 from passing its bpf ctx to prog2 via tail call. In this case, the bpf ctx passed to prog2 is constructed from func1 instead of func2, that is, the assumption for ctx access verification is bypassed. Another example, if BPF LSM prog1 is attached to hook file_alloc_security, and BPF LSM prog2 is attached to hook bpf_lsm_audit_rule_known. Verifier knows the return value rules for these two hooks, e.g. it is legal for bpf_lsm_audit_rule_known to return positive number 1, and it is illegal for file_alloc_security to return positive number. So verifier allows prog2 to return positive number 1, but does not allow prog1 to return positive number. The problem is that verifier does not prevent prog1 from calling prog2 via tail call. In this case, prog2's return value 1 will be used as the return value for prog1's hook file_alloc_security. That is, the return value rule is bypassed. This patch adds restriction for tail call to prevent such bypasses. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-4-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5739cc9986f8..f16d0753f518 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -294,6 +294,7 @@ struct bpf_map { * same prog type, JITed flag and xdp_has_frags flag. */ struct { + const struct btf_type *attach_func_proto; spinlock_t lock; enum bpf_prog_type type; bool jited; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7ee62e38faf0..4e07cc057d6f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2302,6 +2302,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map, { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; + struct bpf_prog_aux *aux = fp->aux; if (fp->kprobe_override) return false; @@ -2311,7 +2312,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map, * in the case of devmap and cpumap). Until device checks * are implemented, prohibit adding dev-bound programs to program maps. 
*/ - if (bpf_prog_is_dev_bound(fp->aux)) + if (bpf_prog_is_dev_bound(aux)) return false; spin_lock(&map->owner.lock); @@ -2321,12 +2322,26 @@ bool bpf_prog_map_compatible(struct bpf_map *map, */ map->owner.type = prog_type; map->owner.jited = fp->jited; - map->owner.xdp_has_frags = fp->aux->xdp_has_frags; + map->owner.xdp_has_frags = aux->xdp_has_frags; + map->owner.attach_func_proto = aux->attach_func_proto; ret = true; } else { ret = map->owner.type == prog_type && map->owner.jited == fp->jited && - map->owner.xdp_has_frags == fp->aux->xdp_has_frags; + map->owner.xdp_has_frags == aux->xdp_has_frags; + if (ret && + map->owner.attach_func_proto != aux->attach_func_proto) { + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_STRUCT_OPS: + ret = false; + break; + default: + break; + } + } } spin_unlock(&map->owner.lock); From 763aa759d3b2c4f95b11855e3d37b860860107e2 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:54 +0800 Subject: [PATCH 0220/1052] bpf: Fix compare error in function retval_range_within After checking lsm hook return range in verifier, the test case "test_progs -t test_lsm" failed, and the failure log says: libbpf: prog 'test_int_hook': BPF program load failed: Invalid argument libbpf: prog 'test_int_hook': -- BEGIN PROG LOAD LOG -- 0: R1=ctx() R10=fp0 ; int BPF_PROG(test_int_hook, struct vm_area_struct *vma, @ lsm.c:89 0: (79) r0 = *(u64 *)(r1 +24) ; R0_w=scalar(smin=smin32=-4095,smax=smax32=0) R1=ctx() [...] 24: (b4) w0 = -1 ; R0_w=0xffffffff ; int BPF_PROG(test_int_hook, struct vm_area_struct *vma, @ lsm.c:89 25: (95) exit At program exit the register R0 has smin=4294967295 smax=4294967295 should have been in [-4095, 0] It can be seen that instruction "w0 = -1" zero extended -1 to 64-bit register r0, setting both smin and smax values of r0 to 4294967295. This resulted in a false reject when r0 was checked with range [-4095, 0]. Given bpf lsm does not return 64-bit values, this patch fixes it by changing the compare between r0 and return range from 64-bit operation to 32-bit operation for bpf lsm. 
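The extension mismatch can be reproduced outside the verifier. The following stand-alone user-space program (an illustration, not part of the patch) prints the two views of the value produced by the 32-bit move "w0 = -1":

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint32_t w0 = (uint32_t)-1;     /* "w0 = -1": 32-bit write, upper 32 bits zeroed */
          uint64_t r0 = w0;               /* the full 64-bit register the verifier tracks */

          /* 64-bit signed view: 4294967295, outside the LSM range [-4095, 0] */
          printf("s64 view: %lld\n", (long long)r0);
          /* 32-bit signed view: -1, inside [-4095, 0]; this is what the fix compares */
          printf("s32 view: %d\n", (int)(int32_t)r0);
          return 0;
  }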
Fixes: 8fa4ecd49b81 ("bpf: enforce exact retval range on subprog/callback exit") Signed-off-by: Xu Kuohai Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20240719110059.797546-5-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/verifier.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8caa7322f031..c419005a29dc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9984,9 +9984,13 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) return is_rbtree_lock_required_kfunc(kfunc_btf_id); } -static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) +static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg, + bool return_32bit) { - return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; + if (return_32bit) + return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; + else + return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; } static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) @@ -10023,8 +10027,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) if (err) return err; - /* enforce R0 return value range */ - if (!retval_range_within(callee->callback_ret_range, r0)) { + /* enforce R0 return value range, and bpf_callback_t returns 64bit */ + if (!retval_range_within(callee->callback_ret_range, r0, false)) { verbose_invalid_scalar(env, r0, callee->callback_ret_range, "At callback return", "R0"); return -EINVAL; @@ -15698,6 +15702,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; + bool return_32bit = false; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog || frame->in_exception_callback_fn) { @@ -15809,6 +15814,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char /* no restricted range, any return value is allowed */ if (range.minval == S32_MIN && range.maxval == S32_MAX) return 0; + return_32bit = true; } else if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value. 
@@ -15839,7 +15845,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char if (err) return err; - if (!retval_range_within(range, reg)) { + if (!retval_range_within(range, reg, return_32bit)) { verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); if (!is_subprog && prog->expected_attach_type == BPF_LSM_CGROUP && From 4dc7556490d77d5046844e7731d5d0a3055a3448 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:56 +0800 Subject: [PATCH 0221/1052] selftests/bpf: Avoid load failure for token_lsm.c The compiler optimized the two bpf progs in token_lsm.c to make return value from the bool variable in the "return -1" path, causing an unexpected rejection: 0: R1=ctx() R10=fp0 ; int BPF_PROG(bpf_token_capable, struct bpf_token *token, int cap) @ bpf_lsm.c:17 0: (b7) r6 = 0 ; R6_w=0 ; if (my_pid == 0 || my_pid != (bpf_get_current_pid_tgid() >> 32)) @ bpf_lsm.c:19 1: (18) r1 = 0xffffc9000102a000 ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5) 3: (61) r7 = *(u32 *)(r1 +0) ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5) R7_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 4: (15) if r7 == 0x0 goto pc+11 ; R7_w=scalar(smin=umin=umin32=1,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 5: (67) r7 <<= 32 ; R7_w=scalar(smax=0x7fffffff00000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000)) 6: (c7) r7 s>>= 32 ; R7_w=scalar(smin=0xffffffff80000000,smax=0x7fffffff) 7: (85) call bpf_get_current_pid_tgid#14 ; R0=scalar() 8: (77) r0 >>= 32 ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 9: (5d) if r0 != r7 goto pc+6 ; R0_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) R7=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) ; if (reject_capable) @ bpf_lsm.c:21 10: (18) r1 = 0xffffc9000102a004 ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5,off=4) 12: (71) r6 = *(u8 *)(r1 +0) ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5,off=4) R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) ; @ bpf_lsm.c:0 13: (87) r6 = -r6 ; R6_w=scalar() 14: (67) r6 <<= 56 ; R6_w=scalar(smax=0x7f00000000000000,umax=0xff00000000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xff00000000000000)) 15: (c7) r6 s>>= 56 ; R6_w=scalar(smin=smin32=-128,smax=smax32=127) ; int BPF_PROG(bpf_token_capable, struct bpf_token *token, int cap) @ bpf_lsm.c:17 16: (bf) r0 = r6 ; R0_w=scalar(id=1,smin=smin32=-128,smax=smax32=127) R6_w=scalar(id=1,smin=smin32=-128,smax=smax32=127) 17: (95) exit At program exit the register R0 has smin=-128 smax=127 should have been in [-4095, 0] To avoid this failure, change the variable type from bool to int. 
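A minimal user-space sketch (plain C, not part of the selftest) of what instructions 13-15 in the log above do to the value: the negated low byte is sign-extended, so the result can only ever lie in [-128, 127], and the verifier no longer sees the constant -1 that the source intends to return.

#include <stdio.h>
#include <stdint.h>

/* mirrors "r6 = -r6; r6 <<= 56; r6 s>>= 56" from the log above,
 * assuming the usual two's-complement narrowing */
static int64_t lowered_retval(uint8_t reject)
{
        int8_t low = (int8_t)(uint8_t)-reject;  /* negate, keep only the low byte */

        return low;                             /* <<= 56 then s>>= 56: sign-extend that byte */
}

int main(void)
{
        printf("%lld %lld\n",
               (long long)lowered_retval(0),    /* 0 */
               (long long)lowered_retval(1));   /* -1, but only provably in [-128, 127] */
        return 0;
}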
Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-7-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/token_lsm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/token_lsm.c b/tools/testing/selftests/bpf/progs/token_lsm.c index e4d59b6ba743..a6002d073b1b 100644 --- a/tools/testing/selftests/bpf/progs/token_lsm.c +++ b/tools/testing/selftests/bpf/progs/token_lsm.c @@ -8,8 +8,8 @@ char _license[] SEC("license") = "GPL"; int my_pid; -bool reject_capable; -bool reject_cmd; +int reject_capable; +int reject_cmd; SEC("lsm/bpf_token_capable") int BPF_PROG(token_capable, struct bpf_token *token, int cap) From 2b23b6c0f03c94dbacff2abdfd45490eca9d7f6e Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:57 +0800 Subject: [PATCH 0222/1052] selftests/bpf: Add return value checks for failed tests The return ranges of some bpf lsm test progs can not be deduced by the verifier accurately. To avoid erroneous rejections, add explicit return value checks for these progs. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-8-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/err.h | 10 ++++++++++ tools/testing/selftests/bpf/progs/test_sig_in_xattr.c | 4 ++++ .../selftests/bpf/progs/test_verify_pkcs7_sig.c | 8 ++++++-- .../selftests/bpf/progs/verifier_global_subprogs.c | 7 ++++++- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/err.h b/tools/testing/selftests/bpf/progs/err.h index d66d283d9e59..38529779a236 100644 --- a/tools/testing/selftests/bpf/progs/err.h +++ b/tools/testing/selftests/bpf/progs/err.h @@ -5,6 +5,16 @@ #define MAX_ERRNO 4095 #define IS_ERR_VALUE(x) (unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO +#define __STR(x) #x + +#define set_if_not_errno_or_zero(x, y) \ +({ \ + asm volatile ("if %0 s< -4095 goto +1\n" \ + "if %0 s<= 0 goto +1\n" \ + "%0 = " __STR(y) "\n" \ + : "+r"(x)); \ +}) + static inline int IS_ERR_OR_NULL(const void *ptr) { return !ptr || IS_ERR_VALUE((unsigned long)ptr); diff --git a/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c b/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c index 2f0eb1334d65..8ef6b39335b6 100644 --- a/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c +++ b/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c @@ -6,6 +6,7 @@ #include #include #include "bpf_kfuncs.h" +#include "err.h" char _license[] SEC("license") = "GPL"; @@ -79,5 +80,8 @@ int BPF_PROG(test_file_open, struct file *f) ret = bpf_verify_pkcs7_signature(&digest_ptr, &sig_ptr, trusted_keyring); bpf_key_put(trusted_keyring); + + set_if_not_errno_or_zero(ret, -EFAULT); + return ret; } diff --git a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c index f42e9f3831a1..12034a73ee2d 100644 --- a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c +++ b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c @@ -11,6 +11,7 @@ #include #include #include "bpf_kfuncs.h" +#include "err.h" #define MAX_DATA_SIZE (1024 * 1024) #define MAX_SIG_SIZE 1024 @@ -55,12 +56,12 @@ int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) ret = bpf_probe_read_kernel(&value, sizeof(value), &attr->value); if (ret) - return ret; + goto out; ret = bpf_copy_from_user(data_val, 
sizeof(struct data), (void *)(unsigned long)value); if (ret) - return ret; + goto out; if (data_val->data_len > sizeof(data_val->data)) return -EINVAL; @@ -84,5 +85,8 @@ int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) bpf_key_put(trusted_keyring); +out: + set_if_not_errno_or_zero(ret, -EFAULT); + return ret; } diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index a9fc30ed4d73..20904cd2baa2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -7,6 +7,7 @@ #include "bpf_misc.h" #include "xdp_metadata.h" #include "bpf_kfuncs.h" +#include "err.h" /* The compiler may be able to detect the access to uninitialized memory in the routines performing out of bound memory accesses and @@ -331,7 +332,11 @@ SEC("?lsm/bpf") __success __log_level(2) int BPF_PROG(arg_tag_ctx_lsm) { - return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + int ret; + + ret = tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + set_if_not_errno_or_zero(ret, -1); + return ret; } SEC("?struct_ops/test_1") From d463dd9c9aa24b17ccb8ed76bdd7768baf857b48 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:58 +0800 Subject: [PATCH 0223/1052] selftests/bpf: Add test for lsm tail call Add test for lsm tail call to ensure tail call can only be used between bpf lsm progs attached to the same hook. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-9-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/prog_tests/test_lsm.c | 46 ++++++++++++++++++- .../selftests/bpf/progs/lsm_tailcall.c | 34 ++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/lsm_tailcall.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index 16175d579bc7..2a27f3714f5c 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -12,6 +12,7 @@ #include #include "lsm.skel.h" +#include "lsm_tailcall.skel.h" char *CMD_ARGS[] = {"true", NULL}; @@ -95,7 +96,7 @@ static int test_lsm(struct lsm *skel) return 0; } -void test_test_lsm(void) +static void test_lsm_basic(void) { struct lsm *skel = NULL; int err; @@ -114,3 +115,46 @@ void test_test_lsm(void) close_prog: lsm__destroy(skel); } + +static void test_lsm_tailcall(void) +{ + struct lsm_tailcall *skel = NULL; + int map_fd, prog_fd; + int err, key; + + skel = lsm_tailcall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "lsm_tailcall__skel_load")) + goto close_prog; + + map_fd = bpf_map__fd(skel->maps.jmp_table); + if (CHECK_FAIL(map_fd < 0)) + goto close_prog; + + prog_fd = bpf_program__fd(skel->progs.lsm_file_permission_prog); + if (CHECK_FAIL(prog_fd < 0)) + goto close_prog; + + key = 0; + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (CHECK_FAIL(!err)) + goto close_prog; + + prog_fd = bpf_program__fd(skel->progs.lsm_file_alloc_security_prog); + if (CHECK_FAIL(prog_fd < 0)) + goto close_prog; + + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto close_prog; + +close_prog: + lsm_tailcall__destroy(skel); +} + +void test_test_lsm(void) +{ + if (test__start_subtest("lsm_basic")) + test_lsm_basic(); + if (test__start_subtest("lsm_tailcall")) + test_lsm_tailcall(); +} diff 
--git a/tools/testing/selftests/bpf/progs/lsm_tailcall.c b/tools/testing/selftests/bpf/progs/lsm_tailcall.c new file mode 100644 index 000000000000..49c075ce2d4c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lsm_tailcall.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Huawei Technologies Co., Ltd */ + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +SEC("lsm/file_permission") +int lsm_file_permission_prog(void *ctx) +{ + return 0; +} + +SEC("lsm/file_alloc_security") +int lsm_file_alloc_security_prog(void *ctx) +{ + return 0; +} + +SEC("lsm/file_alloc_security") +int lsm_file_alloc_security_entry(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} From 04d8243b1f83251bd599fdd3ea91e89039b6a557 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:59 +0800 Subject: [PATCH 0224/1052] selftests/bpf: Add verifier tests for bpf lsm Add verifier tests to check bpf lsm return values and disabled hooks. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-10-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_lsm.c | 162 ++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_lsm.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 9dc3687bc406..ff1c7da1d06e 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -88,6 +88,7 @@ #include "verifier_xdp.skel.h" #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" +#include "verifier_lsm.skel.h" #define MAX_ENTRIES 11 @@ -206,6 +207,7 @@ void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } +void test_verifier_lsm(void) { RUN(verifier_lsm); } static int init_test_val_map(struct bpf_object *obj, char *map_name) { diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c new file mode 100644 index 000000000000..32e5e779cb96 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +SEC("lsm/file_alloc_security") +__description("lsm bpf prog with -4095~0 retval. test 1") +__success +__naked int errno_zero_retval_test1(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_alloc_security") +__description("lsm bpf prog with -4095~0 retval. test 2") +__success +__naked int errno_zero_retval_test2(void *ctx) +{ + asm volatile ( + "r0 = -4095;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_mprotect") +__description("lsm bpf prog with -4095~0 retval. 
test 4") +__failure __msg("R0 has smin=-4096 smax=-4096 should have been in [-4095, 0]") +__naked int errno_zero_retval_test4(void *ctx) +{ + asm volatile ( + "r0 = -4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_mprotect") +__description("lsm bpf prog with -4095~0 retval. test 5") +__failure __msg("R0 has smin=4096 smax=4096 should have been in [-4095, 0]") +__naked int errno_zero_retval_test5(void *ctx) +{ + asm volatile ( + "r0 = 4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_mprotect") +__description("lsm bpf prog with -4095~0 retval. test 6") +__failure __msg("R0 has smin=1 smax=1 should have been in [-4095, 0]") +__naked int errno_zero_retval_test6(void *ctx) +{ + asm volatile ( + "r0 = 1;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 1") +__success +__naked int bool_retval_test1(void *ctx) +{ + asm volatile ( + "r0 = 1;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 2") +__success +__success +__naked int bool_retval_test2(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 3") +__failure __msg("R0 has smin=-1 smax=-1 should have been in [0, 1]") +__naked int bool_retval_test3(void *ctx) +{ + asm volatile ( + "r0 = -1;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 4") +__failure __msg("R0 has smin=2 smax=2 should have been in [0, 1]") +__naked int bool_retval_test4(void *ctx) +{ + asm volatile ( + "r0 = 2;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_free_security") +__success +__description("lsm bpf prog with void retval. test 1") +__naked int void_retval_test1(void *ctx) +{ + asm volatile ( + "r0 = -4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_free_security") +__success +__description("lsm bpf prog with void retval. test 2") +__naked int void_retval_test2(void *ctx) +{ + asm volatile ( + "r0 = 4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/getprocattr") +__description("lsm disabled hook: getprocattr") +__failure __msg("points to disabled hook") +__naked int disabled_hook_test1(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/setprocattr") +__description("lsm disabled hook: setprocattr") +__failure __msg("points to disabled hook") +__naked int disabled_hook_test2(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/ismaclabel") +__description("lsm disabled hook: ismaclabel") +__failure __msg("points to disabled hook") +__naked int disabled_hook_test3(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +char _license[] SEC("license") = "GPL"; From 41c24102af7b6236277a214428b203d51a3462df Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2024 14:40:29 -0700 Subject: [PATCH 0225/1052] selftests/bpf: Filter out _GNU_SOURCE when compiling test_cpp Jakub reports build failures when merging linux/master with net tree: CXX test_cpp In file included from :454: :2:9: error: '_GNU_SOURCE' macro redefined [-Werror,-Wmacro-redefined] 2 | #define _GNU_SOURCE | ^ :445:9: note: previous definition is here 445 | #define _GNU_SOURCE 1 The culprit is commit cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to CFLAGS in lib.mk") which unconditionally added -D_GNU_SOUCE to CLFAGS. 
Apparently clang++ also unconditionally adds it for the C++ targets [0] which causes a conflict. Add small change in the selftests makefile to filter it out for test_cpp. Not sure which tree it should go via, targeting bpf for now, but net might be better? 0: https://stackoverflow.com/questions/11670581/why-is-gnu-source-defined-by-default-and-how-to-turn-it-off Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240725214029.1760809-1-sdf@fomichev.me --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dd49c1d23a60..81d4757ecd4c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -713,7 +713,7 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp # Make sure we are able to include and link libbpf against c++. $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) - $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ # Benchmark runner $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) From aa8ebb270c66cea1f56a25d0f938036e91ad085a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 22 Jul 2024 19:08:15 -0700 Subject: [PATCH 0226/1052] selftests/bpf: Workaround strict bpf_lsm return value check. test_progs-no_alu32 -t libbpf_get_fd_by_id_opts is being rejected by the verifier with the following error due to compiler optimization: 6: (67) r0 <<= 62 ; R0_w=scalar(smax=0x4000000000000000,umax=0xc000000000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xc000000000000000)) 7: (c7) r0 s>>= 63 ; R0_w=scalar(smin=smin32=-1,smax=smax32=0) ; @ test_libbpf_get_fd_by_id_opts.c:0 8: (57) r0 &= -13 ; R0_w=scalar(smax=0x7ffffffffffffff3,umax=0xfffffffffffffff3,smax32=0x7ffffff3,umax32=0xfffffff3,var_off=(0x0; 0xfffffffffffffff3)) ; int BPF_PROG(check_access, struct bpf_map *map, fmode_t fmode) @ test_libbpf_get_fd_by_id_opts.c:27 9: (95) exit At program exit the register R0 has smax=9223372036854775795 should have been in [-4095, 0] Workaround by adding barrier(). Eventually the verifier will be able to recognize it. Fixes: 5d99e198be27 ("bpf, lsm: Add check for BPF LSM return value") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c b/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c index f5ac5f3e8919..568816307f71 100644 --- a/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c +++ b/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c @@ -31,6 +31,7 @@ int BPF_PROG(check_access, struct bpf_map *map, fmode_t fmode) if (fmode & FMODE_WRITE) return -EACCES; + barrier(); return 0; } From af994e31b75e30941ef32796d0aa8510db00b32a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:03 +0800 Subject: [PATCH 0227/1052] selftests/bpf: Drop make_client in sk_lookup This patch uses the new helper connect_to_addr_str() in sk_lookup.c to create the client socket and connect to the server, instead of using local defined function make_client(). This local function can be dropped then. 
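For context, a sketch of the shared helper's call shape as it is used in the hunks below (the address and port here are illustrative, and the trailing NULL selects the helper's default options):

static void example_client(void)
{
        int fd;

        /* creates the client socket and connects it in one call */
        fd = connect_to_addr_str(AF_INET, SOCK_STREAM, "127.0.0.1", 8080, NULL);
        if (!ASSERT_OK_FD(fd, "connect_to_addr_str"))
                return;
        /* ... exercise the connection ... */
        close(fd);
}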
Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/058199d7ab46802249dae066ca22c98f6be508ee.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/prog_tests/sk_lookup.c | 34 ++++--------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index ae87c00867ba..9510a3f83787 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -229,27 +229,6 @@ static int make_server(int sotype, const char *ip, int port, return -1; } -static int make_client(int sotype, const char *ip, int port) -{ - struct sockaddr_storage addr = {0}; - int err, fd; - - fd = make_socket(sotype, ip, port, &addr); - if (fd < 0) - return -1; - - err = connect(fd, (void *)&addr, inetaddr_len(&addr)); - if (CHECK(err, "make_client", "connect")) { - log_err("failed to connect client socket"); - goto fail; - } - - return fd; -fail: - close(fd); - return -1; -} - static __u64 socket_cookie(int fd) { __u64 cookie; @@ -646,8 +625,9 @@ static void run_lookup_prog(const struct test *t) goto close; } - client_fd = make_client(t->sotype, t->connect_to.ip, t->connect_to.port); - if (client_fd < 0) + client_fd = connect_to_addr_str(is_ipv6(t->connect_to.ip) ? AF_INET6 : AF_INET, + t->sotype, t->connect_to.ip, t->connect_to.port, NULL); + if (!ASSERT_OK_FD(client_fd, "connect_to_addr_str")) goto close; if (t->sotype == SOCK_STREAM) @@ -1152,8 +1132,8 @@ static void run_sk_assign_connected(struct test_sk_lookup *skel, if (server_fd < 0) return; - connected_fd = make_client(sotype, EXT_IP4, EXT_PORT); - if (connected_fd < 0) + connected_fd = connect_to_addr_str(AF_INET, sotype, EXT_IP4, EXT_PORT, NULL); + if (!ASSERT_OK_FD(connected_fd, "connect_to_addr_str")) goto out_close_server; /* Put a connected socket in redirect map */ @@ -1166,8 +1146,8 @@ static void run_sk_assign_connected(struct test_sk_lookup *skel, goto out_close_connected; /* Try to redirect TCP SYN / UDP packet to a connected socket */ - client_fd = make_client(sotype, EXT_IP4, EXT_PORT); - if (client_fd < 0) + client_fd = connect_to_addr_str(AF_INET, sotype, EXT_IP4, EXT_PORT, NULL); + if (!ASSERT_OK_FD(client_fd, "connect_to_addr_str")) goto out_unlink_prog; if (sotype == SOCK_DGRAM) { send_byte(client_fd); From 01c2f776ed372155f9e80bd787324f26601914de Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:04 +0800 Subject: [PATCH 0228/1052] selftests/bpf: Drop make_socket in sk_lookup This patch uses the public network helpers client_socket() + make_sockaddr() in sk_lookup.c to create the client socket, set the timeout sockopts, and make the connecting address. The locally defined function make_socket() can then be dropped.
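A condensed sketch of the client-setup pattern these helpers provide (a sketch only; the address, port, and use of make_sockaddr()'s optional length output are illustrative): client_socket() creates the socket and sets the timeout sockopts, make_sockaddr() builds the destination address, and the test stays in charge of the connect() call itself.

static void example_connect(void)
{
        struct sockaddr_storage dst = {};
        socklen_t len;
        int fd, err;

        fd = client_socket(AF_INET, SOCK_STREAM, NULL);
        if (!ASSERT_OK_FD(fd, "client_socket"))
                return;

        err = make_sockaddr(AF_INET, "127.0.0.1", 8080, &dst, &len);
        if (!ASSERT_OK(err, "make_sockaddr"))
                goto out;

        if (connect(fd, (void *)&dst, len))
                goto out;
        /* ... exercise the connection ... */
out:
        close(fd);
}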
Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/588771977ac48c27f73526d8421a84b91d7cf218.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/prog_tests/sk_lookup.c | 61 ++++++------------- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 9510a3f83787..88f2f9efeea5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -47,8 +47,6 @@ #define INT_IP6 "fd00::2" #define INT_PORT 8008 -#define IO_TIMEOUT_SEC 3 - enum server { SERVER_A = 0, SERVER_B = 1, @@ -114,40 +112,6 @@ static socklen_t inetaddr_len(const struct sockaddr_storage *addr) addr->ss_family == AF_INET6 ? sizeof(struct sockaddr_in6) : 0); } -static int make_socket(int sotype, const char *ip, int port, - struct sockaddr_storage *addr) -{ - struct timeval timeo = { .tv_sec = IO_TIMEOUT_SEC }; - int err, family, fd; - - family = is_ipv6(ip) ? AF_INET6 : AF_INET; - err = make_sockaddr(family, ip, port, addr, NULL); - if (CHECK(err, "make_address", "failed\n")) - return -1; - - fd = socket(addr->ss_family, sotype, 0); - if (CHECK(fd < 0, "socket", "failed\n")) { - log_err("failed to make socket"); - return -1; - } - - err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); - if (CHECK(err, "setsockopt(SO_SNDTIMEO)", "failed\n")) { - log_err("failed to set SNDTIMEO"); - close(fd); - return -1; - } - - err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); - if (CHECK(err, "setsockopt(SO_RCVTIMEO)", "failed\n")) { - log_err("failed to set RCVTIMEO"); - close(fd); - return -1; - } - - return fd; -} - static int setsockopts(int fd, void *opts) { struct cb_opts *co = (struct cb_opts *)opts; @@ -842,6 +806,7 @@ static void test_redirect_lookup(struct test_sk_lookup *skel) static void drop_on_lookup(const struct test *t) { + int family = is_ipv6(t->connect_to.ip) ? AF_INET6 : AF_INET; struct sockaddr_storage dst = {}; int client_fd, server_fd, err; struct bpf_link *lookup_link; @@ -856,11 +821,13 @@ static void drop_on_lookup(const struct test *t) if (server_fd < 0) goto detach; - client_fd = make_socket(t->sotype, t->connect_to.ip, - t->connect_to.port, &dst); - if (client_fd < 0) + client_fd = client_socket(family, t->sotype, NULL); + if (!ASSERT_OK_FD(client_fd, "client_socket")) goto close_srv; + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + if (!ASSERT_OK(err, "make_sockaddr")) + goto close_all; err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); if (t->sotype == SOCK_DGRAM) { err = send_byte(client_fd); @@ -956,6 +923,7 @@ static void test_drop_on_lookup(struct test_sk_lookup *skel) static void drop_on_reuseport(const struct test *t) { + int family = is_ipv6(t->connect_to.ip) ? 
AF_INET6 : AF_INET; struct sockaddr_storage dst = { 0 }; int client, server1, server2, err; struct bpf_link *lookup_link; @@ -980,11 +948,13 @@ static void drop_on_reuseport(const struct test *t) if (server2 < 0) goto close_srv1; - client = make_socket(t->sotype, t->connect_to.ip, - t->connect_to.port, &dst); - if (client < 0) + client = client_socket(family, t->sotype, NULL); + if (!ASSERT_OK_FD(client, "client_socket")) goto close_srv2; + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + if (!ASSERT_OK(err, "make_sockaddr")) + goto close_all; err = connect(client, (void *)&dst, inetaddr_len(&dst)); if (t->sotype == SOCK_DGRAM) { err = send_byte(client); @@ -1228,10 +1198,13 @@ static void run_multi_prog_lookup(const struct test_multi_prog *t) if (err) goto out_close_server; - client_fd = make_socket(SOCK_STREAM, EXT_IP4, EXT_PORT, &dst); - if (client_fd < 0) + client_fd = client_socket(AF_INET, SOCK_STREAM, NULL); + if (!ASSERT_OK_FD(client_fd, "client_socket")) goto out_close_server; + err = make_sockaddr(AF_INET, EXT_IP4, EXT_PORT, &dst, NULL); + if (!ASSERT_OK(err, "make_sockaddr")) + goto out_close_client; err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); if (CHECK(err && !t->expect_errno, "connect", "unexpected error %d\n", errno)) From c3c41e016cca7cc6b6d9cd1322fdf9be845ccd06 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:05 +0800 Subject: [PATCH 0229/1052] selftests/bpf: Drop inetaddr_len in sk_lookup No need to use a dedicated helper inetaddr_len() to get the length of the IPv4 or IPv6 address, it can be got by make_sockaddr(), this patch drops it. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/32e2a4122921051da38a6e4fbb2ebee5f0af5a4e.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/prog_tests/sk_lookup.c | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 88f2f9efeea5..96f3863da8bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -106,12 +106,6 @@ static int attach_reuseport(int sock_fd, struct bpf_program *reuseport_prog) return 0; } -static socklen_t inetaddr_len(const struct sockaddr_storage *addr) -{ - return (addr->ss_family == AF_INET ? sizeof(struct sockaddr_in) : - addr->ss_family == AF_INET6 ? 
sizeof(struct sockaddr_in6) : 0); -} - static int setsockopts(int fd, void *opts) { struct cb_opts *co = (struct cb_opts *)opts; @@ -810,6 +804,7 @@ static void drop_on_lookup(const struct test *t) struct sockaddr_storage dst = {}; int client_fd, server_fd, err; struct bpf_link *lookup_link; + socklen_t len; ssize_t n; lookup_link = attach_lookup_prog(t->lookup_prog); @@ -825,10 +820,10 @@ static void drop_on_lookup(const struct test *t) if (!ASSERT_OK_FD(client_fd, "client_socket")) goto close_srv; - err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, &len); if (!ASSERT_OK(err, "make_sockaddr")) goto close_all; - err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); + err = connect(client_fd, (void *)&dst, len); if (t->sotype == SOCK_DGRAM) { err = send_byte(client_fd); if (err) @@ -927,6 +922,7 @@ static void drop_on_reuseport(const struct test *t) struct sockaddr_storage dst = { 0 }; int client, server1, server2, err; struct bpf_link *lookup_link; + socklen_t len; ssize_t n; lookup_link = attach_lookup_prog(t->lookup_prog); @@ -952,10 +948,10 @@ static void drop_on_reuseport(const struct test *t) if (!ASSERT_OK_FD(client, "client_socket")) goto close_srv2; - err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, &len); if (!ASSERT_OK(err, "make_sockaddr")) goto close_all; - err = connect(client, (void *)&dst, inetaddr_len(&dst)); + err = connect(client, (void *)&dst, len); if (t->sotype == SOCK_DGRAM) { err = send_byte(client); if (err) @@ -1169,6 +1165,7 @@ static void run_multi_prog_lookup(const struct test_multi_prog *t) int map_fd, server_fd, client_fd; struct bpf_link *link1, *link2; int prog_idx, done, err; + socklen_t len; map_fd = bpf_map__fd(t->run_map); @@ -1202,10 +1199,10 @@ static void run_multi_prog_lookup(const struct test_multi_prog *t) if (!ASSERT_OK_FD(client_fd, "client_socket")) goto out_close_server; - err = make_sockaddr(AF_INET, EXT_IP4, EXT_PORT, &dst, NULL); + err = make_sockaddr(AF_INET, EXT_IP4, EXT_PORT, &dst, &len); if (!ASSERT_OK(err, "make_sockaddr")) goto out_close_client; - err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); + err = connect(client_fd, (void *)&dst, len); if (CHECK(err && !t->expect_errno, "connect", "unexpected error %d\n", errno)) goto out_close_client; From 71a2fbaf9c91b0d863e11ec03f7168893904fbab Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:06 +0800 Subject: [PATCH 0230/1052] selftests/bpf: Drop __start_server in network_helpers The helper start_server_addr() is a wrapper of __start_server(), the only difference between them is __start_server() accepts a sockaddr type address parameter, but start_server_addr() accepts a sockaddr_storage one. This patch drops __start_server(), and updates the callers to invoke start_server_addr() instead. 
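A condensed sketch of the two surviving entry points (address and port illustrative): start_server_str() builds the address itself, while callers that already hold a sockaddr_storage call start_server_addr() directly, which is exactly what start_server_str() does internally in the hunk below.

static int example_server(void)
{
        struct sockaddr_storage addr;
        socklen_t len;

        if (make_sockaddr(AF_INET, "127.0.0.1", 8080, &addr, &len))
                return -1;
        /* NULL selects the helper's default options */
        return start_server_addr(SOCK_STREAM, &addr, len, NULL);
}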
Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/31399df7cb957b7c233e79963b0aa0dc4278d273.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 9c98a60cf1e2..a3f0a49fb26f 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -80,12 +80,15 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) -static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen, - const struct network_helper_opts *opts) +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { int fd; - fd = socket(addr->sa_family, type, opts->proto); + if (!opts) + opts = &default_opts; + + fd = socket(addr->ss_family, type, opts->proto); if (fd < 0) { log_err("Failed to create server socket"); return -1; @@ -100,7 +103,7 @@ static int __start_server(int type, const struct sockaddr *addr, socklen_t addrl goto error_close; } - if (bind(fd, addr, addrlen) < 0) { + if (bind(fd, (struct sockaddr *)addr, addrlen) < 0) { log_err("Failed to bind socket"); goto error_close; } @@ -131,7 +134,7 @@ int start_server_str(int family, int type, const char *addr_str, __u16 port, if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) return -1; - return __start_server(type, (struct sockaddr *)&addr, addrlen, opts); + return start_server_addr(type, &addr, addrlen, opts); } int start_server(int family, int type, const char *addr_str, __u16 port, @@ -173,7 +176,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, if (!fds) return NULL; - fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); + fds[0] = start_server_addr(type, &addr, addrlen, &opts); if (fds[0] == -1) goto close_fds; nr_fds = 1; @@ -182,7 +185,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, goto close_fds; for (; nr_fds < nr_listens; nr_fds++) { - fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); + fds[nr_fds] = start_server_addr(type, &addr, addrlen, &opts); if (fds[nr_fds] == -1) goto close_fds; } @@ -194,15 +197,6 @@ int *start_reuseport_server(int family, int type, const char *addr_str, return NULL; } -int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, - const struct network_helper_opts *opts) -{ - if (!opts) - opts = &default_opts; - - return __start_server(type, (struct sockaddr *)addr, len, opts); -} - void free_fds(int *fds, unsigned int nr_close_fds) { if (fds) { From c7db4873fbd63918c124a5d67182492c78ba705b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 22 Jul 2024 22:14:55 -0700 Subject: [PATCH 0231/1052] selftests/bpf: Add a test for mmap-able map in map Regular BPF hash map is not mmap-able from user space. However, map-in-map with outer map of type BPF_MAP_TYPE_HASH_OF_MAPS and mmap-able array as inner map can perform similar operations as a mmap-able hash map. This can be used by applications that benefit from fast accesses to some local data. Add a selftest to show this use case. 
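The user-space half of the idea, condensed from the selftest added below (error handling omitted; page_size and skel come from the test): mmap the inner array once, publish its fd in the outer hash under the key the program looks up, and whatever the BPF program writes through the outer map becomes visible through the mapping.

        int inner_fd = bpf_map__fd(skel->maps.inner_array);
        __u64 *val = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                          MAP_SHARED, inner_fd, 0);
        __u32 key = getpid();

        bpf_map__update_elem(skel->maps.outer_map, &key, sizeof(key),
                             &inner_fd, sizeof(inner_fd), BPF_ANY);
        /* ... trigger the program ...; *val now holds what it wrote */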
Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240723051455.1589192-1-song@kernel.org Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- .../bpf/prog_tests/test_mmap_inner_array.c | 57 +++++++++++++++++++ .../selftests/bpf/progs/mmap_inner_array.c | 57 +++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c create mode 100644 tools/testing/selftests/bpf/progs/mmap_inner_array.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c b/tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c new file mode 100644 index 000000000000..ce745776ed18 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "mmap_inner_array.skel.h" + +void test_mmap_inner_array(void) +{ + const long page_size = sysconf(_SC_PAGE_SIZE); + struct mmap_inner_array *skel; + int inner_array_fd, err; + void *tmp; + __u64 *val; + + skel = mmap_inner_array__open_and_load(); + + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + inner_array_fd = bpf_map__fd(skel->maps.inner_array); + tmp = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, inner_array_fd, 0); + if (!ASSERT_OK_PTR(tmp, "inner array mmap")) + goto out; + val = (void *)tmp; + + err = mmap_inner_array__attach(skel); + if (!ASSERT_OK(err, "attach")) + goto out_unmap; + + skel->bss->pid = getpid(); + usleep(1); + + /* pid is set, pid_match == true and outer_map_match == false */ + ASSERT_TRUE(skel->bss->pid_match, "pid match 1"); + ASSERT_FALSE(skel->bss->outer_map_match, "outer map match 1"); + ASSERT_FALSE(skel->bss->done, "done 1"); + ASSERT_EQ(*val, 0, "value match 1"); + + err = bpf_map__update_elem(skel->maps.outer_map, + &skel->bss->pid, sizeof(skel->bss->pid), + &inner_array_fd, sizeof(inner_array_fd), + BPF_ANY); + if (!ASSERT_OK(err, "update elem")) + goto out_unmap; + usleep(1); + + /* outer map key is set, outer_map_match == true */ + ASSERT_TRUE(skel->bss->pid_match, "pid match 2"); + ASSERT_TRUE(skel->bss->outer_map_match, "outer map match 2"); + ASSERT_TRUE(skel->bss->done, "done 2"); + ASSERT_EQ(*val, skel->data->match_value, "value match 2"); + +out_unmap: + munmap(tmp, page_size); +out: + mmap_inner_array__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/mmap_inner_array.c b/tools/testing/selftests/bpf/progs/mmap_inner_array.c new file mode 100644 index 000000000000..90aacbc2938a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mmap_inner_array.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include "vmlinux.h" +#include + +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct inner_array_type { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_MMAPABLE); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 1); +} inner_array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(key_size, 4); + __uint(value_size, 4); + __uint(max_entries, 1); + __array(values, struct inner_array_type); +} outer_map SEC(".maps"); + +int pid = 0; +__u64 match_value = 0x13572468; +bool done = false; +bool pid_match = false; +bool outer_map_match = false; + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int add_to_list_in_inner_array(void *ctx) +{ + __u32 curr_pid, zero = 0; + struct bpf_map *map; + __u64 *value; + + curr_pid = (u32)bpf_get_current_pid_tgid(); + if (done || curr_pid != pid) + return 0; + + pid_match = true; + map = bpf_map_lookup_elem(&outer_map, &curr_pid); + if (!map) + return 0; + + outer_map_match = true; + value = bpf_map_lookup_elem(map, &zero); + if (!value) + return 0; + + *value = match_value; + done = true; + return 0; +} From a0ef659d03d2fc66c270cee791a1c5ebc0ff3bac Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 23 Jul 2024 03:07:00 +0000 Subject: [PATCH 0232/1052] selftests/bpf: Don't include .d files on make clean Ignore generated %.test.o dependencies when make goal is clean or docs-clean. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/all/oNTIdax7aWGJdEgabzTqHzF4r-WTERrV1e1cNaPQMp-UhYUQpozXqkbuAlLBulczr6I99-jM5x3dxv56JJowaYBkm765R9Aa9kyrVuCl_kA=@pm.me Link: https://lore.kernel.org/bpf/K69Y8OKMLXBWR0dtOfsC4J46-HxeQfvqoFx1CysCm7u19HRx4MB6yAKOFkM6X-KAx2EFuCcCh_9vYWpsgQXnAer8oQ8PMeDEuiRMYECuGH4=@pm.me --- tools/testing/selftests/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 05b234248b38..74f829952280 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -608,7 +608,9 @@ $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) +ifeq ($(filter clean docs-clean,$(MAKECMDGOALS)),) include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) +endif $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ From c7ad90736763b4e6bed10788ec07020a4f15dc32 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 17:13:28 -0700 Subject: [PATCH 0233/1052] selftests/bpf: Add missing system defines for mips Update get_sys_includes in Makefile with missing MIPS-related definitions to fix many, many compilation errors building selftests/bpf. 
The following added defines drive conditional logic in system headers for word-size and endianness selection: MIPSEL, MIPSEB _MIPS_SZPTR _MIPS_SZLONG _MIPS_SIM, _ABIO32, _ABIN32, _ABI64 Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/f3cfceaf5299cdd2ac0e0a36072d6ca7be23e603.1721692479.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 74f829952280..a7c797dd075f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -396,7 +396,8 @@ define get_sys_includes $(shell $(1) $(2) -v -E - &1 \ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ $(shell $(1) $(2) -dM -E - Date: Mon, 22 Jul 2024 22:27:57 +0200 Subject: [PATCH 0234/1052] selftests/bpf: Add uprobe fail tests for uprobe multi Adding tests for checking on recovery after failing to attach uprobe. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240722202758.3889061-2-jolsa@kernel.org --- .../bpf/prog_tests/uprobe_multi_test.c | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index bf6ca8e3eb13..e6255d4df81d 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -516,6 +516,122 @@ static void test_attach_api_fails(void) uprobe_multi__destroy(skel); } +#ifdef __x86_64__ +noinline void uprobe_multi_error_func(void) +{ + /* + * If --fcf-protection=branch is enabled the gcc generates endbr as + * first instruction, so marking the exact address of int3 with the + * symbol to be used in the attach_uprobe_fail_trap test below. + */ + asm volatile ( + ".globl uprobe_multi_error_func_int3; \n" + "uprobe_multi_error_func_int3: \n" + "int3 \n" + ); +} + +/* + * Attaching uprobe on uprobe_multi_error_func results in error + * because it already starts with int3 instruction. 
+ */ +static void attach_uprobe_fail_trap(struct uprobe_multi *skel) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + const char *syms[4] = { + "uprobe_multi_func_1", + "uprobe_multi_func_2", + "uprobe_multi_func_3", + "uprobe_multi_error_func_int3", + }; + + opts.syms = syms; + opts.cnt = ARRAY_SIZE(syms); + + skel->links.uprobe = bpf_program__attach_uprobe_multi(skel->progs.uprobe, -1, + "/proc/self/exe", NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.uprobe, "bpf_program__attach_uprobe_multi")) { + bpf_link__destroy(skel->links.uprobe); + skel->links.uprobe = NULL; + } +} +#else +static void attach_uprobe_fail_trap(struct uprobe_multi *skel) { } +#endif + +short sema_1 __used, sema_2 __used; + +static void attach_uprobe_fail_refctr(struct uprobe_multi *skel) +{ + unsigned long *tmp_offsets = NULL, *tmp_ref_ctr_offsets = NULL; + unsigned long offsets[3], ref_ctr_offsets[3]; + LIBBPF_OPTS(bpf_link_create_opts, opts); + const char *path = "/proc/self/exe"; + const char *syms[3] = { + "uprobe_multi_func_1", + "uprobe_multi_func_2", + }; + const char *sema[3] = { + "sema_1", + "sema_2", + }; + int prog_fd, link_fd, err; + + prog_fd = bpf_program__fd(skel->progs.uprobe_extra); + + err = elf_resolve_syms_offsets("/proc/self/exe", 2, (const char **) &syms, + &tmp_offsets, STT_FUNC); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_func")) + return; + + err = elf_resolve_syms_offsets("/proc/self/exe", 2, (const char **) &sema, + &tmp_ref_ctr_offsets, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + goto cleanup; + + /* + * We attach to 3 uprobes on 2 functions, so 2 uprobes share single function, + * but with different ref_ctr_offset which is not allowed and results in fail. + */ + offsets[0] = tmp_offsets[0]; /* uprobe_multi_func_1 */ + offsets[1] = tmp_offsets[1]; /* uprobe_multi_func_2 */ + offsets[2] = tmp_offsets[1]; /* uprobe_multi_func_2 */ + + ref_ctr_offsets[0] = tmp_ref_ctr_offsets[0]; /* sema_1 */ + ref_ctr_offsets[1] = tmp_ref_ctr_offsets[1]; /* sema_2 */ + ref_ctr_offsets[2] = tmp_ref_ctr_offsets[0]; /* sema_1, error */ + + opts.uprobe_multi.path = path; + opts.uprobe_multi.offsets = (const unsigned long *) &offsets; + opts.uprobe_multi.ref_ctr_offsets = (const unsigned long *) &ref_ctr_offsets; + opts.uprobe_multi.cnt = 3; + + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_ERR(link_fd, "link_fd")) + close(link_fd); + +cleanup: + free(tmp_ref_ctr_offsets); + free(tmp_offsets); +} + +static void test_attach_uprobe_fails(void) +{ + struct uprobe_multi *skel = NULL; + + skel = uprobe_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi__open_and_load")) + return; + + /* attach fails due to adding uprobe on trap instruction, x86_64 only */ + attach_uprobe_fail_trap(skel); + + /* attach fail due to wrong ref_ctr_offs on one of the uprobes */ + attach_uprobe_fail_refctr(skel); + + uprobe_multi__destroy(skel); +} + static void __test_link_api(struct child *child) { int prog_fd, link1_fd = -1, link2_fd = -1, link3_fd = -1, link4_fd = -1; @@ -703,4 +819,6 @@ void test_uprobe_multi_test(void) test_bench_attach_usdt(); if (test__start_subtest("attach_api_fails")) test_attach_api_fails(); + if (test__start_subtest("attach_uprobe_fails")) + test_attach_uprobe_fails(); } From a5f40d596bff182b4b47547712f540885e8fb17b Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 17:13:29 -0700 Subject: [PATCH 0235/1052] selftests/bpf: Fix error linking uprobe_multi on mips Linking uprobe_multi.c on mips64el fails due to 
relocation overflows, when the GOT entries required exceeds the default maximum. Add a specific CFLAGS (-mxgot) for uprobe_multi.c on MIPS that allows using a larger GOT and avoids errors such as: /tmp/ccBTNQzv.o: in function `bench': uprobe_multi.c:49:(.text+0x1d7720): relocation truncated to fit: R_MIPS_GOT_DISP against `uprobe_multi_func_08188' uprobe_multi.c:49:(.text+0x1d7730): relocation truncated to fit: R_MIPS_GOT_DISP against `uprobe_multi_func_08189' ... collect2: error: ld returned 1 exit status Fixes: 519dfeaf5119 ("selftests/bpf: Add uprobe_multi test program") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/14eb7b70f8ccef9834874d75eb373cb9292129da.1721692479.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a7c797dd075f..8e01e4f9b1a7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -786,6 +786,8 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ +# Linking uprobe_multi can fail due to relocation overflows on mips. +$(OUTPUT)/uprobe_multi: CFLAGS += $(if $(filter mips, $(ARCH)),-mxgot) $(OUTPUT)/uprobe_multi: uprobe_multi.c $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@ From 98adc743ae1febb401d635bebe46667af2692751 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 22 Jul 2024 22:27:58 +0200 Subject: [PATCH 0236/1052] selftests/bpf: Add uprobe multi consumers test Adding test that attaches/detaches multiple consumers on single uprobe and verifies all were hit as expected. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240722202758.3889061-3-jolsa@kernel.org --- .../bpf/prog_tests/uprobe_multi_test.c | 213 ++++++++++++++++++ .../bpf/progs/uprobe_multi_consumers.c | 39 ++++ 2 files changed, 252 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index e6255d4df81d..acb62675ff65 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -6,6 +6,7 @@ #include "uprobe_multi.skel.h" #include "uprobe_multi_bench.skel.h" #include "uprobe_multi_usdt.skel.h" +#include "uprobe_multi_consumers.skel.h" #include "bpf/libbpf_internal.h" #include "testing_helpers.h" #include "../sdt.h" @@ -731,6 +732,216 @@ static void test_link_api(void) __test_link_api(child); } +static struct bpf_program * +get_program(struct uprobe_multi_consumers *skel, int prog) +{ + switch (prog) { + case 0: + return skel->progs.uprobe_0; + case 1: + return skel->progs.uprobe_1; + case 2: + return skel->progs.uprobe_2; + case 3: + return skel->progs.uprobe_3; + default: + ASSERT_FAIL("get_program"); + return NULL; + } +} + +static struct bpf_link ** +get_link(struct uprobe_multi_consumers *skel, int link) +{ + switch (link) { + case 0: + return &skel->links.uprobe_0; + case 1: + return &skel->links.uprobe_1; + case 2: + return &skel->links.uprobe_2; + case 3: + return &skel->links.uprobe_3; + default: + ASSERT_FAIL("get_link"); + return NULL; + } +} + +static int uprobe_attach(struct uprobe_multi_consumers *skel, int idx) +{ + struct bpf_program *prog = get_program(skel, 
idx); + struct bpf_link **link = get_link(skel, idx); + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + + if (!prog || !link) + return -1; + + /* + * bit/prog: 0,1 uprobe entry + * bit/prog: 2,3 uprobe return + */ + opts.retprobe = idx == 2 || idx == 3; + + *link = bpf_program__attach_uprobe_multi(prog, 0, "/proc/self/exe", + "uprobe_consumer_test", + &opts); + if (!ASSERT_OK_PTR(*link, "bpf_program__attach_uprobe_multi")) + return -1; + return 0; +} + +static void uprobe_detach(struct uprobe_multi_consumers *skel, int idx) +{ + struct bpf_link **link = get_link(skel, idx); + + bpf_link__destroy(*link); + *link = NULL; +} + +static bool test_bit(int bit, unsigned long val) +{ + return val & (1 << bit); +} + +noinline int +uprobe_consumer_test(struct uprobe_multi_consumers *skel, + unsigned long before, unsigned long after) +{ + int idx; + + /* detach uprobe for each unset programs in 'before' state ... */ + for (idx = 0; idx < 4; idx++) { + if (test_bit(idx, before) && !test_bit(idx, after)) + uprobe_detach(skel, idx); + } + + /* ... and attach all new programs in 'after' state */ + for (idx = 0; idx < 4; idx++) { + if (!test_bit(idx, before) && test_bit(idx, after)) { + if (!ASSERT_OK(uprobe_attach(skel, idx), "uprobe_attach_after")) + return -1; + } + } + return 0; +} + +static void consumer_test(struct uprobe_multi_consumers *skel, + unsigned long before, unsigned long after) +{ + int err, idx; + + printf("consumer_test before %lu after %lu\n", before, after); + + /* 'before' is each, we attach uprobe for every set idx */ + for (idx = 0; idx < 4; idx++) { + if (test_bit(idx, before)) { + if (!ASSERT_OK(uprobe_attach(skel, idx), "uprobe_attach_before")) + goto cleanup; + } + } + + err = uprobe_consumer_test(skel, before, after); + if (!ASSERT_EQ(err, 0, "uprobe_consumer_test")) + goto cleanup; + + for (idx = 0; idx < 4; idx++) { + const char *fmt = "BUG"; + __u64 val = 0; + + if (idx < 2) { + /* + * uprobe entry + * +1 if define in 'before' + */ + if (test_bit(idx, before)) + val++; + fmt = "prog 0/1: uprobe"; + } else { + /* + * uprobe return is tricky ;-) + * + * to trigger uretprobe consumer, the uretprobe needs to be installed, + * which means one of the 'return' uprobes was alive when probe was hit: + * + * idxs: 2/3 uprobe return in 'installed' mask + * + * in addition if 'after' state removes everything that was installed in + * 'before' state, then uprobe kernel object goes away and return uprobe + * is not installed and we won't hit it even if it's in 'after' state. + */ + unsigned long had_uretprobes = before & 0b1100; /* is uretprobe installed */ + unsigned long probe_preserved = before & after; /* did uprobe go away */ + + if (had_uretprobes && probe_preserved && test_bit(idx, after)) + val++; + fmt = "idx 2/3: uretprobe"; + } + + ASSERT_EQ(skel->bss->uprobe_result[idx], val, fmt); + skel->bss->uprobe_result[idx] = 0; + } + +cleanup: + for (idx = 0; idx < 4; idx++) + uprobe_detach(skel, idx); +} + +static void test_consumers(void) +{ + struct uprobe_multi_consumers *skel; + int before, after; + + skel = uprobe_multi_consumers__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi_consumers__open_and_load")) + return; + + /* + * The idea of this test is to try all possible combinations of + * uprobes consumers attached on single function. + * + * - 2 uprobe entry consumer + * - 2 uprobe exit consumers + * + * The test uses 4 uprobes attached on single function, but that + * translates into single uprobe with 4 consumers in kernel. 
+ * + * The before/after values present the state of attached consumers + * before and after the probed function: + * + * bit/prog 0,1 : uprobe entry + * bit/prog 2,3 : uprobe return + * + * For example for: + * + * before = 0b0101 + * after = 0b0110 + * + * it means that before we call 'uprobe_consumer_test' we attach + * uprobes defined in 'before' value: + * + * - bit/prog 0: uprobe entry + * - bit/prog 2: uprobe return + * + * uprobe_consumer_test is called and inside it we attach and detach + * uprobes based on 'after' value: + * + * - bit/prog 0: stays untouched + * - bit/prog 2: uprobe return is detached + * + * uprobe_consumer_test returns and we check counters values increased + * by bpf programs on each uprobe to match the expected count based on + * before/after bits. + */ + + for (before = 0; before < 16; before++) { + for (after = 0; after < 16; after++) + consumer_test(skel, before, after); + } + + uprobe_multi_consumers__destroy(skel); +} + static void test_bench_attach_uprobe(void) { long attach_start_ns = 0, attach_end_ns = 0; @@ -821,4 +1032,6 @@ void test_uprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_uprobe_fails")) test_attach_uprobe_fails(); + if (test__start_subtest("consumers")) + test_consumers(); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c b/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c new file mode 100644 index 000000000000..7e0fdcbbd242 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "bpf_kfuncs.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +__u64 uprobe_result[4]; + +SEC("uprobe.multi") +int uprobe_0(struct pt_regs *ctx) +{ + uprobe_result[0]++; + return 0; +} + +SEC("uprobe.multi") +int uprobe_1(struct pt_regs *ctx) +{ + uprobe_result[1]++; + return 0; +} + +SEC("uprobe.multi") +int uprobe_2(struct pt_regs *ctx) +{ + uprobe_result[2]++; + return 0; +} + +SEC("uprobe.multi") +int uprobe_3(struct pt_regs *ctx) +{ + uprobe_result[3]++; + return 0; +} From d17f9b370df628a3faaec7fd8be92a89ecc3e33a Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Tue, 23 Jul 2024 09:10:31 +0200 Subject: [PATCH 0237/1052] selftests/bpf: Fix compilation failure when CONFIG_NET_FOU!=y Without CONFIG_NET_FOU bpf selftests are unable to build because of missing definitions. Add ___local versions of struct bpf_fou_encap and enum bpf_fou_encap_type to fix the issue. 
Signed-off-by: Artem Savkov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240723071031.3389423-1-asavkov@redhat.com --- .../selftests/bpf/progs/test_tunnel_kern.c | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 3f5abcf3ff13..32127f1cd687 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -26,6 +26,18 @@ */ #define ASSIGNED_ADDR_VETH1 0xac1001c8 +struct bpf_fou_encap___local { + __be16 sport; + __be16 dport; +} __attribute__((preserve_access_index)); + +enum bpf_fou_encap_type___local { + FOU_BPF_ENCAP_FOU___local, + FOU_BPF_ENCAP_GUE___local, +}; + +struct bpf_fou_encap; + int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, struct bpf_fou_encap *encap, int type) __ksym; int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, @@ -745,7 +757,7 @@ SEC("tc") int ipip_gue_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key = {}; - struct bpf_fou_encap encap = {}; + struct bpf_fou_encap___local encap = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; void *data_end = (void *)(long)skb->data_end; @@ -769,7 +781,9 @@ int ipip_gue_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, &encap, FOU_BPF_ENCAP_GUE); + ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + bpf_core_enum_value(enum bpf_fou_encap_type___local, + FOU_BPF_ENCAP_GUE___local)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -782,7 +796,7 @@ SEC("tc") int ipip_fou_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key = {}; - struct bpf_fou_encap encap = {}; + struct bpf_fou_encap___local encap = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; void *data_end = (void *)(long)skb->data_end; @@ -806,7 +820,8 @@ int ipip_fou_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, &encap, FOU_BPF_ENCAP_FOU); + ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + FOU_BPF_ENCAP_FOU___local); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -820,7 +835,7 @@ int ipip_encap_get_tunnel(struct __sk_buff *skb) { int ret; struct bpf_tunnel_key key = {}; - struct bpf_fou_encap encap = {}; + struct bpf_fou_encap___local encap = {}; ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); if (ret < 0) { @@ -828,7 +843,7 @@ int ipip_encap_get_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - ret = bpf_skb_get_fou_encap(skb, &encap); + ret = bpf_skb_get_fou_encap(skb, (struct bpf_fou_encap *)&encap); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; From 3ece93a4087b2db7b99ebb2412bd60cf26bbbb51 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Fri, 19 Jul 2024 22:25:35 -0700 Subject: [PATCH 0238/1052] selftests/bpf: Fix wrong binary in Makefile log output Make log output incorrectly shows 'test_maps' as the binary name for every 'CLNG-BPF' build step, apparently picking up the last value defined for the $(TRUNNER_BINARY) variable. Update the 'CLANG_BPF_BUILD_RULE' variants to fix this confusing output. Current output: CLNG-BPF [test_maps] access_map_in_map.bpf.o GEN-SKEL [test_progs] access_map_in_map.skel.h ... CLNG-BPF [test_maps] access_map_in_map.bpf.o GEN-SKEL [test_progs-no_alu32] access_map_in_map.skel.h ... 
CLNG-BPF [test_maps] access_map_in_map.bpf.o GEN-SKEL [test_progs-cpuv4] access_map_in_map.skel.h After fix: CLNG-BPF [test_progs] access_map_in_map.bpf.o GEN-SKEL [test_progs] access_map_in_map.skel.h ... CLNG-BPF [test_progs-no_alu32] access_map_in_map.bpf.o GEN-SKEL [test_progs-no_alu32] access_map_in_map.skel.h ... CLNG-BPF [test_progs-cpuv4] access_map_in_map.bpf.o GEN-SKEL [test_progs-cpuv4] access_map_in_map.skel.h Fixes: a5d0c26a2784 ("selftests/bpf: Add a cpuv4 test runner for cpu=v4 testing") Fixes: 89ad7420b25c ("selftests/bpf: Drop the need for LLVM's llc") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240720052535.2185967-1-tony.ambardar@gmail.com --- tools/testing/selftests/bpf/Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8e01e4f9b1a7..3318d5425d75 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -428,23 +428,24 @@ $(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h # $1 - input .c file # $2 - output .o file # $3 - CFLAGS +# $4 - binary name define CLANG_BPF_BUILD_RULE - $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$4,$2) $(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v3 -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE - $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$4,$2) $(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v2 -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with cpu-v4 define CLANG_CPUV4_BPF_BUILD_RULE - $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$4,$2) $(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v4 -o $2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE - $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) + $(call msg,GCC-BPF,$4,$2) $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 endef @@ -537,7 +538,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ $$($$<-CFLAGS) \ - $$($$<-$2-CFLAGS)) + $$($$<-$2-CFLAGS),$(TRUNNER_BINARY)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) From f86601c3661946721e8f260bdd812b759854ac22 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 17:30:45 -0700 Subject: [PATCH 0239/1052] tools/runqslower: Fix LDFLAGS and add LDLIBS support Actually use previously defined LDFLAGS during build and add support for LDLIBS to link extra standalone libraries e.g. 'argp' which is not provided by musl libc. 
Fixes: 585bf4640ebe ("tools: runqslower: Add EXTRA_CFLAGS and EXTRA_LDFLAGS support") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/20240723003045.2273499-1-tony.ambardar@gmail.com --- tools/bpf/runqslower/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index d8288936c912..c4f1f1735af6 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -15,6 +15,7 @@ INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../include/uapi) CFLAGS := -g -Wall $(CLANG_CROSS_FLAGS) CFLAGS += $(EXTRA_CFLAGS) LDFLAGS += $(EXTRA_LDFLAGS) +LDLIBS += -lelf -lz # Try to detect best kernel BTF source KERNEL_REL := $(shell uname -r) @@ -51,7 +52,7 @@ clean: libbpf_hdrs: $(BPFOBJ) $(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) - $(QUIET_LINK)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ $(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ $(OUTPUT)/runqslower.bpf.o | libbpf_hdrs From 45cbc7a5e004cf08528ef83633c62120cca3a5ee Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:35 -0700 Subject: [PATCH 0240/1052] bpf: add a get_helper_proto() utility function Extract the part of check_helper_call() as a utility function allowing to query 'struct bpf_func_proto' for a specific helper function id. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/verifier.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4759c0ba192b..3ed67452f8c6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10369,6 +10369,19 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno state->callback_subprogno == subprogno); } +static int get_helper_proto(struct bpf_verifier_env *env, int func_id, + const struct bpf_func_proto **ptr) +{ + if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) + return -ERANGE; + + if (!env->ops->get_func_proto) + return -EINVAL; + + *ptr = env->ops->get_func_proto(func_id, env->prog); + return *ptr ? 
0 : -EINVAL; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -10385,18 +10398,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* find function prototype */ func_id = insn->imm; - if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose(env, "invalid func %s#%d\n", func_id_name(func_id), - func_id); + err = get_helper_proto(env, insn->imm, &fn); + if (err == -ERANGE) { + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } - if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id, env->prog); - if (!fn) { + if (err) { verbose(env, "program of this type cannot use helper %s#%d\n", func_id_name(func_id), func_id); - return -EINVAL; + return err; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ From 92de36080c93296ef9005690705cba260b9bd68a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 08:34:39 -0700 Subject: [PATCH 0241/1052] bpf: Fail verification for sign-extension of packet data/data_end/data_meta syzbot reported a kernel crash due to commit 1f1e864b6555 ("bpf: Handle sign-extenstin ctx member accesses"). The reason is due to sign-extension of 32-bit load for packet data/data_end/data_meta uapi field. The original code looks like: r2 = *(s32 *)(r1 + 76) /* load __sk_buff->data */ r3 = *(u32 *)(r1 + 80) /* load __sk_buff->data_end */ r0 = r2 r0 += 8 if r3 > r0 goto +1 ... Note that __sk_buff->data load has 32-bit sign extension. After verification and convert_ctx_accesses(), the final asm code looks like: r2 = *(u64 *)(r1 +208) r2 = (s32)r2 r3 = *(u64 *)(r1 +80) r0 = r2 r0 += 8 if r3 > r0 goto pc+1 ... Note that 'r2 = (s32)r2' may make the kernel __sk_buff->data address invalid which may cause runtime failure. Currently, in C code, typically we have void *data = (void *)(long)skb->data; void *data_end = (void *)(long)skb->data_end; ... and it will generate r2 = *(u64 *)(r1 +208) r3 = *(u64 *)(r1 +80) r0 = r2 r0 += 8 if r3 > r0 goto pc+1 If we allow sign-extension, void *data = (void *)(long)(int)skb->data; void *data_end = (void *)(long)skb->data_end; ... the generated code looks like r2 = *(u64 *)(r1 +208) r2 <<= 32 r2 s>>= 32 r3 = *(u64 *)(r1 +80) r0 = r2 r0 += 8 if r3 > r0 goto pc+1 and this will cause verification failure since "r2 <<= 32" is not allowed as "r2" is a packet pointer. To fix this issue for case r2 = *(s32 *)(r1 + 76) /* load __sk_buff->data */ this patch added additional checking in is_valid_access() callback function for packet data/data_end/data_meta access. If those accesses are with sign-extenstion, the verification will fail. 
[1] https://lore.kernel.org/bpf/000000000000c90eee061d236d37@google.com/ Reported-by: syzbot+ad9ec60c8eaf69e6f99c@syzkaller.appspotmail.com Fixes: 1f1e864b6555 ("bpf: Handle sign-extenstin ctx member accesses") Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723153439.2429035-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 5 +++-- net/core/filter.c | 21 ++++++++++++++++----- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f16d0753f518..f560ea0c2b36 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -920,6 +920,7 @@ static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); */ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; + bool is_ldsx; union { int ctx_field_size; struct { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c419005a29dc..2ab59db5837f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5626,12 +5626,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id, bool *is_retval) + struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, .log = &env->log, .is_retval = false, + .is_ldsx = is_ldsx, }; if (env->ops->is_valid_access && @@ -6945,7 +6946,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, - &btf_id, &is_retval); + &btf_id, &is_retval, is_ldsx); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { diff --git a/net/core/filter.c b/net/core/filter.c index f3c72cf86099..78a6f746ea0b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8579,13 +8579,16 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, data_end): + if (info->is_ldsx || size != size_default) + return false; + break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): - case bpf_ctx_range(struct __sk_buff, data): - case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; break; @@ -9029,6 +9032,14 @@ static bool xdp_is_valid_access(int off, int size, } } return false; + } else { + switch (off) { + case offsetof(struct xdp_md, data_meta): + case offsetof(struct xdp_md, data): + case offsetof(struct xdp_md, data_end): + if (info->is_ldsx) + return false; + } } switch (off) { @@ -9354,12 +9365,12 @@ static bool flow_dissector_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, data): - if (size != size_default) + if (info->is_ldsx || size != size_default) return false; info->reg_type = 
PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): - if (size != size_default) + if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; From 5b5f51bff1b66cedb62b5ba74a1878341204e057 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:36 -0700 Subject: [PATCH 0242/1052] bpf: no_caller_saved_registers attribute for helper calls GCC and LLVM define a no_caller_saved_registers function attribute. This attribute means that function scratches only some of the caller saved registers defined by ABI. For BPF the set of such registers could be defined as follows: - R0 is scratched only if function is non-void; - R1-R5 are scratched only if corresponding parameter type is defined in the function prototype. This commit introduces flag bpf_func_prot->allow_nocsr. If this flag is set for some helper function, verifier assumes that it follows no_caller_saved_registers calling convention. The contract between kernel and clang allows to simultaneously use such functions and maintain backwards compatibility with old kernels that don't understand no_caller_saved_registers calls (nocsr for short): - clang generates a simple pattern for nocsr calls, e.g.: r1 = 1; r2 = 2; *(u64 *)(r10 - 8) = r1; *(u64 *)(r10 - 16) = r2; call %[to_be_inlined] r2 = *(u64 *)(r10 - 16); r1 = *(u64 *)(r10 - 8); r0 = r1; r0 += r2; exit; - kernel removes unnecessary spills and fills, if called function is inlined by verifier or current JIT (with assumption that patch inserted by verifier or JIT honors nocsr contract, e.g. does not scratch r3-r5 for the example above), e.g. the code above would be transformed to: r1 = 1; r2 = 2; call %[to_be_inlined] r0 = r1; r0 += r2; exit; Technically, the transformation is split into the following phases: - function mark_nocsr_patterns(), called from bpf_check() searches and marks potential patterns in instruction auxiliary data; - upon stack read or write access, function check_nocsr_stack_contract() is used to verify if stack offsets, presumably reserved for nocsr patterns, are used only from those patterns; - function remove_nocsr_spills_fills(), called from bpf_check(), applies the rewrite for valid patterns. See comment in mark_nocsr_pattern_for_call() for more details. 
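On the kernel side, opting a helper into this contract is a single flag in its bpf_func_proto; a later patch in this series ("bpf, x86, riscv, arm: no_caller_saved_registers for bpf_get_smp_processor_id()") does exactly that for bpf_get_smp_processor_id(). The shape of such a change, shown here with a hypothetical helper rather than a real one:

  const struct bpf_func_proto bpf_example_helper_proto = {
          .func        = bpf_example_helper,
          .gpl_only    = false,
          .ret_type    = RET_INTEGER,
          /* inlined replacement clobbers only R0, honoring the nocsr contract */
          .allow_nocsr = true,
  };

The flag alone does not change code generation: spill/fill pairs are only removed when verifier_inlines_helper_call() or bpf_jit_inlines_helper_call() confirms the call is actually inlined.
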
Suggested-by: Alexei Starovoitov Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 6 + include/linux/bpf_verifier.h | 14 ++ kernel/bpf/verifier.c | 317 ++++++++++++++++++++++++++++++++++- 3 files changed, 334 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f560ea0c2b36..b9425e410bcb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -808,6 +808,12 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; bool might_sleep; + /* set to true if helper follows contract for gcc/llvm + * attribute no_caller_saved_registers: + * - void functions do not scratch r0 + * - functions taking N arguments scratch only registers r1-rN + */ + bool allow_nocsr; enum bpf_return_type ret_type; union { struct { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 731a0a4ac13c..5cea15c81b8a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -576,6 +576,14 @@ struct bpf_insn_aux_data { bool is_iter_next; /* bpf_iter__next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ + /* true if STX or LDX instruction is a part of a spill/fill + * pattern for a no_caller_saved_registers call. + */ + u8 nocsr_pattern:1; + /* for CALL instructions, a number of spill/fill pairs in the + * no_caller_saved_registers pattern. + */ + u8 nocsr_spills_num:3; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ @@ -645,6 +653,10 @@ struct bpf_subprog_info { u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; + /* offsets in range [stack_depth .. nocsr_stack_off) + * are used for no_caller_saved_registers spills and fills. + */ + s16 nocsr_stack_off; bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; @@ -652,6 +664,8 @@ struct bpf_subprog_info { bool is_async_cb: 1; bool is_exception_cb: 1; bool args_cached: 1; + /* true if nocsr stack region is used by functions that can't be inlined */ + bool keep_nocsr_stack: 1; u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3ed67452f8c6..7587336967cc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4579,6 +4579,31 @@ static int get_reg_width(struct bpf_reg_state *reg) return fls64(reg->umax_value); } +/* See comment for mark_nocsr_pattern_for_call() */ +static void check_nocsr_stack_contract(struct bpf_verifier_env *env, struct bpf_func_state *state, + int insn_idx, int off) +{ + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + int i; + + if (subprog->nocsr_stack_off <= off || aux[insn_idx].nocsr_pattern) + return; + /* access to the region [max_stack_depth .. nocsr_stack_off) + * from something that is not a part of the nocsr pattern, + * disable nocsr rewrites for current subprogram by setting + * nocsr_stack_off to a value smaller than any possible offset. 
+ */ + subprog->nocsr_stack_off = S16_MIN; + /* reset nocsr aux flags within subprogram, + * happens at most once per subprogram + */ + for (i = subprog->start; i < (subprog + 1)->start; ++i) { + aux[i].nocsr_spills_num = 0; + aux[i].nocsr_pattern = 0; + } +} + /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -4627,6 +4652,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, if (err) return err; + check_nocsr_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { bool reg_value_fits; @@ -4761,6 +4787,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, return err; } + check_nocsr_stack_contract(env, state, insn_idx, min_off); /* Variable offset writes destroy any spilled pointers in range. */ for (i = min_off; i < max_off; i++) { u8 new_type, *stype; @@ -4899,6 +4926,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, reg = ®_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); + check_nocsr_stack_contract(env, state, env->insn_idx, off); if (is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; @@ -5059,6 +5087,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, min_off = reg->smin_value + off; max_off = reg->smax_value + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + check_nocsr_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; } @@ -6772,10 +6801,20 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_func_state *state, enum bpf_access_type t) { - int min_valid_off; + struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; + int min_valid_off, max_bpf_stack; + + /* If accessing instruction is a spill/fill from nocsr pattern, + * add room for all caller saved registers below MAX_BPF_STACK. + * In case if nocsr rewrite won't happen maximal stack depth + * would be checked by check_max_stack_depth_subprog(). + */ + max_bpf_stack = MAX_BPF_STACK; + if (aux->nocsr_pattern) + max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; if (t == BPF_WRITE || env->allow_uninit_stack) - min_valid_off = -MAX_BPF_STACK; + min_valid_off = -max_bpf_stack; else min_valid_off = -state->allocated_stack; @@ -16061,6 +16100,232 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, return ret; } +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Return a bitmask specifying which caller saved registers are + * clobbered by a call to a helper *as if* this helper follows + * no_caller_saved_registers contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter has is described + * in the function prototype. + */ +static u32 helper_nocsr_clobber_mask(const struct bpf_func_proto *fn) +{ + u8 mask; + int i; + + mask = 0; + if (fn->ret_type != RET_VOID) + mask |= BIT(BPF_REG_0); + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) + if (fn->arg_type[i] != ARG_DONTCARE) + mask |= BIT(BPF_REG_1 + i); + return mask; +} + +/* True if do_misc_fixups() replaces calls to helper number 'imm', + * replacement patch is presumed to follow no_caller_saved_registers contract + * (see mark_nocsr_pattern_for_call() below). 
+ */ +static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) +{ + return false; +} + +/* GCC and LLVM define a no_caller_saved_registers function attribute. + * This attribute means that function scratches only some of + * the caller saved registers defined by ABI. + * For BPF the set of such registers could be defined as follows: + * - R0 is scratched only if function is non-void; + * - R1-R5 are scratched only if corresponding parameter type is defined + * in the function prototype. + * + * The contract between kernel and clang allows to simultaneously use + * such functions and maintain backwards compatibility with old + * kernels that don't understand no_caller_saved_registers calls + * (nocsr for short): + * + * - for nocsr calls clang allocates registers as-if relevant r0-r5 + * registers are not scratched by the call; + * + * - as a post-processing step, clang visits each nocsr call and adds + * spill/fill for every live r0-r5; + * + * - stack offsets used for the spill/fill are allocated as lowest + * stack offsets in whole function and are not used for any other + * purposes; + * + * - when kernel loads a program, it looks for such patterns + * (nocsr function surrounded by spills/fills) and checks if + * spill/fill stack offsets are used exclusively in nocsr patterns; + * + * - if so, and if verifier or current JIT inlines the call to the + * nocsr function (e.g. a helper call), kernel removes unnecessary + * spill/fill pairs; + * + * - when old kernel loads a program, presence of spill/fill pairs + * keeps BPF program valid, albeit slightly less efficient. + * + * For example: + * + * r1 = 1; + * r2 = 2; + * *(u64 *)(r10 - 8) = r1; r1 = 1; + * *(u64 *)(r10 - 16) = r2; r2 = 2; + * call %[to_be_inlined] --> call %[to_be_inlined] + * r2 = *(u64 *)(r10 - 16); r0 = r1; + * r1 = *(u64 *)(r10 - 8); r0 += r2; + * r0 = r1; exit; + * r0 += r2; + * exit; + * + * The purpose of mark_nocsr_pattern_for_call is to: + * - look for such patterns; + * - mark spill and fill instructions in env->insn_aux_data[*].nocsr_pattern; + * - mark set env->insn_aux_data[*].nocsr_spills_num for call instruction; + * - update env->subprog_info[*]->nocsr_stack_off to find an offset + * at which nocsr spill/fill stack slots start; + * - update env->subprog_info[*]->keep_nocsr_stack. + * + * The .nocsr_pattern and .nocsr_stack_off are used by + * check_nocsr_stack_contract() to check if every stack access to + * nocsr spill/fill stack slot originates from spill/fill + * instructions, members of nocsr patterns. + * + * If such condition holds true for a subprogram, nocsr patterns could + * be rewritten by remove_nocsr_spills_fills(). + * Otherwise nocsr patterns are not changed in the subprogram + * (code, presumably, generated by an older clang version). + * + * For example, it is *not* safe to remove spill/fill below: + * + * r1 = 1; + * *(u64 *)(r10 - 8) = r1; r1 = 1; + * call %[to_be_inlined] --> call %[to_be_inlined] + * r1 = *(u64 *)(r10 - 8); r0 = *(u64 *)(r10 - 8); <---- wrong !!! 
+ * r0 = *(u64 *)(r10 - 8); r0 += r1; + * r0 += r1; exit; + * exit; + */ +static void mark_nocsr_pattern_for_call(struct bpf_verifier_env *env, + struct bpf_subprog_info *subprog, + int insn_idx, s16 lowest_off) +{ + struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; + struct bpf_insn *call = &env->prog->insnsi[insn_idx]; + const struct bpf_func_proto *fn; + u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 expected_regs_mask; + bool can_be_inlined = false; + s16 off; + int i; + + if (bpf_helper_call(call)) { + if (get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return; + clobbered_regs_mask = helper_nocsr_clobber_mask(fn); + can_be_inlined = fn->allow_nocsr && + (verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + } + + if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + return; + + /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ + expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; + + /* match pairs of form: + * + * *(u64 *)(r10 - Y) = rX (where Y % 8 == 0) + * ... + * call %[to_be_inlined] + * ... + * rX = *(u64 *)(r10 - Y) + */ + for (i = 1, off = lowest_off; i <= ARRAY_SIZE(caller_saved); ++i, off += BPF_REG_SIZE) { + if (insn_idx - i < 0 || insn_idx + i >= env->prog->len) + break; + stx = &insns[insn_idx - i]; + ldx = &insns[insn_idx + i]; + /* must be a stack spill/fill pair */ + if (stx->code != (BPF_STX | BPF_MEM | BPF_DW) || + ldx->code != (BPF_LDX | BPF_MEM | BPF_DW) || + stx->dst_reg != BPF_REG_10 || + ldx->src_reg != BPF_REG_10) + break; + /* must be a spill/fill for the same reg */ + if (stx->src_reg != ldx->dst_reg) + break; + /* must be one of the previously unseen registers */ + if ((BIT(stx->src_reg) & expected_regs_mask) == 0) + break; + /* must be a spill/fill for the same expected offset, + * no need to check offset alignment, BPF_DW stack access + * is always 8-byte aligned. + */ + if (stx->off != off || ldx->off != off) + break; + expected_regs_mask &= ~BIT(stx->src_reg); + env->insn_aux_data[insn_idx - i].nocsr_pattern = 1; + env->insn_aux_data[insn_idx + i].nocsr_pattern = 1; + } + if (i == 1) + return; + + /* Conditionally set 'nocsr_spills_num' to allow forward + * compatibility when more helper functions are marked as + * nocsr at compile time than current kernel supports, e.g: + * + * 1: *(u64 *)(r10 - 8) = r1 + * 2: call A ;; assume A is nocsr for current kernel + * 3: r1 = *(u64 *)(r10 - 8) + * 4: *(u64 *)(r10 - 8) = r1 + * 5: call B ;; assume B is not nocsr for current kernel + * 6: r1 = *(u64 *)(r10 - 8) + * + * There is no need to block nocsr rewrite for such program. + * Set 'nocsr_pattern' for both calls to keep check_nocsr_stack_contract() happy, + * don't set 'nocsr_spills_num' for call B so that remove_nocsr_spills_fills() + * does not remove spill/fill pair {4,6}. 
+ */ + if (can_be_inlined) + env->insn_aux_data[insn_idx].nocsr_spills_num = i - 1; + else + subprog->keep_nocsr_stack = 1; + subprog->nocsr_stack_off = min(subprog->nocsr_stack_off, off); +} + +static int mark_nocsr_patterns(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn *insn; + s16 lowest_off; + int s, i; + + for (s = 0; s < env->subprog_cnt; ++s, ++subprog) { + /* find lowest stack spill offset used in this subprog */ + lowest_off = 0; + for (i = subprog->start; i < (subprog + 1)->start; ++i) { + insn = env->prog->insnsi + i; + if (insn->code != (BPF_STX | BPF_MEM | BPF_DW) || + insn->dst_reg != BPF_REG_10) + continue; + lowest_off = min(lowest_off, insn->off); + } + /* use this offset to find nocsr patterns */ + for (i = subprog->start; i < (subprog + 1)->start; ++i) { + insn = env->prog->insnsi + i; + if (insn->code != (BPF_JMP | BPF_CALL)) + continue; + mark_nocsr_pattern_for_call(env, subprog, i, lowest_off); + } + } + return 0; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -19209,9 +19474,11 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env) return 0; } +static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + static int opt_remove_nops(struct bpf_verifier_env *env) { - const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + const struct bpf_insn ja = NOP; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; int i, err; @@ -20957,6 +21224,40 @@ static int optimize_bpf_loop(struct bpf_verifier_env *env) return 0; } +/* Remove unnecessary spill/fill pairs, members of nocsr pattern, + * adjust subprograms stack depth when possible. + */ +static int remove_nocsr_spills_fills(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u32 spills_num; + bool modified = false; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (aux[i].nocsr_spills_num > 0) { + spills_num = aux[i].nocsr_spills_num; + /* NOPs would be removed by opt_remove_nops() */ + for (j = 1; j <= spills_num; ++j) { + *(insn - j) = NOP; + *(insn + j) = NOP; + } + modified = true; + } + if ((subprog + 1)->start == i + 1) { + if (modified && !subprog->keep_nocsr_stack) + subprog->stack_depth = -subprog->nocsr_stack_off; + subprog++; + modified = false; + } + } + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -21871,6 +22172,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = mark_nocsr_patterns(env); + if (ret < 0) + goto skip_full_check; + ret = do_check_main(env); ret = ret ?: do_check_subprogs(env); @@ -21880,6 +22185,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 skip_full_check: kvfree(env->explored_states); + /* might decrease stack depth, keep it before passes that + * allocate additional slots. 
+ */ + if (ret == 0) + ret = remove_nocsr_spills_fills(env); + if (ret == 0) ret = check_max_stack_depth(env); From 63a9936b45859b24780c86c32a32fc7b087c304e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 08:34:44 -0700 Subject: [PATCH 0243/1052] selftests/bpf: Add tests for ldsx of pkt data/data_end/data_meta accesses The following tests are added to verifier_ldsx.c: - sign extension of data/data_end/data_meta for tcx programs. The actual checking is in bpf_skb_is_valid_access() which is called by sk_filter, cg_skb, lwt, tc(tcx) and sk_skb. - sign extension of data/data_end/data_meta for xdp programs. - sign extension of data/data_end for flow_dissector programs. All newly-added tests have verification failure with message "invalid bpf_context access". Without previous patch, all these tests succeeded verification. Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723153444.2430365-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/progs/verifier_ldsx.c | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index d4427d8e1217..52edee41caf6 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -144,6 +144,118 @@ __naked void ldsx_s32_range(void) : __clobber_all); } +SEC("xdp") +__description("LDSX, xdp s32 xdp_md->data") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_1(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[xdp_md_data]);" + "r0 = 0;" + "exit;" + : + : __imm_const(xdp_md_data, offsetof(struct xdp_md, data)) + : __clobber_all); +} + +SEC("xdp") +__description("LDSX, xdp s32 xdp_md->data_end") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_2(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[xdp_md_data_end]);" + "r0 = 0;" + "exit;" + : + : __imm_const(xdp_md_data_end, offsetof(struct xdp_md, data_end)) + : __clobber_all); +} + +SEC("xdp") +__description("LDSX, xdp s32 xdp_md->data_meta") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_3(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[xdp_md_data_meta]);" + "r0 = 0;" + "exit;" + : + : __imm_const(xdp_md_data_meta, offsetof(struct xdp_md, data_meta)) + : __clobber_all); +} + +SEC("tcx/ingress") +__description("LDSX, tcx s32 __sk_buff->data") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_4(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data, offsetof(struct __sk_buff, data)) + : __clobber_all); +} + +SEC("tcx/ingress") +__description("LDSX, tcx s32 __sk_buff->data_end") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_5(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data_end]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data_end, offsetof(struct __sk_buff, data_end)) + : __clobber_all); +} + +SEC("tcx/ingress") +__description("LDSX, tcx s32 __sk_buff->data_meta") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_6(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data_meta]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data_meta, offsetof(struct __sk_buff, data_meta)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("LDSX, flow_dissector s32 __sk_buff->data") +__failure 
__msg("invalid bpf_context access") +__naked void ldsx_ctx_7(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data, offsetof(struct __sk_buff, data)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("LDSX, flow_dissector s32 __sk_buff->data_end") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_8(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data_end]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data_end, offsetof(struct __sk_buff, data_end)) + : __clobber_all); +} + #else SEC("socket") From 91b7fbf3936f5c27d1673264dc24a713290e2165 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:37 -0700 Subject: [PATCH 0244/1052] bpf, x86, riscv, arm: no_caller_saved_registers for bpf_get_smp_processor_id() The function bpf_get_smp_processor_id() is processed in a different way, depending on the arch: - on x86 verifier replaces call to bpf_get_smp_processor_id() with a sequence of instructions that modify only r0; - on riscv64 jit replaces call to bpf_get_smp_processor_id() with a sequence of instructions that modify only r0; - on arm64 jit replaces call to bpf_get_smp_processor_id() with a sequence of instructions that modify only r0 and tmp registers. These rewrites satisfy attribute no_caller_saved_registers contract. Allow rewrite of no_caller_saved_registers patterns for bpf_get_smp_processor_id() in order to use this function as a canary for no_caller_saved_registers tests. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/helpers.c | 1 + kernel/bpf/verifier.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b5f0adae8293..d02ae323996b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -158,6 +158,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .func = bpf_get_smp_processor_id, .gpl_only = false, .ret_type = RET_INTEGER, + .allow_nocsr = true, }; BPF_CALL_0(bpf_get_numa_node_id) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7587336967cc..df3be12096cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16130,7 +16130,14 @@ static u32 helper_nocsr_clobber_mask(const struct bpf_func_proto *fn) */ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) { - return false; + switch (imm) { +#ifdef CONFIG_X86_64 + case BPF_FUNC_get_smp_processor_id: + return env->prog->jit_requested && bpf_jit_supports_percpu_insn(); +#endif + default: + return false; + } } /* GCC and LLVM define a no_caller_saved_registers function attribute. @@ -20834,7 +20841,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) /* Implement bpf_get_smp_processor_id() inline. 
*/ if (insn->imm == BPF_FUNC_get_smp_processor_id && - prog->jit_requested && bpf_jit_supports_percpu_insn()) { + verifier_inlines_helper_call(env, insn->imm)) { /* BPF_FUNC_get_smp_processor_id inlining is an * optimization, so if pcpu_hot.cpu_number is ever * changed in some incompatible and hard to support From 9f5469b84577500375b311a98ffa8f4a68b5c77a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 09:29:33 -0700 Subject: [PATCH 0245/1052] bpf: Get better reg range with ldsx and 32bit compare With latest llvm19, the selftest iters/iter_arr_with_actual_elem_count failed with -mcpu=v4. The following are the details: 0: R1=ctx() R10=fp0 ; int iter_arr_with_actual_elem_count(const void *ctx) @ iters.c:1420 0: (b4) w7 = 0 ; R7_w=0 ; int i, n = loop_data.n, sum = 0; @ iters.c:1422 1: (18) r1 = 0xffffc90000191478 ; R1_w=map_value(map=iters.bss,ks=4,vs=1280,off=1144) 3: (61) r6 = *(u32 *)(r1 +128) ; R1_w=map_value(map=iters.bss,ks=4,vs=1280,off=1144) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) ; if (n > ARRAY_SIZE(loop_data.data)) @ iters.c:1424 4: (26) if w6 > 0x20 goto pc+27 ; R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=32,var_off=(0x0; 0x3f)) 5: (bf) r8 = r10 ; R8_w=fp0 R10=fp0 6: (07) r8 += -8 ; R8_w=fp-8 ; bpf_for(i, 0, n) { @ iters.c:1427 7: (bf) r1 = r8 ; R1_w=fp-8 R8_w=fp-8 8: (b4) w2 = 0 ; R2_w=0 9: (bc) w3 = w6 ; R3_w=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=32,var_off=(0x0; 0x3f)) R6_w=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=32,var_off=(0x0; 0x3f)) 10: (85) call bpf_iter_num_new#45179 ; R0=scalar() fp-8=iter_num(ref_id=2,state=active,depth=0) refs=2 11: (bf) r1 = r8 ; R1=fp-8 R8=fp-8 refs=2 12: (85) call bpf_iter_num_next#45181 13: R0=rdonly_mem(id=3,ref_obj_id=2,sz=4) R6=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=32,var_off=(0x0; 0x3f)) R7=0 R8=fp-8 R10=fp0 fp-8=iter_num(ref_id=2,state=active,depth=1) refs=2 ; bpf_for(i, 0, n) { @ iters.c:1427 13: (15) if r0 == 0x0 goto pc+2 ; R0=rdonly_mem(id=3,ref_obj_id=2,sz=4) refs=2 14: (81) r1 = *(s32 *)(r0 +0) ; R0=rdonly_mem(id=3,ref_obj_id=2,sz=4) R1_w=scalar(smin=0xffffffff80000000,smax=0x7fffffff) refs=2 15: (ae) if w1 < w6 goto pc+4 20: R0=rdonly_mem(id=3,ref_obj_id=2,sz=4) R1=scalar(smin=0xffffffff80000000,smax=smax32=umax32=31,umax=0xffffffff0000001f,smin32=0,var_off=(0x0; 0xffffffff0000001f)) R6=scalar(id=1,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=32,var_off=(0x0; 0x3f)) R7=0 R8=fp-8 R10=fp0 fp-8=iter_num(ref_id=2,state=active,depth=1) refs=2 ; sum += loop_data.data[i]; @ iters.c:1429 20: (67) r1 <<= 2 ; R1_w=scalar(smax=0x7ffffffc0000007c,umax=0xfffffffc0000007c,smin32=0,smax32=umax32=124,var_off=(0x0; 0xfffffffc0000007c)) refs=2 21: (18) r2 = 0xffffc90000191478 ; R2_w=map_value(map=iters.bss,ks=4,vs=1280,off=1144) refs=2 23: (0f) r2 += r1 math between map_value pointer and register with unbounded min value is not allowed The source code: int iter_arr_with_actual_elem_count(const void *ctx) { int i, n = loop_data.n, sum = 0; if (n > ARRAY_SIZE(loop_data.data)) return 0; bpf_for(i, 0, n) { /* no rechecking of i against ARRAY_SIZE(loop_data.n) */ sum += loop_data.data[i]; } return sum; } The insn #14 is a sign-extenstion load which is related to 'int i'. The insn #15 did a subreg comparision. Note that smin=0xffffffff80000000 and this caused later insn #23 failed verification due to unbounded min value. Actually insn #15 R1 smin range can be better. 
Before insn #15, we have R1_w=scalar(smin=0xffffffff80000000,smax=0x7fffffff) With the above range, we know for R1, upper 32bit can only be 0xffffffff or 0. Otherwise, the value range for R1 could be beyond [smin=0xffffffff80000000,smax=0x7fffffff]. After insn #15, for the true patch, we know smin32=0 and smax32=32. With the upper 32bit 0xffffffff, then the corresponding value is [0xffffffff00000000, 0xffffffff00000020]. The range is obviously beyond the original range [smin=0xffffffff80000000,smax=0x7fffffff] and the range is not possible. So the upper 32bit must be 0, which implies smin = smin32 and smax = smax32. This patch fixed the issue by adding additional register deduction after 32-bit compare insn. If the signed 32-bit register range is non-negative then 64-bit smin is in range of [S32_MIN, S32_MAX], then the actual 64-bit smin/smax should be the same as 32-bit smin32/smax32. With this patch, iters/iter_arr_with_actual_elem_count succeeded with better register range: from 15 to 20: R0=rdonly_mem(id=7,ref_obj_id=2,sz=4) R1_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=31,var_off=(0x0; 0x1f)) R6=scalar(id=1,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=32,var_off=(0x0; 0x3f)) R7=scalar(id=9,smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R8=scalar(id=9,smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R10=fp0 fp-8=iter_num(ref_id=2,state=active,depth=3) refs=2 Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723162933.2731620-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/verifier.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ab59db5837f..4759c0ba192b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2182,6 +2182,44 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) reg->smin_value = max_t(s64, reg->smin_value, new_smin); reg->smax_value = min_t(s64, reg->smax_value, new_smax); } + + /* Here we would like to handle a special case after sign extending load, + * when upper bits for a 64-bit range are all 1s or all 0s. + * + * Upper bits are all 1s when register is in a range: + * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff] + * Upper bits are all 0s when register is in a range: + * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff] + * Together this forms are continuous range: + * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff] + * + * Now, suppose that register range is in fact tighter: + * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R) + * Also suppose that it's 32-bit range is positive, + * meaning that lower 32-bits of the full 64-bit register + * are in the range: + * [0x0000_0000, 0x7fff_ffff] (W) + * + * If this happens, then any value in a range: + * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff] + * is smaller than a lowest bound of the range (R): + * 0xffff_ffff_8000_0000 + * which means that upper bits of the full 64-bit register + * can't be all 1s, when lower bits are in range (W). + * + * Note that: + * - 0xffff_ffff_8000_0000 == (s64)S32_MIN + * - 0x0000_0000_7fff_ffff == (s64)S32_MAX + * These relations are used in the conditions below. 
+ */ + if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) { + reg->smin_value = reg->s32_min_value; + reg->smax_value = reg->s32_max_value; + reg->umin_value = reg->s32_min_value; + reg->umax_value = reg->s32_max_value; + reg->var_off = tnum_intersect(reg->var_off, + tnum_range(reg->smin_value, reg->smax_value)); + } } static void __reg_deduce_bounds(struct bpf_reg_state *reg) From 424ebaa3678b0d7f653dffb08eae90424740e0f4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:38 -0700 Subject: [PATCH 0246/1052] selftests/bpf: extract utility function for BPF disassembly struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz); Disassembles instruction 'insn' to a text buffer 'buf'. Removes insn->code hex prefix added by kernel disassembly routine. Returns a pointer to the next instruction (increments insn by either 1 or 2). Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/Makefile | 1 + tools/testing/selftests/bpf/disasm_helpers.c | 51 +++++++++++++ tools/testing/selftests/bpf/disasm_helpers.h | 12 +++ .../selftests/bpf/prog_tests/ctx_rewrite.c | 74 +++---------------- tools/testing/selftests/bpf/testing_helpers.c | 1 + 5 files changed, 75 insertions(+), 64 deletions(-) create mode 100644 tools/testing/selftests/bpf/disasm_helpers.c create mode 100644 tools/testing/selftests/bpf/disasm_helpers.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3318d5425d75..888ba68e6592 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -661,6 +661,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ test_loader.c \ xsk.c \ disasm.c \ + disasm_helpers.c \ json_writer.c \ flow_dissector_load.h \ ip_check_defrag_frags.h diff --git a/tools/testing/selftests/bpf/disasm_helpers.c b/tools/testing/selftests/bpf/disasm_helpers.c new file mode 100644 index 000000000000..96b1f2ffe438 --- /dev/null +++ b/tools/testing/selftests/bpf/disasm_helpers.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +#include +#include "disasm.h" + +struct print_insn_context { + char *buf; + size_t sz; +}; + +static void print_insn_cb(void *private_data, const char *fmt, ...) +{ + struct print_insn_context *ctx = private_data; + va_list args; + + va_start(args, fmt); + vsnprintf(ctx->buf, ctx->sz, fmt, args); + va_end(args); +} + +struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz) +{ + struct print_insn_context ctx = { + .buf = buf, + .sz = buf_sz, + }; + struct bpf_insn_cbs cbs = { + .cb_print = print_insn_cb, + .private_data = &ctx, + }; + char *tmp, *pfx_end, *sfx_start; + bool double_insn; + int len; + + print_bpf_insn(&cbs, insn, true); + /* We share code with kernel BPF disassembler, it adds '(FF) ' prefix + * for each instruction (FF stands for instruction `code` byte). + * Remove the prefix inplace, and also simplify call instructions. + * E.g.: "(85) call foo#10" -> "call foo". + * Also remove newline in the end (the 'max(strlen(buf) - 1, 0)' thing). 
+ */ + pfx_end = buf + 5; + sfx_start = buf + max((int)strlen(buf) - 1, 0); + if (strncmp(pfx_end, "call ", 5) == 0 && (tmp = strrchr(buf, '#'))) + sfx_start = tmp; + len = sfx_start - pfx_end; + memmove(buf, pfx_end, len); + buf[len] = 0; + double_insn = insn->code == (BPF_LD | BPF_IMM | BPF_DW); + return insn + (double_insn ? 2 : 1); +} diff --git a/tools/testing/selftests/bpf/disasm_helpers.h b/tools/testing/selftests/bpf/disasm_helpers.h new file mode 100644 index 000000000000..7b26cab70099 --- /dev/null +++ b/tools/testing/selftests/bpf/disasm_helpers.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __DISASM_HELPERS_H +#define __DISASM_HELPERS_H + +#include + +struct bpf_insn; + +struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz); + +#endif /* __DISASM_HELPERS_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index 08b6391f2f56..dd75ccb03770 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -10,7 +10,8 @@ #include "bpf/btf.h" #include "bpf_util.h" #include "linux/filter.h" -#include "disasm.h" +#include "linux/kernel.h" +#include "disasm_helpers.h" #define MAX_PROG_TEXT_SZ (32 * 1024) @@ -628,63 +629,6 @@ static bool match_pattern(struct btf *btf, char *pattern, char *text, char *reg_ return false; } -static void print_insn(void *private_data, const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - vfprintf((FILE *)private_data, fmt, args); - va_end(args); -} - -/* Disassemble instructions to a stream */ -static void print_xlated(FILE *out, struct bpf_insn *insn, __u32 len) -{ - const struct bpf_insn_cbs cbs = { - .cb_print = print_insn, - .cb_call = NULL, - .cb_imm = NULL, - .private_data = out, - }; - bool double_insn = false; - int i; - - for (i = 0; i < len; i++) { - if (double_insn) { - double_insn = false; - continue; - } - - double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); - print_bpf_insn(&cbs, insn + i, true); - } -} - -/* We share code with kernel BPF disassembler, it adds '(FF) ' prefix - * for each instruction (FF stands for instruction `code` byte). - * This function removes the prefix inplace for each line in `str`. - */ -static void remove_insn_prefix(char *str, int size) -{ - const int prefix_size = 5; - - int write_pos = 0, read_pos = prefix_size; - int len = strlen(str); - char c; - - size = min(size, len); - - while (read_pos < size) { - c = str[read_pos++]; - if (c == 0) - break; - str[write_pos++] = c; - if (c == '\n') - read_pos += prefix_size; - } - str[write_pos] = 0; -} - struct prog_info { char *prog_kind; enum bpf_prog_type prog_type; @@ -699,9 +643,10 @@ static void match_program(struct btf *btf, char *reg_map[][2], bool skip_first_insn) { - struct bpf_insn *buf = NULL; + struct bpf_insn *buf = NULL, *insn, *insn_end; int err = 0, prog_fd = 0; FILE *prog_out = NULL; + char insn_buf[64]; char *text = NULL; __u32 cnt = 0; @@ -739,12 +684,13 @@ static void match_program(struct btf *btf, PRINT_FAIL("Can't open memory stream\n"); goto out; } - if (skip_first_insn) - print_xlated(prog_out, buf + 1, cnt - 1); - else - print_xlated(prog_out, buf, cnt); + insn_end = buf + cnt; + insn = buf + (skip_first_insn ? 
1 : 0); + while (insn < insn_end) { + insn = disasm_insn(insn, insn_buf, sizeof(insn_buf)); + fprintf(prog_out, "%s\n", insn_buf); + } fclose(prog_out); - remove_insn_prefix(text, MAX_PROG_TEXT_SZ); ASSERT_TRUE(match_pattern(btf, pattern, text, reg_map), pinfo->prog_kind); diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index d5379a0e6da8..ac7c66f4fc7b 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -7,6 +7,7 @@ #include #include #include +#include "disasm.h" #include "test_progs.h" #include "testing_helpers.h" #include From 26672b5caf3cbeeffe8651035dddff95f610e7ec Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 09:29:40 -0700 Subject: [PATCH 0247/1052] selftests/bpf: Add reg_bounds tests for ldsx and subreg compare Add a few reg_bounds selftests to test 32/16/8-bit ldsx and subreg comparison. Without the previous patch, all added tests will fail. Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723162940.2732171-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/prog_tests/reg_bounds.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index eb74363f9f70..0da4225749bd 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -433,6 +433,19 @@ static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, y_cast = range_cast(y_t, x_t, y); + /* If we know that + * - *x* is in the range of signed 32bit value, and + * - *y_cast* range is 32-bit signed non-negative + * then *x* range can be improved with *y_cast* such that *x* range + * is 32-bit signed non-negative. Otherwise, if the new range for *x* + * allows upper 32-bit * 0xffffffff then the eventual new range for + * *x* will be out of signed 32-bit range which violates the origin + * *x* range. + */ + if (x_t == S64 && y_t == S32 && y_cast.a <= S32_MAX && y_cast.b <= S32_MAX && + (s64)x.a >= S32_MIN && (s64)x.b <= S32_MAX) + return range_improve(x_t, x, y_cast); + /* the case when new range knowledge, *y*, is a 32-bit subregister * range, while previous range knowledge, *x*, is a full register * 64-bit range, needs special treatment to take into account upper 32 @@ -2108,6 +2121,9 @@ static struct subtest_case crafted_cases[] = { {S32, U32, {(u32)S32_MIN, 0}, {0, 0}}, {S32, U32, {(u32)S32_MIN, 0}, {(u32)S32_MIN, (u32)S32_MIN}}, {S32, U32, {(u32)S32_MIN, S32_MAX}, {S32_MAX, S32_MAX}}, + {S64, U32, {0x0, 0x1f}, {0xffffffff80000000ULL, 0x000000007fffffffULL}}, + {S64, U32, {0x0, 0x1f}, {0xffffffffffff8000ULL, 0x0000000000007fffULL}}, + {S64, U32, {0x0, 0x1f}, {0xffffffffffffff80ULL, 0x000000000000007fULL}}, }; /* Go over crafted hard-coded cases. This is fast, so we do it as part of From 203e6aba7692bca18fd97251b1354da0f5e2ba30 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:39 -0700 Subject: [PATCH 0248/1052] selftests/bpf: print correct offset for pseudo calls in disasm_insn() Adjust disasm_helpers.c:disasm_insn() to account for the following part of the verifier.c:jit_subprogs: for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { /* ... 
*/ if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); insn->imm = subprog; } Where verifier moves offset of the subprogram to the insn->off field. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/disasm_helpers.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/bpf/disasm_helpers.c b/tools/testing/selftests/bpf/disasm_helpers.c index 96b1f2ffe438..f529f1c8c171 100644 --- a/tools/testing/selftests/bpf/disasm_helpers.c +++ b/tools/testing/selftests/bpf/disasm_helpers.c @@ -4,6 +4,7 @@ #include "disasm.h" struct print_insn_context { + char scratch[16]; char *buf; size_t sz; }; @@ -18,6 +19,22 @@ static void print_insn_cb(void *private_data, const char *fmt, ...) va_end(args); } +static const char *print_call_cb(void *private_data, const struct bpf_insn *insn) +{ + struct print_insn_context *ctx = private_data; + + /* For pseudo calls verifier.c:jit_subprogs() hides original + * imm to insn->off and changes insn->imm to be an index of + * the subprog instead. + */ + if (insn->src_reg == BPF_PSEUDO_CALL) { + snprintf(ctx->scratch, sizeof(ctx->scratch), "%+d", insn->off); + return ctx->scratch; + } + + return NULL; +} + struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz) { struct print_insn_context ctx = { @@ -26,6 +43,7 @@ struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz) }; struct bpf_insn_cbs cbs = { .cb_print = print_insn_cb, + .cb_call = print_call_cb, .private_data = &ctx, }; char *tmp, *pfx_end, *sfx_start; From 4ef5d6af493558124b7a6c13cace58b938fe27d4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:40 -0700 Subject: [PATCH 0249/1052] selftests/bpf: no need to track next_match_pos in struct test_loader The call stack for validate_case() function looks as follows: - test_loader__run_subtests() - process_subtest() - run_subtest() - prepare_case(), which does 'tester->next_match_pos = 0'; - validate_case(), which increments tester->next_match_pos. Hence, each subtest is run with next_match_pos freshly set to zero. Meaning that there is no need to persist this variable in the struct test_loader, use local variable instead. 
Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/test_loader.c | 19 ++++++++----------- tools/testing/selftests/bpf/test_progs.h | 1 - 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index f14e10b0de96..47508cf66e89 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -434,7 +434,6 @@ static void prepare_case(struct test_loader *tester, bpf_program__set_flags(prog, prog_flags | spec->prog_flags); tester->log_buf[0] = '\0'; - tester->next_match_pos = 0; } static void emit_verifier_log(const char *log_buf, bool force) @@ -450,25 +449,23 @@ static void validate_case(struct test_loader *tester, struct bpf_program *prog, int load_err) { - int i, j, err; - char *match; regmatch_t reg_match[1]; + const char *log = tester->log_buf; + int i, j, err; for (i = 0; i < subspec->expect_msg_cnt; i++) { struct expect_msg *msg = &subspec->expect_msgs[i]; + const char *match = NULL; if (msg->substr) { - match = strstr(tester->log_buf + tester->next_match_pos, msg->substr); + match = strstr(log, msg->substr); if (match) - tester->next_match_pos = match - tester->log_buf + strlen(msg->substr); + log += strlen(msg->substr); } else { - err = regexec(&msg->regex, - tester->log_buf + tester->next_match_pos, 1, reg_match, 0); + err = regexec(&msg->regex, log, 1, reg_match, 0); if (err == 0) { - match = tester->log_buf + tester->next_match_pos + reg_match[0].rm_so; - tester->next_match_pos += reg_match[0].rm_eo; - } else { - match = NULL; + match = log + reg_match[0].rm_so; + log += reg_match[0].rm_eo; } } diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 51341d50213b..b1e949fb16cf 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -447,7 +447,6 @@ typedef int (*pre_execution_cb)(struct bpf_object *obj); struct test_loader { char *log_buf; size_t log_buf_sz; - size_t next_match_pos; pre_execution_cb pre_execution_cb; struct bpf_object *obj; From 64f01e935ddb26f48baec71883c27878ac4231dc Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:41 -0700 Subject: [PATCH 0250/1052] selftests/bpf: extract test_loader->expect_msgs as a data structure Non-functional change: use a separate data structure to represented expected messages in test_loader. This would allow to use the same functionality for expected set of disassembled instructions in the follow-up commit. 
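To illustrate how the reworked helpers compose, a hypothetical caller (not taken from the patch) builds a pattern list with push_msg() into a struct expected_msgs, matches it against whatever text is at hand, and releases it with free_msgs():

    struct expected_msgs msgs = {};
    int err;

    err = push_msg("expected substring", NULL, &msgs);       /* plain substring */
    if (!err)
            err = push_msg(NULL, "expected regex [0-9]+", &msgs); /* or a regex */
    /* ... match the patterns against the verifier log, or later
     * against disassembled program text ... */
    free_msgs(&msgs);
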
Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/test_loader.c | 81 ++++++++++++----------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 47508cf66e89..3f84903558dd 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -55,11 +55,15 @@ struct expect_msg { regex_t regex; }; +struct expected_msgs { + struct expect_msg *patterns; + size_t cnt; +}; + struct test_subspec { char *name; bool expect_failure; - struct expect_msg *expect_msgs; - size_t expect_msg_cnt; + struct expected_msgs expect_msgs; int retval; bool execute; }; @@ -96,44 +100,45 @@ void test_loader_fini(struct test_loader *tester) free(tester->log_buf); } -static void free_test_spec(struct test_spec *spec) +static void free_msgs(struct expected_msgs *msgs) { int i; + for (i = 0; i < msgs->cnt; i++) + if (msgs->patterns[i].regex_str) + regfree(&msgs->patterns[i].regex); + free(msgs->patterns); + msgs->patterns = NULL; + msgs->cnt = 0; +} + +static void free_test_spec(struct test_spec *spec) +{ /* Deallocate expect_msgs arrays. */ - for (i = 0; i < spec->priv.expect_msg_cnt; i++) - if (spec->priv.expect_msgs[i].regex_str) - regfree(&spec->priv.expect_msgs[i].regex); - for (i = 0; i < spec->unpriv.expect_msg_cnt; i++) - if (spec->unpriv.expect_msgs[i].regex_str) - regfree(&spec->unpriv.expect_msgs[i].regex); + free_msgs(&spec->priv.expect_msgs); + free_msgs(&spec->unpriv.expect_msgs); free(spec->priv.name); free(spec->unpriv.name); - free(spec->priv.expect_msgs); - free(spec->unpriv.expect_msgs); - spec->priv.name = NULL; spec->unpriv.name = NULL; - spec->priv.expect_msgs = NULL; - spec->unpriv.expect_msgs = NULL; } -static int push_msg(const char *substr, const char *regex_str, struct test_subspec *subspec) +static int push_msg(const char *substr, const char *regex_str, struct expected_msgs *msgs) { void *tmp; int regcomp_res; char error_msg[100]; struct expect_msg *msg; - tmp = realloc(subspec->expect_msgs, - (1 + subspec->expect_msg_cnt) * sizeof(struct expect_msg)); + tmp = realloc(msgs->patterns, + (1 + msgs->cnt) * sizeof(struct expect_msg)); if (!tmp) { ASSERT_FAIL("failed to realloc memory for messages\n"); return -ENOMEM; } - subspec->expect_msgs = tmp; - msg = &subspec->expect_msgs[subspec->expect_msg_cnt]; + msgs->patterns = tmp; + msg = &msgs->patterns[msgs->cnt]; if (substr) { msg->substr = substr; @@ -150,7 +155,7 @@ static int push_msg(const char *substr, const char *regex_str, struct test_subsp } } - subspec->expect_msg_cnt += 1; + msgs->cnt += 1; return 0; } @@ -272,25 +277,25 @@ static int parse_test_spec(struct test_loader *tester, spec->mode_mask |= UNPRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) { msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1; - err = push_msg(msg, NULL, &spec->priv); + err = push_msg(msg, NULL, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV)) { msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX_UNPRIV) - 1; - err = push_msg(msg, NULL, &spec->unpriv); + err = push_msg(msg, NULL, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX)) { msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX) - 1; - err = 
push_msg(NULL, msg, &spec->priv); + err = push_msg(NULL, msg, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV)) { msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX_UNPRIV) - 1; - err = push_msg(NULL, msg, &spec->unpriv); + err = push_msg(NULL, msg, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; @@ -387,11 +392,12 @@ static int parse_test_spec(struct test_loader *tester, spec->unpriv.execute = spec->priv.execute; } - if (!spec->unpriv.expect_msgs) { - for (i = 0; i < spec->priv.expect_msg_cnt; i++) { - struct expect_msg *msg = &spec->priv.expect_msgs[i]; + if (spec->unpriv.expect_msgs.cnt == 0) { + for (i = 0; i < spec->priv.expect_msgs.cnt; i++) { + struct expect_msg *msg = &spec->priv.expect_msgs.patterns[i]; - err = push_msg(msg->substr, msg->regex_str, &spec->unpriv); + err = push_msg(msg->substr, msg->regex_str, + &spec->unpriv.expect_msgs); if (err) goto cleanup; } @@ -443,18 +449,14 @@ static void emit_verifier_log(const char *log_buf, bool force) fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf); } -static void validate_case(struct test_loader *tester, - struct test_subspec *subspec, - struct bpf_object *obj, - struct bpf_program *prog, - int load_err) +static void validate_msgs(char *log_buf, struct expected_msgs *msgs) { regmatch_t reg_match[1]; - const char *log = tester->log_buf; + const char *log = log_buf; int i, j, err; - for (i = 0; i < subspec->expect_msg_cnt; i++) { - struct expect_msg *msg = &subspec->expect_msgs[i]; + for (i = 0; i < msgs->cnt; i++) { + struct expect_msg *msg = &msgs->patterns[i]; const char *match = NULL; if (msg->substr) { @@ -471,9 +473,9 @@ static void validate_case(struct test_loader *tester, if (!ASSERT_OK_PTR(match, "expect_msg")) { if (env.verbosity == VERBOSE_NONE) - emit_verifier_log(tester->log_buf, true /*force*/); + emit_verifier_log(log_buf, true /*force*/); for (j = 0; j <= i; j++) { - msg = &subspec->expect_msgs[j]; + msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", j < i ? "MATCHED " : "EXPECTED", msg->substr ? "SUBSTR" : " REGEX", @@ -692,9 +694,8 @@ void run_subtest(struct test_loader *tester, goto tobj_cleanup; } } - emit_verifier_log(tester->log_buf, false /*force*/); - validate_case(tester, subspec, tobj, tprog, err); + validate_msgs(tester->log_buf, &subspec->expect_msgs); if (should_do_test_run(spec, subspec)) { /* For some reason test_verifier executes programs From 9c9f7339131030949a8ef111080427ff1a8085b5 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:42 -0700 Subject: [PATCH 0251/1052] selftests/bpf: allow checking xlated programs in verifier_* tests Add a macro __xlated("...") for use with test_loader tests. When such annotations are present for the test case: - bpf_prog_get_info_by_fd() is used to get BPF program after all rewrites are applied by verifier. - the program is disassembled and patterns specified in __xlated are searched for in the disassembly text. __xlated matching follows the same mechanics as __msg: each subsequent pattern is matched from the point where previous pattern ended. 
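For context, the first step of this mechanism, fetching the post-verifier program, is a two-call bpf_prog_get_info_by_fd() sequence; a trimmed sketch of what the selftests' get_xlated_program() helper does (error handling and cleanup omitted, requires sufficient privileges):

    struct bpf_prog_info info = {};
    __u32 info_len = sizeof(info);
    struct bpf_insn *insns;
    __u32 prog_len;

    /* first call: learn the size of the xlated image */
    if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len))
            return -1;
    prog_len = info.xlated_prog_len;
    insns = calloc(1, prog_len);

    /* second call: have the kernel copy the instructions out */
    memset(&info, 0, sizeof(info));
    info.xlated_prog_len = prog_len;
    info.xlated_prog_insns = (__u64)(unsigned long)insns;
    if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len))
            return -1;
    /* insns now holds prog_len / sizeof(struct bpf_insn) instructions,
     * which are then disassembled and matched against __xlated patterns */
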
This allows to write tests like below, where the goal is to verify the behavior of one of the of the transformations applied by verifier: SEC("raw_tp") __xlated("1: w0 = ") __xlated("2: r0 = &(void __percpu *)(r0)") __xlated("3: r0 = *(u32 *)(r0 +0)") __xlated("4: exit") __success __naked void simple(void) { asm volatile ( "call %[bpf_get_smp_processor_id];" "exit;" : : __imm(bpf_get_smp_processor_id) : __clobber_all); } Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/bpf_misc.h | 5 ++ tools/testing/selftests/bpf/test_loader.c | 82 +++++++++++++++++++- 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 81097a3f15eb..a70939c7bc26 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -26,6 +26,9 @@ * * __regex Same as __msg, but using a regular expression. * __regex_unpriv Same as __msg_unpriv but using a regular expression. + * __xlated Expect a line in a disassembly log after verifier applies rewrites. + * Multiple __xlated attributes could be specified. + * __xlated_unpriv Same as __xlated but for unprivileged mode. * * __success Expect program load success in privileged mode. * __success_unpriv Expect program load success in unprivileged mode. @@ -63,11 +66,13 @@ */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) #define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" regex))) +#define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" msg))) #define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" regex))) +#define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 3f84903558dd..b44b6a2fc82c 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -7,6 +7,7 @@ #include #include "autoconf_helper.h" +#include "disasm_helpers.h" #include "unpriv_helpers.h" #include "cap_helpers.h" @@ -19,10 +20,12 @@ #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" #define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" #define TEST_TAG_EXPECT_REGEX_PFX "comment:test_expect_regex=" +#define TEST_TAG_EXPECT_XLATED_PFX "comment:test_expect_xlated=" #define TEST_TAG_EXPECT_FAILURE_UNPRIV "comment:test_expect_failure_unpriv" #define TEST_TAG_EXPECT_SUCCESS_UNPRIV "comment:test_expect_success_unpriv" #define TEST_TAG_EXPECT_MSG_PFX_UNPRIV "comment:test_expect_msg_unpriv=" #define TEST_TAG_EXPECT_REGEX_PFX_UNPRIV 
"comment:test_expect_regex_unpriv=" +#define TEST_TAG_EXPECT_XLATED_PFX_UNPRIV "comment:test_expect_xlated_unpriv=" #define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" #define TEST_TAG_PROG_FLAGS_PFX "comment:test_prog_flags=" #define TEST_TAG_DESCRIPTION_PFX "comment:test_description=" @@ -64,6 +67,7 @@ struct test_subspec { char *name; bool expect_failure; struct expected_msgs expect_msgs; + struct expected_msgs expect_xlated; int retval; bool execute; }; @@ -117,6 +121,8 @@ static void free_test_spec(struct test_spec *spec) /* Deallocate expect_msgs arrays. */ free_msgs(&spec->priv.expect_msgs); free_msgs(&spec->unpriv.expect_msgs); + free_msgs(&spec->priv.expect_xlated); + free_msgs(&spec->unpriv.expect_xlated); free(spec->priv.name); free(spec->unpriv.name); @@ -299,6 +305,18 @@ static int parse_test_spec(struct test_loader *tester, if (err) goto cleanup; spec->mode_mask |= UNPRIV; + } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX)) { + msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX) - 1; + err = push_msg(msg, NULL, &spec->priv.expect_xlated); + if (err) + goto cleanup; + spec->mode_mask |= PRIV; + } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV)) { + msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX_UNPRIV) - 1; + err = push_msg(msg, NULL, &spec->unpriv.expect_xlated); + if (err) + goto cleanup; + spec->mode_mask |= UNPRIV; } else if (str_has_pfx(s, TEST_TAG_RETVAL_PFX)) { val = s + sizeof(TEST_TAG_RETVAL_PFX) - 1; err = parse_retval(val, &spec->priv.retval, "__retval"); @@ -402,6 +420,16 @@ static int parse_test_spec(struct test_loader *tester, goto cleanup; } } + if (spec->unpriv.expect_xlated.cnt == 0) { + for (i = 0; i < spec->priv.expect_xlated.cnt; i++) { + struct expect_msg *msg = &spec->priv.expect_xlated.patterns[i]; + + err = push_msg(msg->substr, msg->regex_str, + &spec->unpriv.expect_xlated); + if (err) + goto cleanup; + } + } } spec->valid = true; @@ -449,7 +477,15 @@ static void emit_verifier_log(const char *log_buf, bool force) fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf); } -static void validate_msgs(char *log_buf, struct expected_msgs *msgs) +static void emit_xlated(const char *xlated, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "XLATED:\n=============\n%s=============\n", xlated); +} + +static void validate_msgs(char *log_buf, struct expected_msgs *msgs, + void (*emit_fn)(const char *buf, bool force)) { regmatch_t reg_match[1]; const char *log = log_buf; @@ -473,7 +509,7 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs) if (!ASSERT_OK_PTR(match, "expect_msg")) { if (env.verbosity == VERBOSE_NONE) - emit_verifier_log(log_buf, true /*force*/); + emit_fn(log_buf, true /*force*/); for (j = 0; j <= i; j++) { msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", @@ -610,6 +646,37 @@ static bool should_do_test_run(struct test_spec *spec, struct test_subspec *subs return true; } +/* Get a disassembly of BPF program after verifier applies all rewrites */ +static int get_xlated_program_text(int prog_fd, char *text, size_t text_sz) +{ + struct bpf_insn *insn_start = NULL, *insn, *insn_end; + __u32 insns_cnt = 0, i; + char buf[64]; + FILE *out = NULL; + int err; + + err = get_xlated_program(prog_fd, &insn_start, &insns_cnt); + if (!ASSERT_OK(err, "get_xlated_program")) + goto out; + out = fmemopen(text, text_sz, "w"); + if (!ASSERT_OK_PTR(out, "open_memstream")) + goto out; + insn_end = insn_start + insns_cnt; + insn = insn_start; + while (insn < insn_end) { + i 
= insn - insn_start; + insn = disasm_insn(insn, buf, sizeof(buf)); + fprintf(out, "%d: %s\n", i, buf); + } + fflush(out); + +out: + free(insn_start); + if (out) + fclose(out); + return err; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -695,7 +762,16 @@ void run_subtest(struct test_loader *tester, } } emit_verifier_log(tester->log_buf, false /*force*/); - validate_msgs(tester->log_buf, &subspec->expect_msgs); + validate_msgs(tester->log_buf, &subspec->expect_msgs, emit_verifier_log); + + if (subspec->expect_xlated.cnt) { + err = get_xlated_program_text(bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err) + goto tobj_cleanup; + emit_xlated(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->expect_xlated, emit_xlated); + } if (should_do_test_run(spec, subspec)) { /* For some reason test_verifier executes programs From ee7fe84468b1732fe65c5af3836437d54ac4c419 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:43 -0700 Subject: [PATCH 0252/1052] selftests/bpf: __arch_* macro to limit test cases to specific archs Add annotations __arch_x86_64, __arch_arm64, __arch_riscv64 to specify on which architecture the test case should be tested. Several __arch_* annotations could be specified at once. When test case is not run on current arch it is marked as skipped. For example, the following would be tested only on arm64 and riscv64: SEC("raw_tp") __arch_arm64 __arch_riscv64 __xlated("1: *(u64 *)(r10 - 16) = r1") __xlated("2: call") __xlated("3: r1 = *(u64 *)(r10 - 16);") __success __naked void canary_arm64_riscv64(void) { asm volatile ( "r1 = 1;" "*(u64 *)(r10 - 16) = r1;" "call %[bpf_get_smp_processor_id];" "r1 = *(u64 *)(r10 - 16);" "exit;" : : __imm(bpf_get_smp_processor_id) : __clobber_all); } On x86 it would be skipped: #467/2 verifier_nocsr/canary_arm64_riscv64:SKIP Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/bpf_misc.h | 8 ++++ tools/testing/selftests/bpf/test_loader.c | 43 ++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index a70939c7bc26..a225cd87897c 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -63,6 +63,10 @@ * __auxiliary Annotated program is not a separate test, but used as auxiliary * for some other test cases and should always be loaded. * __auxiliary_unpriv Same, but load program in unprivileged mode. + * + * __arch_* Specify on which architecture the test case should be tested. + * Several __arch_* annotations could be specified at once. + * When test case is not run on current arch it is marked as skipped. 
*/ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) #define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" regex))) @@ -82,6 +86,10 @@ #define __auxiliary __attribute__((btf_decl_tag("comment:test_auxiliary"))) #define __auxiliary_unpriv __attribute__((btf_decl_tag("comment:test_auxiliary_unpriv"))) #define __btf_path(path) __attribute__((btf_decl_tag("comment:test_btf_path=" path))) +#define __arch(arch) __attribute__((btf_decl_tag("comment:test_arch=" arch))) +#define __arch_x86_64 __arch("X86_64") +#define __arch_arm64 __arch("ARM64") +#define __arch_riscv64 __arch("RISCV64") /* Convenience macro for use with 'asm volatile' blocks */ #define __naked __attribute__((naked)) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index b44b6a2fc82c..12b0c41e8d64 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -34,6 +34,7 @@ #define TEST_TAG_AUXILIARY "comment:test_auxiliary" #define TEST_TAG_AUXILIARY_UNPRIV "comment:test_auxiliary_unpriv" #define TEST_BTF_PATH "comment:test_btf_path=" +#define TEST_TAG_ARCH "comment:test_arch=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xcafe4all @@ -80,6 +81,7 @@ struct test_spec { int log_level; int prog_flags; int mode_mask; + int arch_mask; bool auxiliary; bool valid; }; @@ -213,6 +215,12 @@ static void update_flags(int *flags, int flag, bool clear) *flags |= flag; } +enum arch { + ARCH_X86_64 = 0x1, + ARCH_ARM64 = 0x2, + ARCH_RISCV64 = 0x4, +}; + /* Uses btf_decl_tag attributes to describe the expected test * behavior, see bpf_misc.h for detailed description of each attribute * and attribute combinations. @@ -226,6 +234,7 @@ static int parse_test_spec(struct test_loader *tester, bool has_unpriv_result = false; bool has_unpriv_retval = false; int func_id, i, err = 0; + u32 arch_mask = 0; struct btf *btf; memset(spec, 0, sizeof(*spec)); @@ -364,11 +373,26 @@ static int parse_test_spec(struct test_loader *tester, goto cleanup; update_flags(&spec->prog_flags, flags, clear); } + } else if (str_has_pfx(s, TEST_TAG_ARCH)) { + val = s + sizeof(TEST_TAG_ARCH) - 1; + if (strcmp(val, "X86_64") == 0) { + arch_mask |= ARCH_X86_64; + } else if (strcmp(val, "ARM64") == 0) { + arch_mask |= ARCH_ARM64; + } else if (strcmp(val, "RISCV64") == 0) { + arch_mask |= ARCH_RISCV64; + } else { + PRINT_FAIL("bad arch spec: '%s'", val); + err = -EINVAL; + goto cleanup; + } } else if (str_has_pfx(s, TEST_BTF_PATH)) { spec->btf_custom_path = s + sizeof(TEST_BTF_PATH) - 1; } } + spec->arch_mask = arch_mask; + if (spec->mode_mask == 0) spec->mode_mask = PRIV; @@ -677,6 +701,20 @@ static int get_xlated_program_text(int prog_fd, char *text, size_t text_sz) return err; } +static bool run_on_current_arch(int arch_mask) +{ + if (arch_mask == 0) + return true; +#if defined(__x86_64__) + return arch_mask & ARCH_X86_64; +#elif defined(__aarch64__) + return arch_mask & ARCH_ARM64; +#elif defined(__riscv) && __riscv_xlen == 64 + return arch_mask & ARCH_RISCV64; +#endif + return false; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -701,6 +739,11 @@ void run_subtest(struct test_loader *tester, if (!test__start_subtest(subspec->name)) return; + if (!run_on_current_arch(spec->arch_mask)) { + test__skip(); + return; + } + if (unpriv) { if (!can_execute_unpriv(tester, spec)) { test__skip(); From d0ad1f8f8846cffebca55abdd1ed275e276a6754 
Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:44 -0700 Subject: [PATCH 0253/1052] selftests/bpf: test no_caller_saved_registers spill/fill removal Tests for no_caller_saved_registers processing logic (see verifier.c:match_and_mark_nocsr_pattern()): - a canary positive test case; - a canary test case for arm64 and riscv64; - various tests with broken patterns; - tests with read/write fixed/varying stack access that violate nocsr stack access contract; - tests with multiple subprograms; - tests using nocsr in combination with may_goto/bpf_loop, as all of these features affect stack depth; - tests for nocsr stack spills below max stack depth. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-11-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_nocsr.c | 796 ++++++++++++++++++ 2 files changed, 798 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_nocsr.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index ff1c7da1d06e..67a49d12472c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -53,6 +53,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" +#include "verifier_nocsr.skel.h" #include "verifier_or_jmp32_k.skel.h" #include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" @@ -173,6 +174,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } +void test_verifier_nocsr(void) { RUN(verifier_nocsr); } void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); } void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } diff --git a/tools/testing/selftests/bpf/progs/verifier_nocsr.c b/tools/testing/selftests/bpf/progs/verifier_nocsr.c new file mode 100644 index 000000000000..a7fe277e5167 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_nocsr.c @@ -0,0 +1,796 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("4: r5 = 5") +__xlated("5: w0 = ") +__xlated("6: r0 = &(void __percpu *)(r0)") +__xlated("7: r0 = *(u32 *)(r0 +0)") +__xlated("8: exit") +__success +__naked void simple(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r10 - 16) = r1;" + "*(u64 *)(r10 - 24) = r2;" + "*(u64 *)(r10 - 32) = r3;" + "*(u64 *)(r10 - 40) = r4;" + "*(u64 *)(r10 - 48) = r5;" + "call %[bpf_get_smp_processor_id];" + "r5 = *(u64 *)(r10 - 48);" + "r4 = *(u64 *)(r10 - 40);" + "r3 = *(u64 *)(r10 - 32);" + "r2 = *(u64 *)(r10 - 24);" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +/* The logic for detecting and verifying nocsr pattern is the same for + * any arch, however x86 differs from arm64 or riscv64 in a way + * bpf_get_smp_processor_id is rewritten: + * - on x86 it is done by verifier + * - on arm64 
and riscv64 it is done by jit + * + * Which leads to different xlated patterns for different archs: + * - on x86 the call is expanded as 3 instructions + * - on arm64 and riscv64 the call remains as is + * (but spills/fills are still removed) + * + * It is really desirable to check instruction indexes in the xlated + * patterns, so add this canary test to check that function rewrite by + * jit is correctly processed by nocsr logic, keep the rest of the + * tests as x86. + */ +SEC("raw_tp") +__arch_arm64 +__arch_riscv64 +__xlated("0: r1 = 1") +__xlated("1: call bpf_get_smp_processor_id") +__xlated("2: exit") +__success +__naked void canary_arm64_riscv64(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: r0 = &(void __percpu *)(r0)") +__xlated("3: exit") +__success +__naked void canary_zero_spills(void) +{ + asm volatile ( + "call %[bpf_get_smp_processor_id];" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r2 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r6") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r6 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern2(void) +{ + asm volatile ( + "r6 = 1;" + "*(u64 *)(r10 - 16) = r6;" + "call %[bpf_get_smp_processor_id];" + "r6 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r0") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern3(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r10 - 16) = r0;" + "call %[bpf_get_smp_processor_id];" + "r0 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r2 -16) = r1") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("6: r1 = *(u64 *)(r10 -16)") +__success +__naked void wrong_base_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = r10;" + "*(u64 *)(r2 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r2 = 1") +__success +__naked void wrong_insn_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = 1;" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r10 -16) = r1") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("6: r1 = *(u64 *)(r10 -8)") +__success +__naked void wrong_off_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r1;" + "call 
%[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -4) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u32 *)(r10 -4)") +__success +__naked void wrong_off_in_pattern2(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 4) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 4);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -16) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u32 *)(r10 -16)") +__success +__naked void wrong_size_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u32 *)(r10 -8) = r1") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("6: r1 = *(u32 *)(r10 -8)") +__success +__naked void partial_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "*(u32 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u32 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("0: r1 = 1") +__xlated("1: r2 = 2") +/* not patched, spills for -8, -16 not removed */ +__xlated("2: *(u64 *)(r10 -8) = r1") +__xlated("3: *(u64 *)(r10 -16) = r2") +__xlated("5: r0 = &(void __percpu *)(r0)") +__xlated("7: r2 = *(u64 *)(r10 -16)") +__xlated("8: r1 = *(u64 *)(r10 -8)") +/* patched, spills for -24, -32 removed */ +__xlated("10: r0 = &(void __percpu *)(r0)") +__xlated("12: exit") +__success +__naked void min_stack_offset(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + /* this call won't be patched */ + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u64 *)(r10 - 8);" + /* this call would be patched */ + "*(u64 *)(r10 - 24) = r1;" + "*(u64 *)(r10 - 32) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 32);" + "r1 = *(u64 *)(r10 - 24);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_read(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_write(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r1 - 0) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_read(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" /* random 
scalar value */ + "r6 &= 0x7;" /* r6 range [0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "r1 = *(u8 *)(r1 - 0);" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_write(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ + "r6 &= 0x7;" /* r6 range [0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "*(u8 *)(r1 - 0) = r7;" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_write_in_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "call bad_write_in_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void bad_write_in_subprog_aux(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r1 - 0) = r0;" /* invalidates nocsr contract for caller: */ + "exit;" /* caller stack at -8 used outside of the pattern */ + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_helper_write(void) +{ + asm volatile ( + "r1 = 1;" + /* nocsr pattern with stack offset -8 */ + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r2 = 1;" + "r3 = 42;" + /* read dst is fp[-8], thus nocsr rewrite not applied */ + "call %[bpf_probe_read_kernel];" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_probe_read_kernel) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main, not patched */ +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__xlated("9: call pc+1") +__xlated("10: exit") +/* subprogram, patched */ +__xlated("11: r1 = 1") +__xlated("13: r0 = &(void __percpu *)(r0)") +__xlated("15: exit") +__success +__naked void invalidate_one_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "call invalidate_one_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void invalidate_one_subprog_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main */ +__xlated("0: r1 = 1") +__xlated("2: r0 = &(void __percpu *)(r0)") +__xlated("4: call pc+1") 
+__xlated("5: exit") +/* subprogram */ +__xlated("6: r1 = 1") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("10: *(u64 *)(r10 -16) = r1") +__xlated("11: exit") +__success +__naked void subprogs_use_independent_offsets(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "call subprogs_use_independent_offsets_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void subprogs_use_independent_offsets_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 24) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 24);" + "*(u64 *)(r10 - 16) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("2: r0 = &(void __percpu *)(r0)") +__success +__naked void helper_call_does_not_prevent_nocsr(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +/* may_goto counter at -16 */ +__xlated("0: *(u64 *)(r10 -16) =") +__xlated("1: r1 = 1") +__xlated("3: r0 = &(void __percpu *)(r0)") +/* may_goto expansion starts */ +__xlated("5: r11 = *(u64 *)(r10 -16)") +__xlated("6: if r11 == 0x0 goto pc+3") +__xlated("7: r11 -= 1") +__xlated("8: *(u64 *)(r10 -16) = r11") +/* may_goto expansion ends */ +__xlated("9: *(u64 *)(r10 -8) = r1") +__xlated("10: exit") +__success +__naked void may_goto_interaction(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + ".8byte %[may_goto];" + /* just touch some stack at -8 */ + "*(u64 *)(r10 - 8) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) + : __clobber_all); +} + +__used +__naked static void dummy_loop_callback(void) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 32+0") +__xlated("2: r1 = 1") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* bpf_loop params setup */ +__xlated("6: r2 =") +__xlated("7: r3 = 0") +__xlated("8: r4 = 0") +/* ... part of the inlined bpf_loop */ +__xlated("12: *(u64 *)(r10 -32) = r6") +__xlated("13: *(u64 *)(r10 -24) = r7") +__xlated("14: *(u64 *)(r10 -16) = r8") +/* ... */ +__xlated("21: call pc+8") /* dummy_loop_callback */ +/* ... 
last insns of the bpf_loop_interaction1 */ +__xlated("28: r0 = 0") +__xlated("29: exit") +/* dummy_loop_callback */ +__xlated("30: r0 = 0") +__xlated("31: exit") +__success +__naked int bpf_loop_interaction1(void) +{ + asm volatile ( + "r1 = 1;" + /* nocsr stack region at -16, but could be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 40+0") +/* call bpf_get_smp_processor_id */ +__xlated("2: r1 = 42") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* call bpf_get_prandom_u32 */ +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("7: call") +__xlated("8: r1 = *(u64 *)(r10 -16)") +/* ... */ +/* ... part of the inlined bpf_loop */ +__xlated("15: *(u64 *)(r10 -40) = r6") +__xlated("16: *(u64 *)(r10 -32) = r7") +__xlated("17: *(u64 *)(r10 -24) = r8") +__success +__naked int bpf_loop_interaction2(void) +{ + asm volatile ( + "r1 = 42;" + /* nocsr stack region at -16, cannot be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512+0") +/* just to print xlated version when debugging */ +__xlated("r0 = &(void __percpu *)(r0)") +__success +/* cumulative_stack_depth() stack usage is MAX_BPF_STACK, + * called subprogram uses an additional slot for nocsr spill/fill, + * since nocsr spill/fill could be removed the program still fits + * in MAX_BPF_STACK and should be accepted. 
+ */ +__naked int cumulative_stack_depth(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "call cumulative_stack_depth_subprog;" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK) + : __clobber_all + ); +} + +__used +__naked static void cumulative_stack_depth_subprog(void) +{ + asm volatile ( + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + :: __imm(bpf_get_smp_processor_id) : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512") +__xlated("0: r1 = 42") +__xlated("1: *(u64 *)(r10 -512) = r1") +__xlated("2: w0 = ") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("4: r0 = *(u32 *)(r0 +0)") +__xlated("5: exit") +__success +__naked int nocsr_max_stack_ok(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id) + : __clobber_all + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 520") +__failure +__naked int nocsr_max_stack_fail(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + /* call to prandom blocks nocsr rewrite */ + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +char _license[] SEC("license") = "GPL"; From 6b376e7543dd15faec4e3878bc1a187126532985 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 23 Jul 2024 20:57:43 +0000 Subject: [PATCH 0254/1052] selftests/bpf: Make %.test.d prerequisite order only %.test.o should depend on %.test.d order-only to avoid unnecessary recompilations due to compiler dumping .d and .o files in random order. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/all/gSoCpn9qV5K0hRvrvYlrw2StRntsvZcrUuDfkZUh1Ang9E6yZ9XJGYDuIP9iCuM2YTVhSEzEXCteQ94_0uIUjx_mXwupFJt64NJaiMr99a0=@pm.me Link: https://lore.kernel.org/all/FnnOUuDMmf0SebqA1bb0fQIW4vguOZ-VcAlPnPMnmT2lJYxMMxFAhcgh77px8MsPS5Fr01I0YQxLJClEJTFWHdpaTBVSQhlmsVTcEsNQbV4=@pm.me Link: https://lore.kernel.org/bpf/yyjJRl5LODbI4-FseU0wIP5e4ik0zAy7Sy-5eGwrzG_UanI8rwWlQPfXAFnn_27hoZFogoUHRSWxFsLk7hPr0b6P5TZ3cRrM30_ggnu555M=@pm.me --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 888ba68e6592..083f413a9c98 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -598,7 +598,7 @@ endif # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ - $(TRUNNER_OUTPUT)/%.test.d + | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. 
$$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) From ec4fe2f0fa12fd2d0115df7e58414dc26899cc5e Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:28 -0700 Subject: [PATCH 0255/1052] selftests/bpf: Use pid_t consistently in test_progs.c Use pid_t rather than __pid_t when allocating memory for 'worker_pids' in 'struct test_env', as this is its declared type and also avoids compile errors seen building against musl libc on mipsel64: test_progs.c:1738:49: error: '__pid_t' undeclared (first use in this function); did you mean 'pid_t'? 1738 | env.worker_pids = calloc(sizeof(__pid_t), env.workers); | ^~~~~~~ | pid_t test_progs.c:1738:49: note: each undeclared identifier is reported only once for each function it appears in Fixes: 91b2c0afd00c ("selftests/bpf: Add parallelism to test_progs") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Geliang Tang Link: https://lore.kernel.org/bpf/c6447da51a94babc1931711a43e2ceecb135c93d.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_progs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 89ff704e9dad..60c5ec0f6abf 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1731,7 +1731,7 @@ int main(int argc, char **argv) /* launch workers if requested */ env.worker_id = -1; /* main process */ if (env.workers) { - env.worker_pids = calloc(sizeof(__pid_t), env.workers); + env.worker_pids = calloc(sizeof(pid_t), env.workers); env.worker_socks = calloc(sizeof(int), env.workers); if (env.debug) fprintf(stdout, "Launching %d workers.\n", env.workers); From d393f9479d4aaab0fa4c3caf513f28685e831f13 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:29 -0700 Subject: [PATCH 0256/1052] selftests/bpf: Fix compile error from rlim_t in sk_storage_map.c Cast 'rlim_t' argument to match expected type of printf() format and avoid compile errors seen building for mips64el/musl-libc: In file included from map_tests/sk_storage_map.c:20: map_tests/sk_storage_map.c: In function 'test_sk_storage_map_stress_free': map_tests/sk_storage_map.c:414:56: error: format '%lu' expects argument of type 'long unsigned int', but argument 2 has type 'rlim_t' {aka 'long long unsigned int'} [-Werror=format=] 414 | CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d", | ^~~~~~~~~~~~~~~~~~~~~~~ 415 | rlim_new.rlim_cur, errno); | ~~~~~~~~~~~~~~~~~ | | | rlim_t {aka long long unsigned int} ./test_maps.h:12:24: note: in definition of macro 'CHECK' 12 | printf(format); \ | ^~~~~~ map_tests/sk_storage_map.c:414:68: note: format string is defined here 414 | CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d", | ~~^ | | | long unsigned int | %llu cc1: all warnings being treated as errors Fixes: 51a0e301a563 ("bpf: Add BPF_MAP_TYPE_SK_STORAGE test to test_maps") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1e00a1fa7acf91b4ca135c4102dc796d518bad86.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/map_tests/sk_storage_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c index 18405c3b7cee..af10c309359a 100644 --- a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c +++ b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c @@ 
-412,7 +412,7 @@ static void test_sk_storage_map_stress_free(void) rlim_new.rlim_max = rlim_new.rlim_cur + 128; err = setrlimit(RLIMIT_NOFILE, &rlim_new); CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d", - rlim_new.rlim_cur, errno); + (unsigned long) rlim_new.rlim_cur, errno); } err = do_sk_storage_map_stress_free(); From 7b10f0c227ce3fa055d601f058dc411092a62a78 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:30 -0700 Subject: [PATCH 0257/1052] selftests/bpf: Fix error compiling bpf_iter_setsockopt.c with musl libc Existing code calls getsockname() with a 'struct sockaddr_in6 *' argument where a 'struct sockaddr *' argument is declared, yielding compile errors when building for mips64el/musl-libc: bpf_iter_setsockopt.c: In function 'get_local_port': bpf_iter_setsockopt.c:98:30: error: passing argument 2 of 'getsockname' from incompatible pointer type [-Werror=incompatible-pointer-types] 98 | if (!getsockname(fd, &addr, &addrlen)) | ^~~~~ | | | struct sockaddr_in6 * In file included from .../netinet/in.h:10, from .../arpa/inet.h:9, from ./test_progs.h:17, from bpf_iter_setsockopt.c:5: .../sys/socket.h:391:23: note: expected 'struct sockaddr * restrict' but argument is of type 'struct sockaddr_in6 *' 391 | int getsockname (int, struct sockaddr *__restrict, socklen_t *__restrict); | ^ cc1: all warnings being treated as errors This compiled under glibc only because the argument is declared to be a "funky" transparent union which includes both types above. Explicitly cast the argument to allow compiling for both musl and glibc. Fixes: eed92afdd14c ("bpf: selftest: Test batching and bpf_(get|set)sockopt in bpf tcp iter") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Geliang Tang Link: https://lore.kernel.org/bpf/f41def0f17b27a23b1709080e4e3f37f4cc11ca9.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c index b52ff8ce34db..16bed9dd8e6a 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c @@ -95,7 +95,7 @@ static unsigned short get_local_port(int fd) struct sockaddr_in6 addr; socklen_t addrlen = sizeof(addr); - if (!getsockname(fd, &addr, &addrlen)) + if (!getsockname(fd, (struct sockaddr *)&addr, &addrlen)) return ntohs(addr.sin6_port); return 0; From 69f409469c9b1515a5db40d5a36fda372376fa2d Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:31 -0700 Subject: [PATCH 0258/1052] selftests/bpf: Drop unneeded error.h includes The addition of general support for unprivileged tests in test_loader.c breaks building test_verifier on non-glibc (e.g. musl) systems, due to the inclusion of glibc extension '' in 'unpriv_helpers.c'. However, the header is actually not needed, so remove it to restore building. Similarly for sk_lookup.c and flow_dissector.c, error.h is not necessary and causes problems, so drop them. 
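None of the affected files actually call error(3), which is why simply dropping the include is sufficient. Should a test ever need error()-style reporting without relying on the glibc extension, a portable stand-in (purely illustrative, not part of this patch) could look like:

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* minimal replacement for glibc's error(): print a formatted message,
     * optionally append strerror(errnum), exit if status is non-zero */
    static void error_compat(int status, int errnum, const char *fmt, ...)
    {
            va_list ap;

            fflush(stdout);
            va_start(ap, fmt);
            vfprintf(stderr, fmt, ap);
            va_end(ap);
            if (errnum)
                    fprintf(stderr, ": %s", strerror(errnum));
            fputc('\n', stderr);
            if (status)
                    exit(status);
    }
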
Fixes: 1d56ade032a4 ("selftests/bpf: Unprivileged tests for test_loader.c") Fixes: 0ab5539f8584 ("selftests/bpf: Tests for BPF_SK_LOOKUP attach point") Fixes: 0905beec9f52 ("selftests/bpf: run flow dissector tests in skb-less mode") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/5664367edf5fea4f3f4b4aec3b182bcfc6edff9c.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 1 - tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 1 - tools/testing/selftests/bpf/unpriv_helpers.c | 1 - 3 files changed, 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 9e5f38739104..9625e6d21791 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include #include #include diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 96f3863da8bc..023c31bde229 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/tools/testing/selftests/bpf/unpriv_helpers.c b/tools/testing/selftests/bpf/unpriv_helpers.c index b6d016461fb0..220f6a963813 100644 --- a/tools/testing/selftests/bpf/unpriv_helpers.c +++ b/tools/testing/selftests/bpf/unpriv_helpers.c @@ -2,7 +2,6 @@ #include #include -#include #include #include #include From d44c93fc2f5a0c47b23fa03d374e45259abd92d2 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:34 -0700 Subject: [PATCH 0259/1052] selftests/bpf: Fix missing ARRAY_SIZE() definition in bench.c Add a "bpf_util.h" include to avoid the following error seen compiling for mips64el with musl libc: bench.c: In function 'find_benchmark': bench.c:590:25: error: implicit declaration of function 'ARRAY_SIZE' [-Werror=implicit-function-declaration] 590 | for (i = 0; i < ARRAY_SIZE(benchs); i++) { | ^~~~~~~~~~ cc1: all warnings being treated as errors Fixes: 8e7c2a023ac0 ("selftests/bpf: Add benchmark runner infrastructure") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/bc4dde77dfcd17a825d8f28f72f3292341966810.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/bench.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 627b74ae041b..90dc3aca32bd 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -10,6 +10,7 @@ #include #include #include "bench.h" +#include "bpf_util.h" #include "testing_helpers.h" struct env env = { From a2c155131b710959beb508ca6a54769b6b1bd488 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:35 -0700 Subject: [PATCH 0260/1052] selftests/bpf: Fix missing UINT_MAX definitions in benchmarks Include in 'bench.h' to provide a UINT_MAX definition and avoid multiple compile errors against mips64el/musl-libc like: benchs/bench_local_storage.c: In function 'parse_arg': benchs/bench_local_storage.c:40:38: error: 'UINT_MAX' undeclared (first use in this function) 40 | if (ret < 1 || ret > UINT_MAX) { | ^~~~~~~~ benchs/bench_local_storage.c:11:1: note: 'UINT_MAX' is defined in header ''; did you forget to '#include '? 
10 | #include +++ |+#include 11 | seen with bench_local_storage.c, bench_local_storage_rcu_tasks_trace.c, and bench_bpf_hashmap_lookup.c. Fixes: 73087489250d ("selftests/bpf: Add benchmark for local_storage get") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8f64a9d9fcff40a7fca090a65a68a9b62a468e16.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/bench.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 68180d8f8558..005c401b3e22 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -10,6 +10,7 @@ #include #include #include +#include struct cpu_set { bool *cpus; From 6495eb79ca7d15bd87c38d77307e8f9b6b7bf4ef Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:36 -0700 Subject: [PATCH 0261/1052] selftests/bpf: Fix missing BUILD_BUG_ON() declaration Explicitly include '' to fix errors seen compiling with gcc targeting mips64el/musl-libc: user_ringbuf.c: In function 'test_user_ringbuf_loop': user_ringbuf.c:426:9: error: implicit declaration of function 'BUILD_BUG_ON' [-Werror=implicit-function-declaration] 426 | BUILD_BUG_ON(total_samples <= c_max_entries); | ^~~~~~~~~~~~ cc1: all warnings being treated as errors Fixes: e5a9df51c746 ("selftests/bpf: Add selftests validating the user ringbuf") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b28575f9221ec54871c46a2e87612bb4bbf46ccd.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/user_ringbuf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index e51721df14fc..dfff6feac12c 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -4,6 +4,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include From 21f0b0af977203220ad58aff95e372151288ec47 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:37 -0700 Subject: [PATCH 0262/1052] selftests/bpf: Fix include of Update ns_current_pid_tgid.c to use '#include ' and avoid compile error against mips64el/musl libc: In file included from .../prog_tests/ns_current_pid_tgid.c:14: .../include/sys/fcntl.h:1:2: error: #warning redirecting incorrect #include to [-Werror=cpp] 1 | #warning redirecting incorrect #include to | ^~~~~~~ cc1: all warnings being treated as errors Fixes: 09c02d553c49 ("bpf, selftests: Fold test_current_pid_tgid_new_ns into test_progs.") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8bdc869749177b575025bf69600a4ce591822609.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index e72d75d6baa7..c29787e092d6 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include "network_helpers.h" #define STACK_SIZE (1024 * 1024) From 4c329b99ef9c118343379bde9f97e8ce5cac9fc9 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:38 -0700 
Subject: [PATCH 0263/1052] selftests/bpf: Fix compiling parse_tcp_hdr_opt.c with musl-libc The GNU version of 'struct tcphdr', with members 'doff' and 'urg_ptr', is not exposed by musl headers unless _GNU_SOURCE is defined. Add this definition to fix errors seen compiling for mips64el/musl-libc: parse_tcp_hdr_opt.c:18:21: error: 'struct tcphdr' has no member named 'urg_ptr' 18 | .pk6_v6.tcp.urg_ptr = 123, | ^~~~~~~ parse_tcp_hdr_opt.c:19:21: error: 'struct tcphdr' has no member named 'doff' 19 | .pk6_v6.tcp.doff = 9, /* 16 bytes of options */ | ^~~~ Fixes: cfa7b011894d ("selftests/bpf: tests for using dynptrs to parse skb and xdp buffers") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/ac5440213c242c62cb4e0d9e0a9cd5058b6a31f6.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c b/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c index daa952711d8f..e9c07d561ded 100644 --- a/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c +++ b/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include #include "test_parse_tcp_hdr_opt.skel.h" From bae9a5ce7d3a9b3a9e07b31ab9e9c58450e3e9fd Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:39 -0700 Subject: [PATCH 0264/1052] selftests/bpf: Fix compiling kfree_skb.c with musl-libc The GNU version of 'struct tcphdr' with member 'doff' is not exposed by musl headers unless _GNU_SOURCE is defined. Add this definition to fix errors seen compiling for mips64el/musl-libc: In file included from kfree_skb.c:2: kfree_skb.c: In function 'on_sample': kfree_skb.c:45:30: error: 'struct tcphdr' has no member named 'doff' 45 | if (CHECK(pkt_v6->tcp.doff != 5, "check_tcp", | ^ Fixes: 580d656d80cf ("selftests/bpf: Add kfree_skb raw_tp test") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e2d8cedc790959c10d6822a51f01a7a3616bea1b.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/kfree_skb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index c07991544a78..34f8822fd221 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include #include "kfree_skb.skel.h" From 5e4c43bcb85973243d7274e0058b6e8f5810e4f7 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:40 -0700 Subject: [PATCH 0265/1052] selftests/bpf: Fix compiling flow_dissector.c with musl-libc The GNU version of 'struct tcphdr' has members 'doff', 'source' and 'dest', which are not exposed by musl libc headers unless _GNU_SOURCE is defined. 
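As a minimal standalone illustration (not part of this series), the same visibility rule can be reproduced with a musl toolchain; the GNU-style field names only become available once _GNU_SOURCE is defined before the first libc include:

  /* sketch: compiles against musl only with the first line present */
  #define _GNU_SOURCE
  #include <netinet/tcp.h>
  #include <arpa/inet.h>
  #include <stdio.h>

  int main(void)
  {
          struct tcphdr tcp = {0};

          tcp.doff = 5;               /* GNU-style names: doff, source, dest */
          tcp.source = htons(80);
          tcp.dest = htons(8080);
          printf("doff=%d\n", tcp.doff);
          return 0;
  }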
Add this definition to fix errors seen compiling for mips64el/musl-libc: flow_dissector.c:118:30: error: 'struct tcphdr' has no member named 'doff' 118 | .tcp.doff = 5, | ^~~~ flow_dissector.c:119:30: error: 'struct tcphdr' has no member named 'source' 119 | .tcp.source = 80, | ^~~~~~ flow_dissector.c:120:30: error: 'struct tcphdr' has no member named 'dest' 120 | .tcp.dest = 8080, | ^~~~ Fixes: ae173a915785 ("selftests/bpf: support BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8f7ab21a73f678f9cebd32b26c444a686e57414d.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 9625e6d21791..3171047414a7 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include #include From 18826fb0b79c3c3cd1fe765d85f9c6f1a902c722 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:41 -0700 Subject: [PATCH 0266/1052] selftests/bpf: Fix compiling tcp_rtt.c with musl-libc The GNU version of 'struct tcp_info' in 'netinet/tcp.h' is not exposed by musl headers unless _GNU_SOURCE is defined. Add this definition to fix errors seen compiling for mips64el/musl-libc: tcp_rtt.c: In function 'wait_for_ack': tcp_rtt.c:24:25: error: storage size of 'info' isn't known 24 | struct tcp_info info; | ^~~~ tcp_rtt.c:24:25: error: unused variable 'info' [-Werror=unused-variable] cc1: all warnings being treated as errors Fixes: 1f4f80fed217 ("selftests/bpf: test_progs: convert test_tcp_rtt") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/f2329767b15df206f08a5776d35a47c37da855ae.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/tcp_rtt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index f2b99d95d916..c38784c1c066 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include "cgroup_helpers.h" #include "network_helpers.h" From debfa4f628f271f72933bf38d581cc53cfe1def5 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:42 -0700 Subject: [PATCH 0267/1052] selftests/bpf: Fix compiling core_reloc.c with musl-libc The type 'loff_t' is a GNU extension and not exposed by the musl 'fcntl.h' header unless _GNU_SOURCE is defined. 
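A two-line sketch (not from the series) shows the same dependency in isolation; with musl the declaration below only compiles when the feature-test macro comes first:

  #define _GNU_SOURCE        /* required for loff_t with musl */
  #include <fcntl.h>

  loff_t example_offset;     /* illustrative variable, visible only with _GNU_SOURCE */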
Add this definition to fix errors seen compiling for mips64el/musl-libc: In file included from tools/testing/selftests/bpf/prog_tests/core_reloc.c:4: ./bpf_testmod/bpf_testmod.h:10:9: error: unknown type name 'loff_t' 10 | loff_t off; | ^~~~~~ ./bpf_testmod/bpf_testmod.h:16:9: error: unknown type name 'loff_t' 16 | loff_t off; | ^~~~~~ Fixes: 6bcd39d366b6 ("selftests/bpf: Add CO-RE relocs selftest relying on kernel module BTF") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/11c3af75a7eb6bcb7ad9acfae6a6f470c572eb82.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 47f42e680105..26019313e1fc 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include "progs/core_reloc_types.h" #include "bpf_testmod/bpf_testmod.h" From 27c4797ce51c8dd51e35e68e9024a892f62d78b2 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:43 -0700 Subject: [PATCH 0268/1052] selftests/bpf: Fix errors compiling lwt_redirect.c with musl libc Remove a redundant include of '' which is already provided in 'lwt_helpers.h'. This avoids errors seen compiling for mips64el/musl-libc: In file included from .../arpa/inet.h:9, from lwt_redirect.c:51: .../netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from .../linux/icmp.h:24, from lwt_redirect.c:50: .../linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ Fixes: 43a7c3ef8a15 ("selftests/bpf: Add lwt_xmit tests for BPF_REDIRECT") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/3869dda876d5206d2f8d4dd67331c739ceb0c7f8.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/lwt_redirect.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c index 835a1d756c16..b6e8d822e8e9 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include From 1b00f355130a5dfc38a01ad02458ae2cb2ebe609 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:44 -0700 Subject: [PATCH 0269/1052] selftests/bpf: Fix errors compiling decap_sanity.c with musl libc Remove a redundant include of '', whose needed definitions are already provided by 'test_progs.h'. 
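The underlying clash can be reproduced outside the selftests with two includes (a sketch, not part of the patch): glibc suppresses the duplicate definitions through its uapi coordination macros, while musl does not, so both copies of the IPv6 structures become visible:

  /* sketch: redefinition errors with musl, accepted with glibc */
  #include <linux/in6.h>     /* kernel uapi definition of struct in6_addr  */
  #include <arpa/inet.h>     /* pulls in <netinet/in.h>: second definition */

Keeping only the libc-side include, here provided indirectly through test_progs.h, sidesteps the conflict.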
This avoids errors seen compiling for mips64el/musl-libc: In file included from .../arpa/inet.h:9, from ./test_progs.h:17, from prog_tests/decap_sanity.c:9: .../netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from decap_sanity.c:7: .../linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ Fixes: 70a00e2f1dba ("selftests/bpf: Test bpf_skb_adjust_room on CHECKSUM_PARTIAL") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e986ba2d7edccd254b54f7cd049b98f10bafa8c3.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/decap_sanity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c index dcb9e5070cc3..d79f398ec6b7 100644 --- a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c +++ b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c @@ -4,7 +4,6 @@ #include #include #include -#include #include "test_progs.h" #include "network_helpers.h" From 9822be702fe6e1c3e0933ef4b68a8c56683d930d Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:45 -0700 Subject: [PATCH 0270/1052] selftests/bpf: Fix errors compiling crypto_sanity.c with musl libc Remove a redundant include of '', whose needed definitions are already provided by 'test_progs.h'. 
This avoids errors seen compiling for mips64el/musl-libc: In file included from .../arpa/inet.h:9, from ./test_progs.h:17, from prog_tests/crypto_sanity.c:10: .../netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from crypto_sanity.c:7: .../linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ Fixes: 91541ab192fc ("selftests: bpf: crypto skcipher algo selftests") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/bpf/911293968f424ad7b462d8805aeb3baee8f4985b.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/crypto_sanity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c index b1a3a49a822a..42bd07f7218d 100644 --- a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c +++ b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include "test_progs.h" From 730561d3c08d4a327cceaabf11365958a1c00cec Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:46 -0700 Subject: [PATCH 0271/1052] selftests/bpf: Fix errors compiling cg_storage_multi.h with musl libc Remove a redundant include of '', whose needed definitions are already included (via '') in cg_storage_multi_egress_only.c, cg_storage_multi_isolated.c, and cg_storage_multi_shared.c. 
This avoids redefinition errors seen compiling for mips64el/musl-libc like: In file included from progs/cg_storage_multi_egress_only.c:13: In file included from progs/cg_storage_multi.h:6: In file included from /usr/mips64el-linux-gnuabi64/include/asm/types.h:23: /usr/include/asm-generic/int-l64.h:29:25: error: typedef redefinition with different types ('long' vs 'long long') 29 | typedef __signed__ long __s64; | ^ /usr/include/asm-generic/int-ll64.h:30:44: note: previous definition is here 30 | __extension__ typedef __signed__ long long __s64; | ^ Fixes: 9e5bd1f7633b ("selftests/bpf: Test CGROUP_STORAGE map can't be used by multiple progs") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/4f4702e9f6115b7f84fea01b2326ca24c6df7ba8.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/progs/cg_storage_multi.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi.h b/tools/testing/selftests/bpf/progs/cg_storage_multi.h index a0778fe7857a..41d59f0ee606 100644 --- a/tools/testing/selftests/bpf/progs/cg_storage_multi.h +++ b/tools/testing/selftests/bpf/progs/cg_storage_multi.h @@ -3,8 +3,6 @@ #ifndef __PROGS_CG_STORAGE_MULTI_H #define __PROGS_CG_STORAGE_MULTI_H -#include - struct cgroup_value { __u32 egress_pkts; __u32 ingress_pkts; From 04a94133f1b3cccb19e056c26f056c50b4e5b3b1 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 24 Jul 2024 12:14:58 -0500 Subject: [PATCH 0272/1052] libbpf: Don't take direct pointers into BTF data from st_ops In struct bpf_struct_ops, we have take a pointer to a BTF type name, and a struct btf_type. This was presumably done for convenience, but can actually result in subtle and confusing bugs given that BTF data can be invalidated before a program is loaded. For example, in sched_ext, we may sometimes resize a data section after a skeleton has been opened, but before the struct_ops scheduler map has been loaded. This may cause the BTF data to be realloc'd, which can then cause a UAF when loading the program because the struct_ops map has pointers directly into the BTF data. We're already storing the BTF type_id in struct bpf_struct_ops. Because type_id is stable, we can therefore just update the places where we were looking at those pointers to instead do the lookups we need from the type_id. Fixes: 590a00888250 ("bpf: libbpf: Add STRUCT_OPS support") Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240724171459.281234-1-void@manifault.com --- tools/lib/bpf/libbpf.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a3be6f8fac09..e55353887439 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -496,8 +496,6 @@ struct bpf_program { }; struct bpf_struct_ops { - const char *tname; - const struct btf_type *type; struct bpf_program **progs; __u32 *kern_func_off; /* e.g. 
struct tcp_congestion_ops in bpf_prog's btf format */ @@ -1083,11 +1081,14 @@ static int bpf_object_adjust_struct_ops_autoload(struct bpf_object *obj) continue; for (j = 0; j < obj->nr_maps; ++j) { + const struct btf_type *type; + map = &obj->maps[j]; if (!bpf_map__is_struct_ops(map)) continue; - vlen = btf_vlen(map->st_ops->type); + type = btf__type_by_id(obj->btf, map->st_ops->type_id); + vlen = btf_vlen(type); for (k = 0; k < vlen; ++k) { slot_prog = map->st_ops->progs[k]; if (prog != slot_prog) @@ -1121,8 +1122,8 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) int err; st_ops = map->st_ops; - type = st_ops->type; - tname = st_ops->tname; + type = btf__type_by_id(btf, st_ops->type_id); + tname = btf__name_by_offset(btf, type->name_off); err = find_struct_ops_kern_types(obj, tname, &mod_btf, &kern_type, &kern_type_id, &kern_vtype, &kern_vtype_id, @@ -1423,8 +1424,6 @@ static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name, memcpy(st_ops->data, data->d_buf + vsi->offset, type->size); - st_ops->tname = tname; - st_ops->type = type; st_ops->type_id = type_id; pr_debug("struct_ops init: struct %s(type_id=%u) %s found at offset %u\n", @@ -8445,11 +8444,13 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, static void bpf_map_prepare_vdata(const struct bpf_map *map) { + const struct btf_type *type; struct bpf_struct_ops *st_ops; __u32 i; st_ops = map->st_ops; - for (i = 0; i < btf_vlen(st_ops->type); i++) { + type = btf__type_by_id(map->obj->btf, st_ops->type_id); + for (i = 0; i < btf_vlen(type); i++) { struct bpf_program *prog = st_ops->progs[i]; void *kern_data; int prog_fd; @@ -9712,6 +9713,7 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Data *data) { + const struct btf_type *type; const struct btf_member *member; struct bpf_struct_ops *st_ops; struct bpf_program *prog; @@ -9771,13 +9773,14 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, } insn_idx = sym->st_value / BPF_INSN_SZ; - member = find_member_by_offset(st_ops->type, moff * 8); + type = btf__type_by_id(btf, st_ops->type_id); + member = find_member_by_offset(type, moff * 8); if (!member) { pr_warn("struct_ops reloc %s: cannot find member at moff %u\n", map->name, moff); return -EINVAL; } - member_idx = member - btf_members(st_ops->type); + member_idx = member - btf_members(type); name = btf__name_by_offset(btf, member->name_off); if (!resolve_func_ptr(btf, member->type, NULL)) { From 0bfdda9db88990539b63a11028c84c4c069e8a0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Sun, 21 Jul 2024 21:33:03 +0200 Subject: [PATCH 0273/1052] selftests/bpf: Update xdp_redirect_map prog sections for libbpf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xdp_redirect_map.c is a bpf program used by test_xdp_veth.sh, which is not handled by the generic test runner (test_progs). To allow converting this test to test_progs, the corresponding program must be updated to allow handling it through skeletons generated by bpftool and libbpf. Update programs section names to allow to manipulate those with libbpf. 
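For context, once the programs live in a recognized "xdp" section, libbpf can infer the program type and the generated skeleton exposes each program by its C symbol, so a test can attach them directly (a sketch assuming an already resolved ifindex, not code from this patch):

  struct xdp_redirect_map *skel = xdp_redirect_map__open_and_load();
  struct bpf_link *link;

  if (!skel)
          return;
  /* attach by program symbol instead of a custom section name */
  link = bpf_program__attach_xdp(skel->progs.xdp_redirect_map_0, ifindex);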
Signed-off-by: Alexis Lothoré (eBPF Foundation) Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240721-convert_test_xdp_veth-v4-1-23bdba21b2f9@bootlin.com Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/xdp_redirect_map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c index d037262c8937..682dda8dabbc 100644 --- a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c +++ b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c @@ -10,19 +10,19 @@ struct { __uint(value_size, sizeof(int)); } tx_port SEC(".maps"); -SEC("redirect_map_0") +SEC("xdp") int xdp_redirect_map_0(struct xdp_md *xdp) { return bpf_redirect_map(&tx_port, 0, 0); } -SEC("redirect_map_1") +SEC("xdp") int xdp_redirect_map_1(struct xdp_md *xdp) { return bpf_redirect_map(&tx_port, 1, 0); } -SEC("redirect_map_2") +SEC("xdp") int xdp_redirect_map_2(struct xdp_md *xdp) { return bpf_redirect_map(&tx_port, 2, 0); From 41b01a0271fd0387171eb9ad4692c22e37c8c80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Sun, 21 Jul 2024 21:33:04 +0200 Subject: [PATCH 0274/1052] selftests/bpf: Integrate test_xdp_veth into test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_xdp_veth.sh tests that XDP return codes work as expected, by bringing up multiple veth pairs isolated in different namespaces, attaching specific xdp programs to each interface, and ensuring that the whole chain allows to ping one end interface from the first one. The test runs well but is currently not integrated in test_progs, which prevents it from being run automatically in the CI infrastructure. Rewrite it as a C test relying on libbpf to allow running it in the CI infrastructure. The new code brings up the same network infrastructure and reuses the same eBPF programs as test_xdp_veth.sh, for which skeletons are already generated by the bpf tests makefile. 
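As background (not part of the patch itself), test_progs discovers tests by scanning prog_tests/*.c for suitably named global functions, so the conversion boils down to exposing one entry point and doing the setup in C:

  /* sketch: picked up automatically by the test_progs runner */
  void test_xdp_veth_redirect(void)
  {
          /* create the namespaces and veth pairs, attach the XDP programs,
           * then ping across the chain and assert on the result */
  }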
Signed-off-by: Alexis Lothoré (eBPF Foundation) Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240721-convert_test_xdp_veth-v4-2-23bdba21b2f9@bootlin.com Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/Makefile | 1 - .../selftests/bpf/prog_tests/test_xdp_veth.c | 213 ++++++++++++++++++ tools/testing/selftests/bpf/test_xdp_veth.sh | 121 ---------- 3 files changed, 213 insertions(+), 122 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c delete mode 100755 tools/testing/selftests/bpf/test_xdp_veth.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 083f413a9c98..774c6270e377 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -115,7 +115,6 @@ TEST_PROGS := test_kmod.sh \ test_xdp_redirect.sh \ test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ - test_xdp_veth.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c new file mode 100644 index 000000000000..8d75424fe6bc --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Create 3 namespaces with 3 veth peers, and forward packets in-between using + * native XDP + * + * XDP_TX + * NS1(veth11) NS2(veth22) NS3(veth33) + * | | | + * | | | + * (veth1, (veth2, (veth3, + * id:111) id:122) id:133) + * ^ | ^ | ^ | + * | | XDP_REDIRECT | | XDP_REDIRECT | | + * | ------------------ ------------------ | + * ----------------------------------------- + * XDP_REDIRECT + */ + +#define _GNU_SOURCE +#include +#include "test_progs.h" +#include "network_helpers.h" +#include "xdp_dummy.skel.h" +#include "xdp_redirect_map.skel.h" +#include "xdp_tx.skel.h" + +#define VETH_PAIRS_COUNT 3 +#define NS_SUFFIX_LEN 6 +#define VETH_NAME_MAX_LEN 16 +#define IP_SRC "10.1.1.11" +#define IP_DST "10.1.1.33" +#define IP_CMD_MAX_LEN 128 + +struct skeletons { + struct xdp_dummy *xdp_dummy; + struct xdp_tx *xdp_tx; + struct xdp_redirect_map *xdp_redirect_maps; +}; + +struct veth_configuration { + char local_veth[VETH_NAME_MAX_LEN]; /* Interface in main namespace */ + char remote_veth[VETH_NAME_MAX_LEN]; /* Peer interface in dedicated namespace*/ + const char *namespace; /* Namespace for the remote veth */ + char next_veth[VETH_NAME_MAX_LEN]; /* Local interface to redirect traffic to */ + char *remote_addr; /* IP address of the remote veth */ +}; + +static struct veth_configuration config[VETH_PAIRS_COUNT] = { + { + .local_veth = "veth1", + .remote_veth = "veth11", + .next_veth = "veth2", + .remote_addr = IP_SRC, + .namespace = "ns-veth11" + }, + { + .local_veth = "veth2", + .remote_veth = "veth22", + .next_veth = "veth3", + .remote_addr = NULL, + .namespace = "ns-veth22" + }, + { + .local_veth = "veth3", + .remote_veth = "veth33", + .next_veth = "veth1", + .remote_addr = IP_DST, + .namespace = "ns-veth33" + } +}; + +static int attach_programs_to_veth_pair(struct skeletons *skeletons, int index) +{ + struct bpf_program *local_prog, *remote_prog; + struct bpf_link **local_link, **remote_link; + struct nstoken *nstoken; + struct bpf_link *link; + int interface; + + switch (index) { + case 0: + local_prog = skeletons->xdp_redirect_maps->progs.xdp_redirect_map_0; + local_link = &skeletons->xdp_redirect_maps->links.xdp_redirect_map_0; + remote_prog = 
skeletons->xdp_dummy->progs.xdp_dummy_prog; + remote_link = &skeletons->xdp_dummy->links.xdp_dummy_prog; + break; + case 1: + local_prog = skeletons->xdp_redirect_maps->progs.xdp_redirect_map_1; + local_link = &skeletons->xdp_redirect_maps->links.xdp_redirect_map_1; + remote_prog = skeletons->xdp_tx->progs.xdp_tx; + remote_link = &skeletons->xdp_tx->links.xdp_tx; + break; + case 2: + local_prog = skeletons->xdp_redirect_maps->progs.xdp_redirect_map_2; + local_link = &skeletons->xdp_redirect_maps->links.xdp_redirect_map_2; + remote_prog = skeletons->xdp_dummy->progs.xdp_dummy_prog; + remote_link = &skeletons->xdp_dummy->links.xdp_dummy_prog; + break; + } + interface = if_nametoindex(config[index].local_veth); + if (!ASSERT_NEQ(interface, 0, "non zero interface index")) + return -1; + link = bpf_program__attach_xdp(local_prog, interface); + if (!ASSERT_OK_PTR(link, "attach xdp program to local veth")) + return -1; + *local_link = link; + nstoken = open_netns(config[index].namespace); + if (!ASSERT_OK_PTR(nstoken, "switch to remote veth namespace")) + return -1; + interface = if_nametoindex(config[index].remote_veth); + if (!ASSERT_NEQ(interface, 0, "non zero interface index")) { + close_netns(nstoken); + return -1; + } + link = bpf_program__attach_xdp(remote_prog, interface); + *remote_link = link; + close_netns(nstoken); + if (!ASSERT_OK_PTR(link, "attach xdp program to remote veth")) + return -1; + + return 0; +} + +static int configure_network(struct skeletons *skeletons) +{ + int interface_id; + int map_fd; + int err; + int i = 0; + + /* First create and configure all interfaces */ + for (i = 0; i < VETH_PAIRS_COUNT; i++) { + SYS(fail, "ip netns add %s", config[i].namespace); + SYS(fail, "ip link add %s type veth peer name %s netns %s", + config[i].local_veth, config[i].remote_veth, config[i].namespace); + SYS(fail, "ip link set dev %s up", config[i].local_veth); + if (config[i].remote_addr) + SYS(fail, "ip -n %s addr add %s/24 dev %s", config[i].namespace, + config[i].remote_addr, config[i].remote_veth); + SYS(fail, "ip -n %s link set dev %s up", config[i].namespace, + config[i].remote_veth); + } + + /* Then configure the redirect map and attach programs to interfaces */ + map_fd = bpf_map__fd(skeletons->xdp_redirect_maps->maps.tx_port); + if (!ASSERT_GE(map_fd, 0, "open redirect map")) + goto fail; + for (i = 0; i < VETH_PAIRS_COUNT; i++) { + interface_id = if_nametoindex(config[i].next_veth); + if (!ASSERT_NEQ(interface_id, 0, "non zero interface index")) + goto fail; + err = bpf_map_update_elem(map_fd, &i, &interface_id, BPF_ANY); + if (!ASSERT_OK(err, "configure interface redirection through map")) + goto fail; + if (attach_programs_to_veth_pair(skeletons, i)) + goto fail; + } + + return 0; + +fail: + return -1; +} + +static void cleanup_network(void) +{ + int i; + + /* Deleting namespaces is enough to automatically remove veth pairs as well + */ + for (i = 0; i < VETH_PAIRS_COUNT; i++) + SYS_NOFAIL("ip netns del %s", config[i].namespace); +} + +static int check_ping(struct skeletons *skeletons) +{ + /* Test: if all interfaces are properly configured, we must be able to ping + * veth33 from veth11 + */ + return SYS_NOFAIL("ip netns exec %s ping -c 1 -W 1 %s > /dev/null", + config[0].namespace, IP_DST); +} + +void test_xdp_veth_redirect(void) +{ + struct skeletons skeletons = {}; + + skeletons.xdp_dummy = xdp_dummy__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_dummy, "xdp_dummy__open_and_load")) + return; + + skeletons.xdp_tx = xdp_tx__open_and_load(); + if 
(!ASSERT_OK_PTR(skeletons.xdp_tx, "xdp_tx__open_and_load")) + goto destroy_xdp_dummy; + + skeletons.xdp_redirect_maps = xdp_redirect_map__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_redirect_maps, "xdp_redirect_map__open_and_load")) + goto destroy_xdp_tx; + + if (configure_network(&skeletons)) + goto destroy_xdp_redirect_map; + + ASSERT_OK(check_ping(&skeletons), "ping"); + +destroy_xdp_redirect_map: + xdp_redirect_map__destroy(skeletons.xdp_redirect_maps); +destroy_xdp_tx: + xdp_tx__destroy(skeletons.xdp_tx); +destroy_xdp_dummy: + xdp_dummy__destroy(skeletons.xdp_dummy); + + cleanup_network(); +} diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh deleted file mode 100755 index 5211ca9a0239..000000000000 --- a/tools/testing/selftests/bpf/test_xdp_veth.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Create 3 namespaces with 3 veth peers, and -# forward packets in-between using native XDP -# -# XDP_TX -# NS1(veth11) NS2(veth22) NS3(veth33) -# | | | -# | | | -# (veth1, (veth2, (veth3, -# id:111) id:122) id:133) -# ^ | ^ | ^ | -# | | XDP_REDIRECT | | XDP_REDIRECT | | -# | ------------------ ------------------ | -# ----------------------------------------- -# XDP_REDIRECT - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -TESTNAME=xdp_veth -BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_DIR=$BPF_FS/test_$TESTNAME -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" -readonly NS3="ns3-$(mktemp -u XXXXXX)" - -_cleanup() -{ - set +e - ip link del veth1 2> /dev/null - ip link del veth2 2> /dev/null - ip link del veth3 2> /dev/null - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null - ip netns del ${NS3} 2> /dev/null - rm -rf $BPF_DIR 2> /dev/null -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -if [ $(id -u) -ne 0 ]; then - echo "selftests: $TESTNAME [SKIP] Need root privileges" - exit $ksft_skip -fi - -if ! ip link set dev lo xdp off > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without the ip xdp support" - exit $ksft_skip -fi - -if [ -z "$BPF_FS" ]; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted" - exit $ksft_skip -fi - -if ! 
bpftool version > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool" - exit $ksft_skip -fi - -set -e - -trap cleanup_skip EXIT - -ip netns add ${NS1} -ip netns add ${NS2} -ip netns add ${NS3} - -ip link add veth1 index 111 type veth peer name veth11 netns ${NS1} -ip link add veth2 index 122 type veth peer name veth22 netns ${NS2} -ip link add veth3 index 133 type veth peer name veth33 netns ${NS3} - -ip link set veth1 up -ip link set veth2 up -ip link set veth3 up - -ip -n ${NS1} addr add 10.1.1.11/24 dev veth11 -ip -n ${NS3} addr add 10.1.1.33/24 dev veth33 - -ip -n ${NS1} link set dev veth11 up -ip -n ${NS2} link set dev veth22 up -ip -n ${NS3} link set dev veth33 up - -mkdir $BPF_DIR -bpftool prog loadall \ - xdp_redirect_map.bpf.o $BPF_DIR/progs type xdp \ - pinmaps $BPF_DIR/maps -bpftool map update pinned $BPF_DIR/maps/tx_port key 0 0 0 0 value 122 0 0 0 -bpftool map update pinned $BPF_DIR/maps/tx_port key 1 0 0 0 value 133 0 0 0 -bpftool map update pinned $BPF_DIR/maps/tx_port key 2 0 0 0 value 111 0 0 0 -ip link set dev veth1 xdp pinned $BPF_DIR/progs/xdp_redirect_map_0 -ip link set dev veth2 xdp pinned $BPF_DIR/progs/xdp_redirect_map_1 -ip link set dev veth3 xdp pinned $BPF_DIR/progs/xdp_redirect_map_2 - -ip -n ${NS1} link set dev veth11 xdp obj xdp_dummy.bpf.o sec xdp -ip -n ${NS2} link set dev veth22 xdp obj xdp_tx.bpf.o sec xdp -ip -n ${NS3} link set dev veth33 xdp obj xdp_dummy.bpf.o sec xdp - -trap cleanup EXIT - -ip netns exec ${NS1} ping -c 1 -W 1 10.1.1.33 - -exit 0 From ba71ffb660e4d41fe589f2459fb888ea61fdb310 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 24 Jul 2024 22:22:14 -0500 Subject: [PATCH 0275/1052] selftests/bpf: Load struct_ops map in global_maps_resize test In prog_tests/test_global_maps_resize.c, we test various use cases for resizing global maps. Commit 7244100e0389 ("libbpf: Don't take direct pointers into BTF data from st_ops") updated libbpf to not store pointers to volatile BTF data, which for some users, was causing a UAF when resizing a datasec array. Let's ensure we have coverage for resizing datasec arrays with struct_ops progs by also including a struct_ops map and struct_ops prog in the test_global_map_resize skeleton. The map is automatically loaded, so we don't need to do anything other than add it to the BPF prog being tested to get the coverage. 
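For reference, the user-space side of such a resize looks roughly like the sketch below; the skeleton and map names are illustrative rather than taken verbatim from the test:

  struct test_global_map_resize *skel = test_global_map_resize__open();
  size_t new_sz = sizeof(int) * 1024;   /* illustrative new array size */

  /* growing a global datasec reallocates the object's BTF data, which is
   * what previously left struct_ops maps holding stale pointers */
  bpf_map__set_value_size(skel->maps.data_custom, new_sz);
  test_global_map_resize__load(skel);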
Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240725032214.50676-1-void@manifault.com --- .../selftests/bpf/progs/test_global_map_resize.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_global_map_resize.c b/tools/testing/selftests/bpf/progs/test_global_map_resize.c index 1fbb73d3e5d5..714b29c7f8b2 100644 --- a/tools/testing/selftests/bpf/progs/test_global_map_resize.c +++ b/tools/testing/selftests/bpf/progs/test_global_map_resize.c @@ -3,6 +3,7 @@ #include "vmlinux.h" #include +#include char _license[] SEC("license") = "GPL"; @@ -60,3 +61,18 @@ int data_array_sum(void *ctx) return 0; } + +SEC("struct_ops/test_1") +int BPF_PROG(test_1) +{ + return 0; +} + +struct bpf_testmod_ops { + int (*test_1)(void); +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops st_ops_resize = { + .test_1 = (void *)test_1 +}; From 94ede2a3e9135764736221c080ac7c0ad993dc2d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 16:34:17 -0700 Subject: [PATCH 0276/1052] profiling: remove stale percpu flip buffer variables For some reason I didn't see this issue on my arm64 or x86-64 builds, but Stephen Rothwell reports that commit 2accfdb7eff6 ("profiling: attempt to remove per-cpu profile flip buffer") left these static variables around, and the powerpc build is unhappy about them: kernel/profile.c:52:28: warning: 'cpu_profile_flip' defined but not used [-Wunused-variable] 52 | static DEFINE_PER_CPU(int, cpu_profile_flip); | ^~~~~~~~~~~~~~~~ .. So remove these stale left-over remnants too. Fixes: 2accfdb7eff6 ("profiling: attempt to remove per-cpu profile flip buffer") Reported-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- kernel/profile.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/profile.c b/kernel/profile.c index 2dfb0f4e755c..ff68d3816182 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -47,12 +47,6 @@ static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); -static DEFINE_PER_CPU(int, cpu_profile_flip); -static DEFINE_MUTEX(profile_flip_mutex); -#endif /* CONFIG_SMP */ - int profile_setup(char *str) { static const char schedstr[] = "schedule"; From d7b5f7537c8282e1e1919408d0b6c69877fd35f8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 11:53:50 +0200 Subject: [PATCH 0277/1052] media: ipu-bridge: fix ipu6 Kconfig dependencies Commit 4670c8c3fb04 ("media: ipu-bridge: Fix Kconfig dependencies") changed how IPU_BRIDGE dependencies are handled for all drivers, but the IPU6 variant was added the old way, which causes build time warnings when I2C is turned off: WARNING: unmet direct dependencies detected for IPU_BRIDGE Depends on [n]: MEDIA_SUPPORT [=m] && PCI [=y] && MEDIA_PCI_SUPPORT [=y] && (ACPI [=y] || COMPILE_TEST [=y]) && I2C [=n] Selected by [m]: - VIDEO_INTEL_IPU6 [=m] && MEDIA_SUPPORT [=m] && PCI [=y] && MEDIA_PCI_SUPPORT [=y] && (ACPI [=y] || COMPILE_TEST [=y]) && VIDEO_DEV [=m] && X86 [=y] && X86_64 [=y] && HAS_DMA [=y] To make it consistent with the other IPU drivers as well as avoid this warning, change the 'select' into 'depends on'. Fixes: c70281cc83d6 ("media: intel/ipu6: add Kconfig and Makefile") Signed-off-by: Arnd Bergmann [Sakari Ailus: Alternatively depend on !IPU_BRIDGE.] 
Cc: stable@vger.kernel.org # for v6.10 Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ipu6/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/pci/intel/ipu6/Kconfig b/drivers/media/pci/intel/ipu6/Kconfig index 154343080c82..b7ab24b89836 100644 --- a/drivers/media/pci/intel/ipu6/Kconfig +++ b/drivers/media/pci/intel/ipu6/Kconfig @@ -3,13 +3,13 @@ config VIDEO_INTEL_IPU6 depends on ACPI || COMPILE_TEST depends on VIDEO_DEV depends on X86 && X86_64 && HAS_DMA + depends on IPU_BRIDGE || !IPU_BRIDGE select DMA_OPS select IOMMU_IOVA select VIDEO_V4L2_SUBDEV_API select MEDIA_CONTROLLER select VIDEOBUF2_DMA_CONTIG select V4L2_FWNODE - select IPU_BRIDGE help This is the 6th Gen Intel Image Processing Unit, found in Intel SoCs and used for capturing images and video from camera sensors. From 423a77ae3a3f916809ff3ab1c8db6d3d580c3120 Mon Sep 17 00:00:00 2001 From: Bingbu Cao Date: Wed, 17 Jul 2024 15:40:50 +0800 Subject: [PATCH 0278/1052] media: intel/ipu6: select AUXILIARY_BUS in Kconfig Intel IPU6 PCI driver need register its devices on auxiliary bus, so it needs to select the AUXILIARY_BUS in Kconfig. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407161833.7BEFXejx-lkp@intel.com/ Fixes: c70281cc83d6 ("media: intel/ipu6: add Kconfig and Makefile") Signed-off-by: Bingbu Cao Cc: stable@vger.kernel.org # for v6.10 Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ipu6/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/pci/intel/ipu6/Kconfig b/drivers/media/pci/intel/ipu6/Kconfig index b7ab24b89836..40e20f0aa5ae 100644 --- a/drivers/media/pci/intel/ipu6/Kconfig +++ b/drivers/media/pci/intel/ipu6/Kconfig @@ -4,6 +4,7 @@ config VIDEO_INTEL_IPU6 depends on VIDEO_DEV depends on X86 && X86_64 && HAS_DMA depends on IPU_BRIDGE || !IPU_BRIDGE + select AUXILIARY_BUS select DMA_OPS select IOMMU_IOVA select VIDEO_V4L2_SUBDEV_API From 914f8961879de6fadd166ebd75151a778481e09a Mon Sep 17 00:00:00 2001 From: Jean-Michel Hautbois Date: Sat, 8 Jun 2024 18:41:27 +0200 Subject: [PATCH 0279/1052] media: v4l: Fix missing tabular column hint for Y14P format The original patch added two columns in the flat-table of Luma-Only Image Formats, without updating hints to latex: above it. This results in wrong column count in the output of Sphinx's latex builder. Fix it. Reported-by: Akira Yokosawa Closes: https://lore.kernel.org/linux-media/bdbc27ba-5098-49fb-aabf-753c81361cc7@gmail.com/ Fixes: adb1d4655e53 ("media: v4l: Add V4L2-PIX-FMT-Y14P format") Cc: stable@vger.kernel.org # for v6.10 Signed-off-by: Jean-Michel Hautbois Signed-off-by: Hans Verkuil --- Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst index f02e6cf3516a..74df19be91f6 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst @@ -21,9 +21,9 @@ are often referred to as greyscale formats. .. raw:: latex - \scriptsize + \tiny -.. tabularcolumns:: |p{3.6cm}|p{3.0cm}|p{1.3cm}|p{2.6cm}|p{1.3cm}|p{1.3cm}|p{1.3cm}| +.. tabularcolumns:: |p{3.6cm}|p{2.4cm}|p{1.3cm}|p{1.3cm}|p{1.3cm}|p{1.3cm}|p{1.3cm}|p{1.3cm}|p{1.3cm}| .. 
flat-table:: Luma-Only Image Formats :header-rows: 1 From c14e4114582c20276467226387d5bae7310a849e Mon Sep 17 00:00:00 2001 From: Pavan Kumar Paluri Date: Mon, 29 Jul 2024 13:08:08 -0500 Subject: [PATCH 0280/1052] x86/sev: Fix __reserved field in sev_config sev_config currently has debug, ghcbs_initialized, and use_cas fields. However, __reserved count has not been updated. Fix this. Fixes: 34ff65901735 ("x86/sev: Use kernel provided SVSM Calling Areas") Signed-off-by: Pavan Kumar Paluri Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240729180808.366587-1-papaluri@amd.com --- arch/x86/coco/sev/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 082d61d85dfc..de1df0cb45da 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -163,7 +163,7 @@ struct sev_config { */ use_cas : 1, - __reserved : 62; + __reserved : 61; }; static struct sev_config sev_cfg __read_mostly; From 0ce91928ec62d189b5c51816e325f02587b53118 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 17 Jul 2024 16:24:16 +0200 Subject: [PATCH 0281/1052] drm/ast: astdp: Wake up during connector status detection Power up the ASTDP connector for connection status detection if the connector is not active. Keep it powered if a display is attached. This fixes a bug where the connector does not come back after disconnecting the display. The encoder's atomic_disable turns off power on the physical connector. Further HPD reads will fail, thus preventing the driver from detecting re-connected displays. For connectors that are actively used, only test the HPD flag without touching power. Fixes: f81bb0ac7872 ("drm/ast: report connection status on Display Port.") Cc: Jocelyn Falempe Cc: Thomas Zimmermann Cc: Dave Airlie Cc: dri-devel@lists.freedesktop.org Cc: # v6.6+ Signed-off-by: Thomas Zimmermann Reviewed-by: Jocelyn Falempe Link: https://patchwork.freedesktop.org/patch/msgid/20240717143319.104012-2-tzimmermann@suse.de --- drivers/gpu/drm/ast/ast_dp.c | 7 +++++++ drivers/gpu/drm/ast/ast_drv.h | 1 + drivers/gpu/drm/ast/ast_mode.c | 29 +++++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c index 1e9259416980..e6c7f0d64e99 100644 --- a/drivers/gpu/drm/ast/ast_dp.c +++ b/drivers/gpu/drm/ast/ast_dp.c @@ -158,7 +158,14 @@ void ast_dp_launch(struct drm_device *dev) ASTDP_HOST_EDID_READ_DONE); } +bool ast_dp_power_is_on(struct ast_device *ast) +{ + u8 vgacre3; + vgacre3 = ast_get_index_reg(ast, AST_IO_VGACRI, 0xe3); + + return !(vgacre3 & AST_DP_PHY_SLEEP); +} void ast_dp_power_on_off(struct drm_device *dev, bool on) { diff --git a/drivers/gpu/drm/ast/ast_drv.h b/drivers/gpu/drm/ast/ast_drv.h index ba3d86973995..47bab5596c16 100644 --- a/drivers/gpu/drm/ast/ast_drv.h +++ b/drivers/gpu/drm/ast/ast_drv.h @@ -472,6 +472,7 @@ void ast_init_3rdtx(struct drm_device *dev); bool ast_astdp_is_connected(struct ast_device *ast); int ast_astdp_read_edid(struct drm_device *dev, u8 *ediddata); void ast_dp_launch(struct drm_device *dev); +bool ast_dp_power_is_on(struct ast_device *ast); void ast_dp_power_on_off(struct drm_device *dev, bool no); void ast_dp_set_on_off(struct drm_device *dev, bool no); void ast_dp_set_mode(struct drm_crtc *crtc, struct ast_vbios_mode_info *vbios_mode); diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c index dc8f639e82fd..049ee1477c33 100644 --- a/drivers/gpu/drm/ast/ast_mode.c +++ 
b/drivers/gpu/drm/ast/ast_mode.c @@ -28,6 +28,7 @@ * Authors: Dave Airlie */ +#include #include #include @@ -1687,11 +1688,35 @@ static int ast_astdp_connector_helper_detect_ctx(struct drm_connector *connector struct drm_modeset_acquire_ctx *ctx, bool force) { + struct drm_device *dev = connector->dev; struct ast_device *ast = to_ast_device(connector->dev); + enum drm_connector_status status = connector_status_disconnected; + struct drm_connector_state *connector_state = connector->state; + bool is_active = false; + + mutex_lock(&ast->modeset_lock); + + if (connector_state && connector_state->crtc) { + struct drm_crtc_state *crtc_state = connector_state->crtc->state; + + if (crtc_state && crtc_state->active) + is_active = true; + } + + if (!is_active && !ast_dp_power_is_on(ast)) { + ast_dp_power_on_off(dev, true); + msleep(50); + } if (ast_astdp_is_connected(ast)) - return connector_status_connected; - return connector_status_disconnected; + status = connector_status_connected; + + if (!is_active && status == connector_status_disconnected) + ast_dp_power_on_off(dev, false); + + mutex_unlock(&ast->modeset_lock); + + return status; } static const struct drm_connector_helper_funcs ast_astdp_connector_helper_funcs = { From b6a66e521a2032f7fcba2af5a9bcbaeaa19b7ca3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:23 +0200 Subject: [PATCH 0282/1052] mptcp: sched: check both directions for backup The 'mptcp_subflow_context' structure has two items related to the backup flags: - 'backup': the subflow has been marked as backup by the other peer - 'request_bkup': the backup flag has been set by the host Before this patch, the scheduler was only looking at the 'backup' flag. That can make sense in some cases, but it looks like that's not what we wanted for the general use, because either the path-manager was setting both of them when sending an MP_PRIO, or the receiver was duplicating the 'backup' flag in the subflow request. Note that the use of these two flags in the path-manager are going to be fixed in the next commits, but this change here is needed not to modify the behaviour. 
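Conceptually, the check now performed by the scheduler can be summarized by an illustrative helper (not code added by this patch):

  /* sketch: a subflow is treated as backup if the peer flagged it (backup)
   * or if this host requested it (request_bkup) */
  static bool sketch_subflow_is_backup(const struct mptcp_subflow_context *subflow)
  {
          return subflow->backup || subflow->request_bkup;
  }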
Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- include/trace/events/mptcp.h | 2 +- net/mptcp/protocol.c | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h index 09e72215b9f9..085b749cdd97 100644 --- a/include/trace/events/mptcp.h +++ b/include/trace/events/mptcp.h @@ -34,7 +34,7 @@ TRACE_EVENT(mptcp_subflow_get_send, struct sock *ssk; __entry->active = mptcp_subflow_active(subflow); - __entry->backup = subflow->backup; + __entry->backup = subflow->backup || subflow->request_bkup; if (subflow->tcp_sock && sk_fullsock(subflow->tcp_sock)) __entry->free = sk_stream_memory_free(subflow->tcp_sock); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a26c2c840fd9..a2fc54ed68c0 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1422,13 +1422,15 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } mptcp_for_each_subflow(msk, subflow) { + bool backup = subflow->backup || subflow->request_bkup; + trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; tout = max(tout, mptcp_timeout_from_subflow(subflow)); - nr_active += !subflow->backup; + nr_active += !backup; pace = subflow->avg_pacing_rate; if (unlikely(!pace)) { /* init pacing rate from socket */ @@ -1439,9 +1441,9 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); - if (linger_time < send_info[subflow->backup].linger_time) { - send_info[subflow->backup].ssk = ssk; - send_info[subflow->backup].linger_time = linger_time; + if (linger_time < send_info[backup].linger_time) { + send_info[backup].ssk = ssk; + send_info[backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); From efd340bf3d7779a3a8ec954d8ec0fb8a10f24982 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:24 +0200 Subject: [PATCH 0283/1052] mptcp: distinguish rcv vs sent backup flag in requests When sending an MP_JOIN + SYN + ACK, it is possible to mark the subflow as 'backup' by setting the flag with the same name. Before this patch, the backup was set if the other peer set it in its MP_JOIN + SYN request. It is not correct: the backup flag should be set in the MPJ+SYN+ACK only if the host asks for it, and not mirroring what was done by the other peer. It is then required to have a dedicated bit for each direction, similar to what is done in the subflow context. 
Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/options.c | 2 +- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8e8dcfbc2993..8a68382a4fe9 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -909,7 +909,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, return true; } else if (subflow_req->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; - opts->backup = subflow_req->backup; + opts->backup = subflow_req->request_bkup; opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b11a4e50d52b..b8b25124e7de 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -448,6 +448,7 @@ struct mptcp_subflow_request_sock { u16 mp_capable : 1, mp_join : 1, backup : 1, + request_bkup : 1, csum_reqd : 1, allow_join_id0 : 1; u8 local_id; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 39e2cbdf3801..a3778aee4e77 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -2005,6 +2005,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->fully_established = 1; new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; + new_ctx->request_bkup = subflow_req->request_bkup; WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; From 4258b94831bb7ff28ab80e3c8d94db37db930728 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:25 +0200 Subject: [PATCH 0284/1052] mptcp: pm: only set request_bkup flag when sending MP_PRIO The 'backup' flag from mptcp_subflow_context structure is supposed to be set only when the other peer flagged a subflow as backup, not the opposite. Fixes: 067065422fcd ("mptcp: add the outgoing MP_PRIO support") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f65831de5c1a..7635fac91539 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -471,7 +471,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con slow = lock_sock_fast(ssk); if (prio) { subflow->send_mp_prio = 1; - subflow->backup = backup; subflow->request_bkup = backup; } From 4dde0d72ccec500c60c798e036b852e013d6e124 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:26 +0200 Subject: [PATCH 0285/1052] mptcp: mib: count MPJ with backup flag Without such counters, it is difficult to easily debug issues with MPJ not having the backup flags on production servers. This is not strictly a fix, but it eases to validate the following patches without requiring to take packet traces, to query ongoing connections with Netlink with admin permissions, or to guess by looking at the behaviour of the packet scheduler. Also, the modification is self contained, isolated, well controlled, and the increments are done just after others, there from the beginning. It looks then safe, and helpful to backport this. 
Fixes: 4596a2c1b7f5 ("mptcp: allow creating non-backup subflows") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/mib.c | 2 ++ net/mptcp/mib.h | 2 ++ net/mptcp/subflow.c | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index c30405e76833..7884217f33eb 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -19,7 +19,9 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), + SNMP_MIB_ITEM("MPJoinSynBackupRx", MPTCP_MIB_JOINSYNBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), + SNMP_MIB_ITEM("MPJoinSynAckBackupRx", MPTCP_MIB_JOINSYNACKBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 2704afd0dfe4..66aa67f49d03 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -14,7 +14,9 @@ enum linux_mptcp_mib_field { MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ + MPTCP_MIB_JOINSYNBACKUPRX, /* Received a SYN + MP_JOIN + backup flag */ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINSYNACKBACKUPRX, /* Received a SYN/ACK + MP_JOIN + backup flag */ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a3778aee4e77..be406197b1c4 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -168,6 +168,9 @@ static int subflow_check_req(struct request_sock *req, return 0; } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); + + if (mp_opt.backup) + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); } if (opt_mp_capable && listener->request_mptcp) { @@ -577,6 +580,9 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + if (subflow->backup) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX); + if (subflow_use_different_dport(msk, sk)) { pr_debug("synack inet_dport=%d %d", ntohs(inet_sk(sk)->inet_dport), From 935ff5bb8a1cfcdf8e60c8f5c794d0bbbc234437 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:27 +0200 Subject: [PATCH 0286/1052] selftests: mptcp: join: validate backup in MPJ A peer can notify the other one that a subflow has to be treated as "backup" by two different ways: either by sending a dedicated MP_PRIO notification, or by setting the backup flag in the MP_JOIN handshake. The selftests were previously monitoring the former, but not the latter. This is what is now done here by looking at these new MIB counters when validating the 'backup' cases: MPTcpExtMPJoinSynBackupRx MPTcpExtMPJoinSynAckBackupRx The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it will help to validate a new fix for an issue introduced by this commit ID. 
Fixes: 4596a2c1b7f5 ("mptcp: allow creating non-backup subflows") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- .../testing/selftests/net/mptcp/mptcp_join.sh | 42 ++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 55d84a1bde15..e6c8d86017f3 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1634,6 +1634,8 @@ chk_prio_nr() { local mp_prio_nr_tx=$1 local mp_prio_nr_rx=$2 + local mpj_syn=$3 + local mpj_syn_ack=$4 local count print_check "ptx" @@ -1655,6 +1657,26 @@ chk_prio_nr() else print_ok fi + + print_check "syn backup" + count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynBackupRx") + if [ -z "$count" ]; then + print_skip + elif [ "$count" != "$mpj_syn" ]; then + fail_test "got $count JOIN[s] syn with Backup expected $mpj_syn" + else + print_ok + fi + + print_check "synack backup" + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckBackupRx") + if [ -z "$count" ]; then + print_skip + elif [ "$count" != "$mpj_syn_ack" ]; then + fail_test "got $count JOIN[s] synack with Backup expected $mpj_syn_ack" + else + print_ok + fi } chk_subflow_nr() @@ -2612,7 +2634,7 @@ backup_tests() sflags=nobackup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 - chk_prio_nr 0 1 + chk_prio_nr 0 1 1 0 fi # single address, backup @@ -2625,7 +2647,7 @@ backup_tests() run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi # single address with port, backup @@ -2638,7 +2660,7 @@ backup_tests() run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi if reset "mpc backup" && @@ -2647,7 +2669,7 @@ backup_tests() speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 0 1 + chk_prio_nr 0 1 0 0 fi if reset "mpc backup both sides" && @@ -2657,7 +2679,7 @@ backup_tests() speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi if reset "mpc switch to backup" && @@ -2666,7 +2688,7 @@ backup_tests() sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 0 1 + chk_prio_nr 0 1 0 0 fi if reset "mpc switch to backup both sides" && @@ -2676,7 +2698,7 @@ backup_tests() sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi } @@ -3053,7 +3075,7 @@ fullmesh_tests() addr_nr_ns2=1 sflags=backup,fullmesh speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 - chk_prio_nr 0 1 + chk_prio_nr 0 1 1 0 chk_rm_nr 0 1 fi @@ -3066,7 +3088,7 @@ fullmesh_tests() sflags=nobackup,nofullmesh speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 - chk_prio_nr 0 1 + chk_prio_nr 0 1 1 0 chk_rm_nr 0 1 fi } @@ -3318,7 +3340,7 @@ userspace_tests() sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 0 - chk_prio_nr 0 0 + chk_prio_nr 0 0 0 0 fi # userspace pm type prevents rm_addr From 6834097fc38c5416701c793da94558cea49c0a1f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:28 +0200 Subject: [PATCH 0287/1052] mptcp: pm: fix backup support in signal endpoints There was a support for signal endpoints, but only when the endpoint's flag was changed during a connection. 
If an endpoint with the signal and backup was already present, the MP_JOIN reply was not containing the backup flag as expected. That's confusing to have this inconsistent behaviour. On the other hand, the infrastructure to set the backup flag in the SYN + ACK + MP_JOIN was already there, it was just never set before. Now when requesting the local ID from the path-manager, the backup status is also requested. Note that when the userspace PM is used, the backup flag can be set if the local address was already used before with a backup flag, e.g. if the address was announced with the 'backup' flag, or a subflow was created with the 'backup' flag. Fixes: 4596a2c1b7f5 ("mptcp: allow creating non-backup subflows") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/507 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm.c | 12 ++++++++++++ net/mptcp/pm_netlink.c | 18 ++++++++++++++++++ net/mptcp/pm_userspace.c | 18 ++++++++++++++++++ net/mptcp/protocol.h | 3 +++ net/mptcp/subflow.c | 3 +++ 5 files changed, 54 insertions(+) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 55406720c607..23bb89c94e90 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -426,6 +426,18 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_get_local_id(msk, &skc_local); } +bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc) +{ + struct mptcp_addr_info skc_local; + + mptcp_local_address((struct sock_common *)skc, &skc_local); + + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_is_backup(msk, &skc_local); + + return mptcp_pm_nl_is_backup(msk, &skc_local); +} + int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7635fac91539..37954a0b087d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1101,6 +1101,24 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc return ret; } +bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + struct mptcp_pm_addr_entry *entry; + bool backup = false; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) { + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + break; + } + } + rcu_read_unlock(); + + return backup; +} + #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index f0a4590506c6..8eaa9fbe3e34 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -165,6 +165,24 @@ int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true); } +bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, + struct mptcp_addr_info *skc) +{ + struct mptcp_pm_addr_entry *entry; + bool backup = false; + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, skc, false)) { + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + break; + } + } + spin_unlock_bh(&msk->pm.lock); + + return backup; +} + int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = 
info->attrs[MPTCP_PM_ATTR_TOKEN]; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b8b25124e7de..60c6b073d65f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1109,6 +1109,9 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); +bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index be406197b1c4..0e4b5bfbeaa1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -100,6 +100,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) return NULL; } subflow_req->local_id = local_id; + subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req); return msk; } @@ -620,6 +621,8 @@ static int subflow_chk_local_id(struct sock *sk) return err; subflow_set_local_id(subflow, err); + subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk); + return 0; } From f833470c27832136d4416d8fc55d658082af0989 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:29 +0200 Subject: [PATCH 0288/1052] selftests: mptcp: join: check backup support in signal endp Before the previous commit, 'signal' endpoints with the 'backup' flag were ignored when sending the MP_JOIN. The MPTCP Join selftest has then been modified to validate this case: the "single address, backup" test, is now validating the MP_JOIN with a backup flag as it is what we expect it to do with such name. The previous version has been kept, but renamed to "single address, switch to backup" to avoid confusions. The "single address with port, backup" test is also now validating the MPJ with a backup flag, which makes more sense than checking the switch to backup with an MP_PRIO. The "mpc backup both sides" test is now validating that the backup flag is also set in MP_JOIN from and to the addresses used in the initial subflow, using the special ID 0. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. 
Fixes: 4596a2c1b7f5 ("mptcp: allow creating non-backup subflows") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- .../testing/selftests/net/mptcp/mptcp_join.sh | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index e6c8d86017f3..4df48f1f14ab 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2639,6 +2639,19 @@ backup_tests() # single address, backup if reset "single address, backup" && + continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + pm_nl_set_limits $ns1 0 1 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup + pm_nl_set_limits $ns2 1 1 + sflags=nobackup speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 1 1 1 + chk_add_nr 1 1 + chk_prio_nr 1 0 0 1 + fi + + # single address, switch to backup + if reset "single address, switch to backup" && continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then pm_nl_set_limits $ns1 0 1 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal @@ -2654,13 +2667,13 @@ backup_tests() if reset "single address with port, backup" && continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then pm_nl_set_limits $ns1 0 1 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup port 10100 pm_nl_set_limits $ns2 1 1 - sflags=backup speed=slow \ + sflags=nobackup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 1 0 0 + chk_prio_nr 1 0 0 1 fi if reset "mpc backup" && @@ -2674,12 +2687,21 @@ backup_tests() if reset "mpc backup both sides" && continue_if mptcp_lib_kallsyms_doesnt_have "T mptcp_subflow_send_ack$"; then - pm_nl_add_endpoint $ns1 10.0.1.1 flags subflow,backup + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 1 2 + pm_nl_add_endpoint $ns1 10.0.1.1 flags signal,backup pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,backup + + # 10.0.2.2 (non-backup) -> 10.0.1.1 (backup) + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow + # 10.0.1.2 (backup) -> 10.0.2.1 (non-backup) + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + ip -net "$ns2" route add 10.0.2.1 via 10.0.1.1 dev ns2eth1 # force this path + speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - chk_prio_nr 1 1 0 0 + chk_join_nr 2 2 2 + chk_prio_nr 1 1 1 1 fi if reset "mpc switch to backup" && From 2fe5273f149cc882c371f9954b5fdbd1bd8c9b5c Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 29 Jul 2024 11:40:15 +0800 Subject: [PATCH 0289/1052] net/smc: prevent UAF in inet_create() Following syzbot repro crashes the kernel: socketpair(0x2, 0x1, 0x100, &(0x7f0000000140)) (fail_nth: 13) Fix this by not calling sk_common_release() from smc_create_clcsk(). Stack trace: socket: no more sockets ------------[ cut here ]------------ refcount_t: underflow; use-after-free. 
WARNING: CPU: 1 PID: 5092 at lib/refcount.c:28 refcount_warn_saturate+0x15a/0x1d0 lib/refcount.c:28 Modules linked in: CPU: 1 PID: 5092 Comm: syz-executor424 Not tainted 6.10.0-syzkaller-04483-g0be9ae5486cd #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/27/2024 RIP: 0010:refcount_warn_saturate+0x15a/0x1d0 lib/refcount.c:28 Code: 80 f3 1f 8c e8 e7 69 a8 fc 90 0f 0b 90 90 eb 99 e8 cb 4f e6 fc c6 05 8a 8d e8 0a 01 90 48 c7 c7 e0 f3 1f 8c e8 c7 69 a8 fc 90 <0f> 0b 90 90 e9 76 ff ff ff e8 a8 4f e6 fc c6 05 64 8d e8 0a 01 90 RSP: 0018:ffffc900034cfcf0 EFLAGS: 00010246 RAX: 3b9fcde1c862f700 RBX: ffff888022918b80 RCX: ffff88807b39bc00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000003 R08: ffffffff815878a2 R09: fffffbfff1c39d94 R10: dffffc0000000000 R11: fffffbfff1c39d94 R12: 00000000ffffffe9 R13: 1ffff11004523165 R14: ffff888022918b28 R15: ffff888022918b00 FS: 00005555870e7380(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000140 CR3: 000000007582e000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: inet_create+0xbaf/0xe70 __sock_create+0x490/0x920 net/socket.c:1571 sock_create net/socket.c:1622 [inline] __sys_socketpair+0x2ca/0x720 net/socket.c:1769 __do_sys_socketpair net/socket.c:1822 [inline] __se_sys_socketpair net/socket.c:1819 [inline] __x64_sys_socketpair+0x9b/0xb0 net/socket.c:1819 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fbcb9259669 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 a1 1a 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fffe931c6d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000035 RAX: ffffffffffffffda RBX: 00007fffe931c6f0 RCX: 00007fbcb9259669 RDX: 0000000000000100 RSI: 0000000000000001 RDI: 0000000000000002 RBP: 0000000000000002 R08: 00007fffe931c476 R09: 00000000000000a0 R10: 0000000020000140 R11: 0000000000000246 R12: 00007fffe931c6ec R13: 431bde82d7b634db R14: 0000000000000001 R15: 0000000000000001 Link: https://lore.kernel.org/r/20240723175809.537291-1-edumazet@google.com/ Fixes: d25a92ccae6b ("net/smc: Introduce IPPROTO_SMC") Reported-by: syzbot Signed-off-by: D. 
Wythe Reviewed-by: Eric Dumazet Reviewed-by: Wenjia Zhang Link: https://patch.msgid.link/1722224415-30999-1-git-send-email-alibuda@linux.alibaba.com Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 73a875573e7a..8e3093938cd2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3319,10 +3319,8 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &smc->clcsock); - if (rc) { - sk_common_release(sk); + if (rc) return rc; - } /* smc_clcsock_release() does not wait smc->clcsock->sk's * destruction; its sk_state might not be TCP_CLOSE after @@ -3368,6 +3366,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->clcsock = clcsock; else rc = smc_create_clcsk(net, sk, family); + + if (rc) + sk_common_release(sk); out: return rc; } From a371d558e6f3aed977a8a7346350557de5d25190 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 29 Jul 2024 14:19:28 -0400 Subject: [PATCH 0290/1052] mm, slub: do not call do_slab_free for kfence object In 782f8906f805 the freeing of kfence objects was moved from deep inside do_slab_free to the wrapper functions outside. This is a nice change, but unfortunately it missed one spot in __kmem_cache_free_bulk. This results in a crash like this: BUG skbuff_head_cache (Tainted: G S B E ): Padding overwritten. 0xffff88907fea0f00-0xffff88907fea0fff @offset=3840 slab_err (mm/slub.c:1129) free_to_partial_list (mm/slub.c:? mm/slub.c:4036) slab_pad_check (mm/slub.c:864 mm/slub.c:1290) check_slab (mm/slub.c:?) free_to_partial_list (mm/slub.c:3171 mm/slub.c:4036) kmem_cache_alloc_bulk (mm/slub.c:? mm/slub.c:4495 mm/slub.c:4586 mm/slub.c:4635) napi_build_skb (net/core/skbuff.c:348 net/core/skbuff.c:527 net/core/skbuff.c:549) All the other callers to do_slab_free appear to be ok. Add a kfence_free check in __kmem_cache_free_bulk to avoid the crash. Reported-by: Chris Mason Fixes: 782f8906f805 ("mm/slub: free KFENCE objects in slab_free_hook()") Cc: stable@kernel.org Signed-off-by: Rik van Riel Signed-off-by: Vlastimil Babka --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 3520acaf9afa..c9d8a2497fd6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4690,6 +4690,9 @@ static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!df.slab) continue; + if (kfence_free(df.freelist)) + continue; + do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_); } while (likely(size)); From df615907f1bf907260af01ccb904d0e9304b5278 Mon Sep 17 00:00:00 2001 From: Patryk Duda Date: Tue, 30 Jul 2024 10:44:25 +0000 Subject: [PATCH 0291/1052] platform/chrome: cros_ec_proto: Lock device when updating MKBP version The cros_ec_get_host_command_version_mask() function requires that the caller must have ec_dev->lock mutex before calling it. This requirement was not met and as a result it was possible that two commands were sent to the device at the same time. The problem was observed while using UART backend which doesn't use any additional locks, unlike SPI backend which locks the controller until response is received. 
Fixes: f74c7557ed0d ("platform/chrome: cros_ec_proto: Update version on GET_NEXT_EVENT failure") Cc: stable@vger.kernel.org Signed-off-by: Patryk Duda Link: https://lore.kernel.org/r/20240730104425.607083-1-patrykd@google.com Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_proto.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index f776fd42244f..73f75958e15c 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -813,9 +813,11 @@ int cros_ec_get_next_event(struct cros_ec_device *ec_dev, if (ret == -ENOPROTOOPT) { dev_dbg(ec_dev->dev, "GET_NEXT_EVENT returned invalid version error.\n"); + mutex_lock(&ec_dev->lock); ret = cros_ec_get_host_command_version_mask(ec_dev, EC_CMD_GET_NEXT_EVENT, &ver_mask); + mutex_unlock(&ec_dev->lock); if (ret < 0 || ver_mask == 0) /* * Do not change the MKBP supported version if we can't From 2c762be5b798c443612c1bb9b011de4fdaebd1c5 Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Mon, 29 Jul 2024 19:03:33 -0400 Subject: [PATCH 0292/1052] io_uring: keep multishot request NAPI timeout current This refresh statement was originally present in the original patch: https://lore.kernel.org/netdev/20221121191437.996297-2-shr@devkernel.io/ It has been removed with no explanation in v6: https://lore.kernel.org/netdev/20230201222254.744422-2-shr@devkernel.io/ It is important to make the refresh for multishot requests, because if no new requests using the same NAPI device are added to the ring, the entry will become stale and be removed silently. The unsuspecting user will not know that their ring had busy polling for only 60 seconds before being pruned. Signed-off-by: Olivier Langlois Reviewed-by: Pavel Begunkov Fixes: 8d0c12a80cdeb ("io-uring: add napi busy poll support") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/0fe61a019ec61e5708cd117cb42ed0dab95e1617.1722294646.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/poll.c b/io_uring/poll.c index 0a8e02944689..1f63b60e85e7 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -347,6 +347,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) v &= IO_POLL_REF_MASK; } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); + io_napi_add(req); return IOU_POLL_NO_ACTION; } From c3fca4fb83f7c84cd1e1aa9fe3a0e220ce8f30fb Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Mon, 29 Jul 2024 19:13:35 -0400 Subject: [PATCH 0293/1052] io_uring: remove unused local list heads in NAPI functions These lists are unused, remove them. 
Signed-off-by: Olivier Langlois Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/0a0ae3e955aed0f3e3d29882fb3d3cb575e0009b.1722294947.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index 4fd6bb331e1e..a3dc3762008f 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -205,7 +205,6 @@ void io_napi_init(struct io_ring_ctx *ctx) void io_napi_free(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - LIST_HEAD(napi_list); unsigned int i; spin_lock(&ctx->napi_lock); @@ -315,7 +314,6 @@ void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) */ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) { - LIST_HEAD(napi_list); bool is_stale = false; if (!READ_ONCE(ctx->napi_busy_poll_dt)) From c3c4f22b7c814a6ee485ce294065836f8ede30fa Mon Sep 17 00:00:00 2001 From: Devyn Liu Date: Tue, 30 Jul 2024 11:20:39 +0800 Subject: [PATCH 0294/1052] spi: hisi-kunpeng: Add validation for the minimum value of speed_hz The speed specified by the user is used to calculate the clk_div based on the max_speed_hz in hisi_calc_effective_speed. A very low speed value can lead to a clk_div larger than the variable range. Avoid this by setting the min_speed_hz so that such a small speed value is rejected. __spi_validate() in spi.c will return -EINVAL for the specified speed_hz lower than min_speed_hz. Signed-off-by: Devyn Liu Reviewed-by: Jay Fang Link: https://patch.msgid.link/20240730032040.3156393-2-liudingyuan@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-hisi-kunpeng.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/spi-hisi-kunpeng.c b/drivers/spi/spi-hisi-kunpeng.c index 77e9738e42f6..6910b4d4c427 100644 --- a/drivers/spi/spi-hisi-kunpeng.c +++ b/drivers/spi/spi-hisi-kunpeng.c @@ -495,6 +495,7 @@ static int hisi_spi_probe(struct platform_device *pdev) host->transfer_one = hisi_spi_transfer_one; host->handle_err = hisi_spi_handle_err; host->dev.fwnode = dev->fwnode; + host->min_speed_hz = DIV_ROUND_UP(host->max_speed_hz, CLK_DIV_MAX); hisi_spi_hw_init(hs); From 5127c42c77de18651aa9e8e0a3ced190103b449c Mon Sep 17 00:00:00 2001 From: Devyn Liu Date: Tue, 30 Jul 2024 11:20:40 +0800 Subject: [PATCH 0295/1052] spi: hisi-kunpeng: Add verification for the max_frequency provided by the firmware If the value of max_speed_hz is 0, it may cause a division by zero error in hisi_calc_effective_speed(). The value of max_speed_hz is provided by firmware. Firmware is generally considered as a trusted domain. However, as division by zero errors can cause system failure, for defense measure, the value of max_speed is validated here. So 0 is regarded as invalid and an error code is returned. 
Signed-off-by: Devyn Liu Reviewed-by: Jay Fang Link: https://patch.msgid.link/20240730032040.3156393-3-liudingyuan@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-hisi-kunpeng.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/spi-hisi-kunpeng.c b/drivers/spi/spi-hisi-kunpeng.c index 6910b4d4c427..16054695bdb0 100644 --- a/drivers/spi/spi-hisi-kunpeng.c +++ b/drivers/spi/spi-hisi-kunpeng.c @@ -481,6 +481,9 @@ static int hisi_spi_probe(struct platform_device *pdev) return -EINVAL; } + if (host->max_speed_hz == 0) + return dev_err_probe(dev, -EINVAL, "spi-max-frequency can't be 0\n"); + ret = device_property_read_u16(dev, "num-cs", &host->num_chipselect); if (ret) From e075c3b13a0a142dcd3151b25d29a24f31b7b640 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 29 Jul 2024 14:04:43 +0200 Subject: [PATCH 0296/1052] platform/x86: intel-vbtn: Protect ACPI notify handler against recursion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit e2ffcda16290 ("ACPI: OSL: Allow Notify () handlers to run on all CPUs") ACPI notify handlers like the intel-vbtn notify_handler() may run on multiple CPU cores racing with themselves. This race gets hit on Dell Venue 7140 tablets when undocking from the keyboard, causing the handler to try and register priv->switches_dev twice, as can be seen from the dev_info() message getting logged twice: [ 83.861800] intel-vbtn INT33D6:00: Registering Intel Virtual Switches input-dev after receiving a switch event [ 83.861858] input: Intel Virtual Switches as /devices/pci0000:00/0000:00:1f.0/PNP0C09:00/INT33D6:00/input/input17 [ 83.861865] intel-vbtn INT33D6:00: Registering Intel Virtual Switches input-dev after receiving a switch event After which things go seriously wrong: [ 83.861872] sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:1f.0/PNP0C09:00/INT33D6:00/input/input17' ... [ 83.861967] kobject: kobject_add_internal failed for input17 with -EEXIST, don't try to register things with the same name in the same directory. [ 83.877338] BUG: kernel NULL pointer dereference, address: 0000000000000018 ... Protect intel-vbtn notify_handler() from racing with itself with a mutex to fix this. 
Fixes: e2ffcda16290 ("ACPI: OSL: Allow Notify () handlers to run on all CPUs") Reported-by: En-Wei Wu Closes: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2073001 Tested-by: Kostadin Stoilov Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240729120443.14779-1-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vbtn.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/intel/vbtn.c b/drivers/platform/x86/intel/vbtn.c index 9b7ce03ba085..a353e830b65f 100644 --- a/drivers/platform/x86/intel/vbtn.c +++ b/drivers/platform/x86/intel/vbtn.c @@ -7,11 +7,13 @@ */ #include +#include #include #include #include #include #include +#include #include #include #include "../dual_accel_detect.h" @@ -66,6 +68,7 @@ static const struct key_entry intel_vbtn_switchmap[] = { }; struct intel_vbtn_priv { + struct mutex mutex; /* Avoid notify_handler() racing with itself */ struct input_dev *buttons_dev; struct input_dev *switches_dev; bool dual_accel; @@ -155,6 +158,8 @@ static void notify_handler(acpi_handle handle, u32 event, void *context) bool autorelease; int ret; + guard(mutex)(&priv->mutex); + if ((ke = sparse_keymap_entry_from_scancode(priv->buttons_dev, event))) { if (!priv->has_buttons) { dev_warn(&device->dev, "Warning: received 0x%02x button event on a device without buttons, please report this.\n", @@ -290,6 +295,10 @@ static int intel_vbtn_probe(struct platform_device *device) return -ENOMEM; dev_set_drvdata(&device->dev, priv); + err = devm_mutex_init(&device->dev, &priv->mutex); + if (err) + return err; + priv->dual_accel = dual_accel; priv->has_buttons = has_buttons; priv->has_switches = has_switches; From 4c83ee4bf32ea8e57ae2321906c067d69ad7c41b Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Mon, 29 Jul 2024 14:08:31 +1200 Subject: [PATCH 0297/1052] platform/x86/amd: pmf: Add quirk for ROG Ally X MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ASUS ROG Ally X has the same issue as the G14 where it advertises SPS support but doesn't use it. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20240729020831.28117-1-luke@ljones.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/pmf-quirks.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmf/pmf-quirks.c b/drivers/platform/x86/amd/pmf/pmf-quirks.c index 0b2eb0ae85fe..460444cda1b2 100644 --- a/drivers/platform/x86/amd/pmf/pmf-quirks.c +++ b/drivers/platform/x86/amd/pmf/pmf-quirks.c @@ -29,6 +29,14 @@ static const struct dmi_system_id fwbug_list[] = { }, .driver_data = &quirk_no_sps_bug, }, + { + .ident = "ROG Ally X", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "RC72LA"), + }, + .driver_data = &quirk_no_sps_bug, + }, {} }; @@ -48,4 +56,3 @@ void amd_pmf_quirks_init(struct amd_pmf_dev *dev) dmi_id->ident); } } - From 426463d94d45d37c233e480231a40b9b35f10e49 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Thu, 18 Jul 2024 20:31:19 +0530 Subject: [PATCH 0298/1052] platform/x86/amd/pmc: Send OS_HINT command for new AMD platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To initiate the HW deep state transition, the OS_HINT command has to be sent to the PMFW. Add this support to the platforms that belong to family 1Ah model 60h series. 
Signed-off-by: Shyam Sundar S K Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240718150119.3427190-1-Shyam-sundar.S-k@amd.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc.c | 2 ++ drivers/platform/x86/amd/pmc/pmc.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index a3d881f6e5d9..c3e51f0a5c33 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -764,6 +764,7 @@ static int amd_pmc_get_os_hint(struct amd_pmc_dev *dev) case AMD_CPU_ID_CB: case AMD_CPU_ID_PS: case PCI_DEVICE_ID_AMD_1AH_M20H_ROOT: + case PCI_DEVICE_ID_AMD_1AH_M60H_ROOT: return MSG_OS_HINT_RN; } return -EINVAL; @@ -967,6 +968,7 @@ static const struct pci_device_id pmc_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_RV) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_SP) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) }, { } }; diff --git a/drivers/platform/x86/amd/pmc/pmc.h b/drivers/platform/x86/amd/pmc/pmc.h index 9e32d3128c3a..f1166d15c856 100644 --- a/drivers/platform/x86/amd/pmc/pmc.h +++ b/drivers/platform/x86/amd/pmc/pmc.h @@ -67,6 +67,7 @@ void amd_mp2_stb_deinit(struct amd_pmc_dev *dev); #define AMD_CPU_ID_PS 0x14E8 #define AMD_CPU_ID_SP 0x14A4 #define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 +#define PCI_DEVICE_ID_AMD_1AH_M60H_ROOT 0x1122 #define PCI_DEVICE_ID_AMD_MP2_STB 0x172c #endif /* PMC_H */ From 942810c0e89277d738b7f1b6f379d0a5877999f6 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 23 Jul 2024 18:54:50 +0530 Subject: [PATCH 0299/1052] platform/x86/amd/pmf: Add new ACPI ID AMDI0107 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new ACPI ID AMDI0107 used by upcoming AMD platform to the PMF supported list of devices. Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20240723132451.3488326-1-Shyam-sundar.S-k@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 2d6e2558863c..8f1f719befa3 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -41,6 +41,7 @@ #define AMD_CPU_ID_RMB 0x14b5 #define AMD_CPU_ID_PS 0x14e8 #define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 +#define PCI_DEVICE_ID_AMD_1AH_M60H_ROOT 0x1122 #define PMF_MSG_DELAY_MIN_US 50 #define RESPONSE_REGISTER_LOOP_MAX 20000 @@ -249,6 +250,7 @@ static const struct pci_device_id pmf_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_RMB) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_PS) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) }, { } }; @@ -382,6 +384,7 @@ static const struct acpi_device_id amd_pmf_acpi_ids[] = { {"AMDI0102", 0}, {"AMDI0103", 0}, {"AMDI0105", 0}, + {"AMDI0107", 0}, { } }; MODULE_DEVICE_TABLE(acpi, amd_pmf_acpi_ids); From f558120cd709682b739207b48cf7479fd9568431 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Mon, 29 Jul 2024 14:28:16 +0200 Subject: [PATCH 0300/1052] net/iucv: fix use after free in iucv_sock_close() iucv_sever_path() is called from process context and from bh context. 
iucv->path is used as indicator whether somebody else is taking care of severing the path (or it is already removed / never existed). This needs to be done with atomic compare and swap, otherwise there is a small window where iucv_sock_close() will try to work with a path that has already been severed and freed by iucv_callback_connrej() called by iucv_tasklet_fn(). Example: [452744.123844] Call Trace: [452744.123845] ([<0000001e87f03880>] 0x1e87f03880) [452744.123966] [<00000000d593001e>] iucv_path_sever+0x96/0x138 [452744.124330] [<000003ff801ddbca>] iucv_sever_path+0xc2/0xd0 [af_iucv] [452744.124336] [<000003ff801e01b6>] iucv_sock_close+0xa6/0x310 [af_iucv] [452744.124341] [<000003ff801e08cc>] iucv_sock_release+0x3c/0xd0 [af_iucv] [452744.124345] [<00000000d574794e>] __sock_release+0x5e/0xe8 [452744.124815] [<00000000d5747a0c>] sock_close+0x34/0x48 [452744.124820] [<00000000d5421642>] __fput+0xba/0x268 [452744.124826] [<00000000d51b382c>] task_work_run+0xbc/0xf0 [452744.124832] [<00000000d5145710>] do_notify_resume+0x88/0x90 [452744.124841] [<00000000d5978096>] system_call+0xe2/0x2c8 [452744.125319] Last Breaking-Event-Address: [452744.125321] [<00000000d5930018>] iucv_path_sever+0x90/0x138 [452744.125324] [452744.125325] Kernel panic - not syncing: Fatal exception in interrupt Note that bh_lock_sock() is not serializing the tasklet context against process context, because the check for sock_owned_by_user() and corresponding handling is missing. Ideas for a future clean-up patch: A) Correct usage of bh_lock_sock() in tasklet context, as described in Link: https://lore.kernel.org/netdev/1280155406.2899.407.camel@edumazet-laptop/ Re-enqueue, if needed. This may require adding return values to the tasklet functions and thus changes to all users of iucv. B) Change iucv tasklet into worker and use only lock_sock() in af_iucv. Fixes: 7d316b945352 ("af_iucv: remove IUCV-pathes completely") Reviewed-by: Halil Pasic Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20240729122818.947756-1-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- net/iucv/af_iucv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index c3b0b610b0aa..c00323fa9eb6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -335,8 +335,8 @@ static void iucv_sever_path(struct sock *sk, int with_user_data) struct iucv_sock *iucv = iucv_sk(sk); struct iucv_path *path = iucv->path; - if (iucv->path) { - iucv->path = NULL; + /* Whoever resets the path pointer, must sever and free it. */ + if (xchg(&iucv->path, NULL)) { if (with_user_data) { low_nmcpy(user_data, iucv->src_name); high_nmcpy(user_data, iucv->dst_name); From bf5641eccf71bcd13a849930e190563c3a19815d Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Mon, 29 Jul 2024 08:46:26 +0200 Subject: [PATCH 0301/1052] x86/CPU/AMD: Add models 0x60-0x6f to the Zen5 range Add some new Zen5 models for the 0x1A family. [ bp: Merge the 0x60 and 0x70 ranges. ] Signed-off-by: Perry Yuan Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240729064626.24297-1-bp@kernel.org --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index be5889bded49..1e0fe5f8ab84 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -462,7 +462,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x00 ... 0x2f: case 0x40 ... 0x4f: - case 0x70 ... 0x7f: + case 0x60 ... 
0x7f: setup_force_cpu_cap(X86_FEATURE_ZEN5); break; default: From b8e947e9f64cac9df85a07672b658df5b2bcff07 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 29 Jul 2024 21:59:24 +0200 Subject: [PATCH 0302/1052] btrfs: initialize location to fix -Wmaybe-uninitialized in btrfs_lookup_dentry() Some arch + compiler combinations report a potentially unused variable location in btrfs_lookup_dentry(). This is a false alert as the variable is passed by value and always valid or there's an error. The compilers cannot probably reason about that although btrfs_inode_by_name() is in the same file. > + /kisskb/src/fs/btrfs/inode.c: error: 'location.objectid' may be used +uninitialized in this function [-Werror=maybe-uninitialized]: => 5603:9 > + /kisskb/src/fs/btrfs/inode.c: error: 'location.type' may be used +uninitialized in this function [-Werror=maybe-uninitialized]: => 5674:5 m68k-gcc8/m68k-allmodconfig mips-gcc8/mips-allmodconfig powerpc-gcc5/powerpc-all{mod,yes}config powerpc-gcc5/ppc64_defconfig Initialize it to zero, this should fix the warnings and won't change the behaviour as btrfs_inode_by_name() accepts only a root or inode item types, otherwise returns an error. Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/linux-btrfs/bd4e9928-17b3-9257-8ba7-6b7f9bbb639a@linux-m68k.org/ Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ca3878348ff..1d4e0a65494a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5664,7 +5664,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; - struct btrfs_key location; + struct btrfs_key location = { 0 }; u8 di_type = 0; int ret = 0; From 555069117390a5d581863bc797fb546bb4417c31 Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Tue, 30 Jul 2024 09:25:05 +0530 Subject: [PATCH 0303/1052] drm/i915/hdcp: Fix HDCP2_STREAM_STATUS macro Fix HDCP2_STREAM_STATUS macro, it called pipe instead of port never threw a compile error as no one used it. --v2 -Add Fixes [Jani] Fixes: d631b984cc90 ("drm/i915/hdcp: Add HDCP 2.2 stream register") Signed-off-by: Suraj Kandpal Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240730035505.3759899-1-suraj.kandpal@intel.com (cherry picked from commit 73d7cd542bbd0a7c6881ea0df5255f190a1e7236) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_hdcp_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_hdcp_regs.h b/drivers/gpu/drm/i915/display/intel_hdcp_regs.h index a568a457e532..f590d7f48ba7 100644 --- a/drivers/gpu/drm/i915/display/intel_hdcp_regs.h +++ b/drivers/gpu/drm/i915/display/intel_hdcp_regs.h @@ -251,7 +251,7 @@ #define HDCP2_STREAM_STATUS(dev_priv, trans, port) \ (TRANS_HDCP(dev_priv) ? 
\ TRANS_HDCP2_STREAM_STATUS(trans) : \ - PIPE_HDCP2_STREAM_STATUS(pipe)) + PIPE_HDCP2_STREAM_STATUS(port)) #define _PORTA_HDCP2_AUTH_STREAM 0x66F00 #define _PORTB_HDCP2_AUTH_STREAM 0x66F04 From 5b511572660190db1dc8ba412efd0be0d3781ab6 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Mon, 29 Jul 2024 10:40:35 -0700 Subject: [PATCH 0304/1052] drm/i915: Fix possible int overflow in skl_ddi_calculate_wrpll() On the off chance that clock value ends up being too high (by means of skl_ddi_calculate_wrpll() having been called with big enough value of crtc_state->port_clock * 1000), one possible consequence may be that the result will not be able to fit into signed int. Fix this issue by moving conversion of clock parameter from kHz to Hz into the body of skl_ddi_calculate_wrpll(), as well as casting the same parameter to u64 type while calculating the value for AFE clock. This both mitigates the overflow problem and avoids possible erroneous integer promotion mishaps. Found by Linux Verification Center (linuxtesting.org) with static analysis tool SVACE. Fixes: 82d354370189 ("drm/i915/skl: Implementation of SKL DPLL programming") Cc: stable@vger.kernel.org Signed-off-by: Nikita Zhandarovich Reviewed-by: Jani Nikula Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240729174035.25727-1-n.zhandarovich@fintech.ru (cherry picked from commit 833cf12846aa19adf9b76bc79c40747726f3c0c1) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_dpll_mgr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dpll_mgr.c b/drivers/gpu/drm/i915/display/intel_dpll_mgr.c index 90998b037349..292d163036b1 100644 --- a/drivers/gpu/drm/i915/display/intel_dpll_mgr.c +++ b/drivers/gpu/drm/i915/display/intel_dpll_mgr.c @@ -1658,7 +1658,7 @@ static void skl_wrpll_params_populate(struct skl_wrpll_params *params, } static int -skl_ddi_calculate_wrpll(int clock /* in Hz */, +skl_ddi_calculate_wrpll(int clock, int ref_clock, struct skl_wrpll_params *wrpll_params) { @@ -1683,7 +1683,7 @@ skl_ddi_calculate_wrpll(int clock /* in Hz */, }; unsigned int dco, d, i; unsigned int p0, p1, p2; - u64 afe_clock = clock * 5; /* AFE Clock is 5x Pixel clock */ + u64 afe_clock = (u64)clock * 1000 * 5; /* AFE Clock is 5x Pixel clock, in Hz */ for (d = 0; d < ARRAY_SIZE(dividers); d++) { for (dco = 0; dco < ARRAY_SIZE(dco_central_freq); dco++) { @@ -1808,7 +1808,7 @@ static int skl_ddi_hdmi_pll_dividers(struct intel_crtc_state *crtc_state) struct skl_wrpll_params wrpll_params = {}; int ret; - ret = skl_ddi_calculate_wrpll(crtc_state->port_clock * 1000, + ret = skl_ddi_calculate_wrpll(crtc_state->port_clock, i915->display.dpll.ref_clks.nssc, &wrpll_params); if (ret) return ret; From b630a04121519c0f93f9cc8994147191cc3c17b4 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 23 Jul 2024 07:02:26 -0700 Subject: [PATCH 0305/1052] thermal: intel: int340x: Fix kernel warning during MSI cleanup On some pre-production Lunar Lake systems, there is a kernel warning: remove_proc_entry: removing non-empty directory 'irq/172' WARNING: CPU: 0 PID: 501 at fs/proc/generic.c:717 remove_proc_entry+0x1b4/0x1e0 ... ... 
remove_proc_entry+0x1b4/0x1e0 report_bug+0x182/0x1b0 handle_bug+0x51/0xa0 exc_invalid_op+0x18/0x80 asm_exc_invalid_op+0x1b/0x20 remove_proc_entry+0x1b4/0x1e0 remove_proc_entry+0x1b4/0x1e0 unregister_irq_proc+0xf2/0x120 free_desc+0x41/0xe0 irq_domain_free_irqs+0x138/0x1c0 irq_free_descs+0x52/0x80 irq_domain_free_irqs+0x151/0x1c0 msi_domain_free_locked.part.0+0x17e/0x1c0 msi_domain_free_irqs_all_locked+0x74/0xc0 pci_msi_teardown_msi_irqs+0x50/0x60 pci_free_msi_irqs+0x12/0x40 pci_free_irq_vectors+0x58/0x70 On these systems, not all the MSI thermal vectors are valid. This causes devm_request_threaded_irq() to fail for some vectors. As part of the clean up on this error, pci_free_irq_vectors() is called without calling devm_free_irq(). This causes the above warning. Add a function proc_thermal_free_msi() to call devm_free_irq() for all successfully registered IRQ handlers, then call pci_free_irq_vectors(). Call this function for MSI cleanup. Fixes: 7a9a8c5faf41 ("thermal: intel: int340x: Support MSI interrupt for Lunar Lake") Reported-by: Yijun Shen Tested-by: Yijun Shen Signed-off-by: Srinivas Pandruvada Reviewed-by: Zhang Rui Link: https://patch.msgid.link/20240723140228.865919-2-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- .../processor_thermal_device_pci.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index 114136893a59..2c9c45eb5b4a 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -278,6 +278,18 @@ static struct thermal_zone_params tzone_params = { static bool msi_irq; +static void proc_thermal_free_msi(struct pci_dev *pdev, struct proc_thermal_pci *pci_info) +{ + int i; + + for (i = 0; i < MSI_THERMAL_MAX; i++) { + if (proc_thermal_msi_map[i]) + devm_free_irq(&pdev->dev, proc_thermal_msi_map[i], pci_info); + } + + pci_free_irq_vectors(pdev); +} + static int proc_thermal_setup_msi(struct pci_dev *pdev, struct proc_thermal_pci *pci_info) { int ret, i, irq; @@ -310,7 +322,7 @@ static int proc_thermal_setup_msi(struct pci_dev *pdev, struct proc_thermal_pci return 0; err_free_msi_vectors: - pci_free_irq_vectors(pdev); + proc_thermal_free_msi(pdev, pci_info); return ret; } @@ -397,7 +409,7 @@ static int proc_thermal_pci_probe(struct pci_dev *pdev, const struct pci_device_ err_free_vectors: if (msi_irq) - pci_free_irq_vectors(pdev); + proc_thermal_free_msi(pdev, pci_info); err_ret_tzone: thermal_zone_device_unregister(pci_info->tzone); err_del_legacy: From b85a2d300a37641456c238591a056404436675c2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 23 Jul 2024 07:02:27 -0700 Subject: [PATCH 0306/1052] thermal: intel: int340x: Allow limited thermal MSI support On some Lunar Lake pre-production systems, not all the MSI thermal vectors are valid. In that case instead of failing module load, continue with partial thermal interrupt support. pci_alloc_irq_vectors() can return less than expected maximum vectors. In that case call devm_request_threaded_irq() only for current maximum vectors. 
Fixes: 7a9a8c5faf41 ("thermal: intel: int340x: Support MSI interrupt for Lunar Lake") Reported-by: Yijun Shen Tested-by: Yijun Shen Signed-off-by: Srinivas Pandruvada Reviewed-by: Zhang Rui Link: https://patch.msgid.link/20240723140228.865919-3-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- .../int340x_thermal/processor_thermal_device_pci.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index 2c9c45eb5b4a..ff296626c498 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -292,18 +292,18 @@ static void proc_thermal_free_msi(struct pci_dev *pdev, struct proc_thermal_pci static int proc_thermal_setup_msi(struct pci_dev *pdev, struct proc_thermal_pci *pci_info) { - int ret, i, irq; + int ret, i, irq, count; - ret = pci_alloc_irq_vectors(pdev, 1, MSI_THERMAL_MAX, PCI_IRQ_MSI | PCI_IRQ_MSIX); - if (ret < 0) { + count = pci_alloc_irq_vectors(pdev, 1, MSI_THERMAL_MAX, PCI_IRQ_MSI | PCI_IRQ_MSIX); + if (count < 0) { dev_err(&pdev->dev, "Failed to allocate vectors!\n"); - return ret; + return count; } dev_info(&pdev->dev, "msi enabled:%d msix enabled:%d\n", pdev->msi_enabled, pdev->msix_enabled); - for (i = 0; i < MSI_THERMAL_MAX; i++) { + for (i = 0; i < count; i++) { irq = pci_irq_vector(pdev, i); ret = devm_request_threaded_irq(&pdev->dev, irq, proc_thermal_irq_handler, From f8ce49be2743541bdc9e184fab1cfd736031ab66 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 23 Jul 2024 07:02:28 -0700 Subject: [PATCH 0307/1052] thermal: intel: int340x: Free MSI IRQ vectors on module exit On module exit call proc_thermal_free_msi() to free vectors allocated by pci_alloc_irq_vectors(). Fixes: 7a9a8c5faf41 ("thermal: intel: int340x: Support MSI interrupt for Lunar Lake") Signed-off-by: Srinivas Pandruvada Reviewed-by: Zhang Rui Link: https://patch.msgid.link/20240723140228.865919-4-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- .../intel/int340x_thermal/processor_thermal_device_pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index ff296626c498..006614921870 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -431,6 +431,9 @@ static void proc_thermal_pci_remove(struct pci_dev *pdev) proc_thermal_mmio_write(pci_info, PROC_THERMAL_MMIO_THRES_0, 0); proc_thermal_mmio_write(pci_info, PROC_THERMAL_MMIO_INT_ENABLE_0, 0); + if (msi_irq) + proc_thermal_free_msi(pdev, pci_info); + thermal_zone_device_unregister(pci_info->tzone); proc_thermal_mmio_remove(pdev, pci_info->proc_priv); if (!pci_info->no_legacy) From 0aa3ca956c46d849775eae1816cef8fe4bc8b50e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 24 Jul 2024 11:06:56 -0500 Subject: [PATCH 0308/1052] net: mvpp2: Don't re-use loop iterator This function has a nested loop. The problem is that both the inside and outside loop use the same variable as an iterator. I found this via static analysis so I'm not sure the impact. It could be that it loops forever or, more likely, the loop exits early. 
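[Editorial illustration, not part of the patch: a minimal user-space sketch of the iterator-reuse hazard described above and of the fix applied below; 'outer_cnt', 'inner_cnt' and visit() are made-up names.]

    #include <stdio.h>

    static void visit(int outer, int inner)
    {
        printf("%d/%d\n", outer, inner);
    }

    int main(void)
    {
        int outer_cnt = 2, inner_cnt = 3;
        int i, j;

        /*
         * Buggy pattern: the inner loop reuses 'i'. It leaves i == inner_cnt,
         * so here the outer loop stops after a single pass; if inner_cnt + 1
         * were smaller than outer_cnt, the outer loop would never terminate.
         */
        for (i = 0; i < outer_cnt; i++)
            for (i = 0; i < inner_cnt; i++)
                visit(i, i);

        /* Fixed pattern: the inner loop gets its own iterator. */
        for (i = 0; i < outer_cnt; i++)
            for (j = 0; j < inner_cnt; j++)
                visit(i, j);

        return 0;
    }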
Fixes: 3a616b92a9d1 ("net: mvpp2: Add TX flow control support for jumbo frames") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/eaa8f403-7779-4d81-973d-a9ecddc0bf6f@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 8c45ad983abc..0d62a33afa80 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -953,13 +953,13 @@ static void mvpp2_bm_pool_update_fc(struct mvpp2_port *port, static void mvpp2_bm_pool_update_priv_fc(struct mvpp2 *priv, bool en) { struct mvpp2_port *port; - int i; + int i, j; for (i = 0; i < priv->port_count; i++) { port = priv->port_list[i]; if (port->priv->percpu_pools) { - for (i = 0; i < port->nrxqs; i++) - mvpp2_bm_pool_update_fc(port, &port->priv->bm_pools[i], + for (j = 0; j < port->nrxqs; j++) + mvpp2_bm_pool_update_fc(port, &port->priv->bm_pools[j], port->tx_fc & en); } else { mvpp2_bm_pool_update_fc(port, port->pool_long, port->tx_fc & en); From e4c4638b6a10427d30e29d22351c375886025f47 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 30 Jul 2024 15:35:47 +0200 Subject: [PATCH 0309/1052] spi: spidev: Add missing spi_device_id for bh2228fv When the of_device_id entry for "rohm,bh2228fv" was added, the corresponding spi_device_id was forgotten, causing a warning message during boot-up: SPI driver spidev has no spi_device_id for rohm,bh2228fv Fix module autoloading and shut up the warning by adding the missing entry. Fixes: fc28d1c1fe3b3e2f ("spi: spidev: add correct compatible for Rohm BH2228FV") Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/cb571d4128f41175f31319cd9febc829417ea167.1722346539.git.geert+renesas@glider.be Signed-off-by: Mark Brown --- drivers/spi/spidev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index 05e6d007f9a7..5304728c68c2 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -700,6 +700,7 @@ static const struct class spidev_class = { }; static const struct spi_device_id spidev_spi_ids[] = { + { .name = "bh2228fv" }, { .name = "dh2228fv" }, { .name = "ltc2488" }, { .name = "sx1301" }, From 22f5468731491e53356ba7c028f0fdea20b18e2c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 30 Jul 2024 10:36:47 -0700 Subject: [PATCH 0310/1052] minmax: improve macro expansion and type checking This clarifies the rules for min()/max()/clamp() type checking and makes them a much more efficient macro expansion. In particular, we now look at the type and range of the inputs to see whether they work together, generating a mask of acceptable comparisons, and then just verifying that the inputs have a shared case: - an expression with a signed type can be used for (1) signed comparisons (2) unsigned comparisons if it is statically known to have a non-negative value - an expression with an unsigned type can be used for (3) unsigned comparison (4) signed comparisons if the type is smaller than 'int' and thus the C integer promotion rules will make it signed anyway Here rule (1) and (3) are obvious, and rule (2) is important in order to allow obvious trivial constants to be used together with unsigned values. Rule (4) is not necessarily a good idea, but matches what we used to do, and we have extant cases of this situation in the kernel. 
Notably with bcachefs having an expression like min(bch2_bucket_sectors_dirty(a), ca->mi.bucket_size) where bch2_bucket_sectors_dirty() returns an 's64', and 'ca->mi.bucket_size' is of type 'u16'. Technically that bcachefs comparison is clearly sensible on a C type level, because the 'u16' will go through the normal C integer promotion, and become 'int', and then we're comparing two signed values and everything looks sane. However, it's not entirely clear that a 'min(s64,u16)' operation makes a lot of conceptual sense, and it's possible that we will remove rule (4). After all, the _reason_ we have these complicated type checks is exactly that the C type promotion rules are not very intuitive. But at least for now the rule is in place for backwards compatibility. Also note that rule (2) existed before, but is hugely relaxed by this commit. It used to be true only for the simplest compile-time non-negative integer constants. The new macro model will allow cases where the compiler can trivially see that an expression is non-negative even if it isn't necessarily a constant. For example, the amdgpu driver does min_t(size_t, sizeof(fru_info->serial), pia[addr] & 0x3F)); because our old 'min()' macro would see that 'pia[addr] & 0x3F' is of type 'int' and clearly not a C constant expression, so doing a 'min()' with a 'size_t' is a signedness violation. Our new 'min()' macro still sees that 'pia[addr] & 0x3F' is of type 'int', but is smart enough to also see that it is clearly non-negative, and thus would allow that case without any complaints. Cc: Arnd Bergmann Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 9 +++++ include/linux/minmax.h | 74 ++++++++++++++++++++++++++++++++-------- 2 files changed, 68 insertions(+), 15 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 2594553bb30b..2df665fa2964 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -296,6 +296,15 @@ static inline void *offset_to_ptr(const int *off) #define is_signed_type(type) (((type)(-1)) < (__force type)1) #define is_unsigned_type(type) (!is_signed_type(type)) +/* + * Useful shorthand for "is this condition known at compile-time?" + * + * Note that the condition may involve non-constant values, + * but the compiler may know enough about the details of the + * values to determine that the condition is statically true. + */ +#define statically_true(x) (__builtin_constant_p(x) && (x)) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example. diff --git a/include/linux/minmax.h b/include/linux/minmax.h index e3e4353df983..41da6f85a407 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -26,19 +26,63 @@ #define __typecheck(x, y) \ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) -/* is_signed_type() isn't a constexpr for pointer types */ -#define __is_signed(x) \ - __builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))), \ - is_signed_type(typeof(x)), 0) +/* + * __sign_use for integer expressions: + * bit #0 set if ok for unsigned comparisons + * bit #1 set if ok for signed comparisons + * + * In particular, statically non-negative signed integer + * expressions are ok for both. + * + * NOTE! Unsigned types smaller than 'int' are implicitly + * converted to 'int' in expressions, and are accepted for + * signed conversions for now. This is debatable. 
+ * + * Note that 'x' is the original expression, and 'ux' is + * the unique variable that contains the value. + * + * We use 'ux' for pure type checking, and 'x' for when + * we need to look at the value (but without evaluating + * it for side effects! Careful to only ever evaluate it + * with sizeof() or __builtin_constant_p() etc). + * + * Pointers end up being checked by the normal C type + * rules at the actual comparison, and these expressions + * only need to be careful to not cause warnings for + * pointer use. + */ +#define __signed_type_use(x,ux) (2+__is_nonneg(x,ux)) +#define __unsigned_type_use(x,ux) (1+2*(sizeof(ux)<4)) +#define __sign_use(x,ux) (is_signed_type(typeof(ux))? \ + __signed_type_use(x,ux):__unsigned_type_use(x,ux)) -/* True for a non-negative signed int constant */ -#define __is_noneg_int(x) \ - (__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0) +/* + * To avoid warnings about casting pointers to integers + * of different sizes, we need that special sign type. + * + * On 64-bit we can just always use 'long', since any + * integer or pointer type can just be cast to that. + * + * This does not work for 128-bit signed integers since + * the cast would truncate them, but we do not use s128 + * types in the kernel (we do use 'u128', but they will + * be handled by the !is_signed_type() case). + * + * NOTE! The cast is there only to avoid any warnings + * from when values that aren't signed integer types. + */ +#ifdef CONFIG_64BIT + #define __signed_type(ux) long +#else + #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux)>4,1LL,1L)) +#endif +#define __is_nonneg(x,ux) statically_true((__signed_type(ux))(x)>=0) -#define __types_ok(x, y, ux, uy) \ - (__is_signed(ux) == __is_signed(uy) || \ - __is_signed((ux) + 0) == __is_signed((uy) + 0) || \ - __is_noneg_int(x) || __is_noneg_int(y)) +#define __types_ok(x,y,ux,uy) \ + (__sign_use(x,ux) & __sign_use(y,uy)) + +#define __types_ok3(x,y,z,ux,uy,uz) \ + (__sign_use(x,ux) & __sign_use(y,uy) & __sign_use(z,uz)) #define __cmp_op_min < #define __cmp_op_max > @@ -53,8 +97,8 @@ #define __careful_cmp_once(op, x, y, ux, uy) ({ \ __auto_type ux = (x); __auto_type uy = (y); \ - static_assert(__types_ok(x, y, ux, uy), \ - #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ + BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy), \ + #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) #define __careful_cmp(op, x, y) \ @@ -70,8 +114,8 @@ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - static_assert(__types_ok(uval, lo, uval, ulo), "clamp() 'lo' signedness error"); \ - static_assert(__types_ok(uval, hi, uval, uhi), "clamp() 'hi' signedness error"); \ + BUILD_BUG_ON_MSG(!__types_ok3(val,lo,hi,uval,ulo,uhi), \ + "clamp("#val", "#lo", "#hi") signedness error"); \ __clamp(uval, ulo, uhi); }) #define __careful_clamp(val, lo, hi) \ From 6be6cba9c4371d27f78d900ccfe34bb880d9ee20 Mon Sep 17 00:00:00 2001 From: Yipeng Zou Date: Tue, 30 Jul 2024 09:44:00 +0800 Subject: [PATCH 0311/1052] irqchip/mbigen: Fix mbigen node address layout The mbigen interrupt chip has its per node registers located in a contiguous region of page sized chunks. The code maps them into virtual address space as a contiguous region and determines the address of a node by using the node ID as index. mbigen chip |-----------------|------------|--------------| mgn_node_0 mgn_node_1 ... 
mgn_node_i |--------------| |--------------| |----------------------| [0x0000, 0x0x0FFF] [0x1000, 0x1FFF] [i*0x1000, (i+1)*0x1000 - 1] This works correctly up to 10 nodes, but then fails because the 11th's array slot is used for the MGN_CLEAR registers. mbigen chip |-----------|--------|--------|---------------|--------| mgn_node_0 mgn_node_1 ... mgn_clear_register ... mgn_node_i |-----------------| [0xA000, 0xAFFF] Skip the MGN_CLEAR register space when calculating the offset for node IDs greater than or equal to ten. Fixes: a6c2f87b8820 ("irqchip/mbigen: Implement the mbigen irq chip operation functions") Signed-off-by: Yipeng Zou Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240730014400.1751530-1-zouyipeng@huawei.com --- drivers/irqchip/irq-mbigen.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-mbigen.c b/drivers/irqchip/irq-mbigen.c index 093fd42893a7..53cc08387588 100644 --- a/drivers/irqchip/irq-mbigen.c +++ b/drivers/irqchip/irq-mbigen.c @@ -64,6 +64,20 @@ struct mbigen_device { void __iomem *base; }; +static inline unsigned int get_mbigen_node_offset(unsigned int nid) +{ + unsigned int offset = nid * MBIGEN_NODE_OFFSET; + + /* + * To avoid touched clear register in unexpected way, we need to directly + * skip clear register when access to more than 10 mbigen nodes. + */ + if (nid >= (REG_MBIGEN_CLEAR_OFFSET / MBIGEN_NODE_OFFSET)) + offset += MBIGEN_NODE_OFFSET; + + return offset; +} + static inline unsigned int get_mbigen_vec_reg(irq_hw_number_t hwirq) { unsigned int nid, pin; @@ -72,8 +86,7 @@ static inline unsigned int get_mbigen_vec_reg(irq_hw_number_t hwirq) nid = hwirq / IRQS_PER_MBIGEN_NODE + 1; pin = hwirq % IRQS_PER_MBIGEN_NODE; - return pin * 4 + nid * MBIGEN_NODE_OFFSET - + REG_MBIGEN_VEC_OFFSET; + return pin * 4 + get_mbigen_node_offset(nid) + REG_MBIGEN_VEC_OFFSET; } static inline void get_mbigen_type_reg(irq_hw_number_t hwirq, @@ -88,8 +101,7 @@ static inline void get_mbigen_type_reg(irq_hw_number_t hwirq, *mask = 1 << (irq_ofst % 32); ofst = irq_ofst / 32 * 4; - *addr = ofst + nid * MBIGEN_NODE_OFFSET - + REG_MBIGEN_TYPE_OFFSET; + *addr = ofst + get_mbigen_node_offset(nid) + REG_MBIGEN_TYPE_OFFSET; } static inline void get_mbigen_clear_reg(irq_hw_number_t hwirq, From 781f0bbbdade00f91490a3f64212e8cc8d75905c Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 24 Jul 2024 04:11:20 -0700 Subject: [PATCH 0312/1052] tools/bpf: Fix the wrong format specifier The format specifier of "unsigned int" in printf() should be "%u", not "%d". 
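[Editorial illustration, not part of the patch: a stand-alone sketch of why the specifier matters; the value and variable name are assumptions, not taken from bpftool.]

    #include <stdio.h>

    int main(void)
    {
        unsigned int insn_off = 3000000000u;  /* larger than INT_MAX */

        printf("%d\n", insn_off);   /* formally undefined; typically prints -1294967296 */
        printf("%u\n", insn_off);   /* prints 3000000000, the intended value */
        return 0;
    }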
Signed-off-by: Zhu Jun Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240724111120.11625-1-zhujun2@cmss.chinamobile.com --- tools/bpf/bpftool/xlated_dumper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 567f56dfd9f1..d0094345fb2b 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -349,7 +349,7 @@ void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len, double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); - printf("% 4d: ", i); + printf("%4u: ", i); print_bpf_insn(&cbs, insn + i, true); if (opcodes) { @@ -415,7 +415,7 @@ void dump_xlated_for_graph(struct dump_data *dd, void *buf_start, void *buf_end, } } - printf("%d: ", insn_off); + printf("%u: ", insn_off); print_bpf_insn(&cbs, cur, true); if (opcodes) { From 7764b9622db4382b2797b54a70f292c8da6ef417 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 26 Jul 2024 20:08:47 +0200 Subject: [PATCH 0313/1052] bpf/selftests: Fix ASSERT_OK condition check in uprobe_syscall test Fixing ASSERT_OK condition check in uprobe_syscall test, otherwise we return from test on pipe success. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240726180847.684584-1-jolsa@kernel.org --- tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index bd8c75b620c2..797de47f8197 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -253,7 +253,7 @@ static void test_uretprobe_syscall_call(void) struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c; - if (ASSERT_OK(pipe(go), "pipe")) + if (!ASSERT_OK(pipe(go), "pipe")) return; skel = uprobe_syscall_executed__open_and_load(); From c0247800ee7da5bc067b2916cf2722f755072307 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:17 -0700 Subject: [PATCH 0314/1052] selftests/bpf: Use portable POSIX basename() Use the POSIX version of basename() to allow compilation against non-gnu libc (e.g. musl). Include ahead of to enable using functions from the latter while preferring POSIX over GNU basename(). In veristat.c, rely on strdupa() to avoid basename() altering the passed "const char" argument. This is not needed in xskxceiver.c since the arg is mutable and the program exits immediately after usage. Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/0fd3c9f3c605e6cba33504213c9df287817ade04.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/veristat.c | 8 +++++--- tools/testing/selftests/bpf/xskxceiver.c | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index b2854238d4a0..11ec1190d582 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -2,6 +2,7 @@ /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ #define _GNU_SOURCE #include +#include #include #include #include @@ -988,8 +989,8 @@ static void fixup_obj(struct bpf_object *obj, struct bpf_program *prog, const ch static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog) { + const char *base_filename = basename(strdupa(filename)); const char *prog_name = bpf_program__name(prog); - const char *base_filename = basename(filename); char *buf; int buf_sz, log_level; struct verif_stats *stats; @@ -1056,13 +1057,14 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf static int process_obj(const char *filename) { + const char *base_filename = basename(strdupa(filename)); struct bpf_object *obj = NULL, *tobj; struct bpf_program *prog, *tprog, *lprog; libbpf_print_fn_t old_libbpf_print_fn; LIBBPF_OPTS(bpf_object_open_opts, opts); int err = 0, prog_cnt = 0; - if (!should_process_file_prog(basename(filename), NULL)) { + if (!should_process_file_prog(base_filename, NULL)) { if (env.verbose) printf("Skipping '%s' due to filters...\n", filename); env.files_skipped++; @@ -1076,7 +1078,7 @@ static int process_obj(const char *filename) } if (!env.quiet && env.out_fmt == RESFMT_TABLE) - printf("Processing '%s'...\n", basename(filename)); + printf("Processing '%s'...\n", base_filename); old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn); obj = bpf_object__open_file(filename, &opts); diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 8144fd145237..92af633faea8 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include From 03bfcda1fbc37ef34aa21d2b9e09138335afc6ee Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:18 -0700 Subject: [PATCH 0315/1052] selftests/bpf: Fix arg parsing in veristat, test_progs Current code parses arguments with strtok_r() using a construct like char *state = NULL; while ((next = strtok_r(state ? NULL : input, ",", &state))) { ... } where logic assumes the 'state' var can distinguish between first and subsequent strtok_r() calls, and adjusts parameters accordingly. However, 'state' is strictly internal context for strtok_r() and no such assumptions are supported in the man page. Moreover, the exact behaviour of 'state' depends on the libc implementation, making the above code fragile. Indeed, invoking "./test_progs -t " on mips64el/musl will hang, with the above code in an infinite loop. Similarly, we see strange behaviour running 'veristat' on mips64el/musl: $ ./veristat -e file,prog,verdict,insns -C two-ok add-failure Can't specify more than 9 stats Rewrite code using a counter to distinguish between strtok_r() calls. 
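For illustration only (not part of the patch), a minimal standalone sketch of the counter-based pattern; the input string is made up:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        char *input = strdup("file,prog,verdict,insns");
        char *state = NULL, *next;
        int cnt = 0;

        if (!input)
            return 1;

        /* Pass the buffer only on the first call and leave 'state' as
         * strtok_r()'s private context, instead of abusing it to detect
         * the first iteration. */
        while ((next = strtok_r(cnt++ ? NULL : input, ",", &state)))
            printf("token: %s\n", next);

        free(input);
        return 0;
    }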
Fixes: 61ddff373ffa ("selftests/bpf: Improve by-name subtest selection logic in prog_tests") Fixes: 394169b079b5 ("selftests/bpf: add comparison mode to veristat") Fixes: c8bc5e050976 ("selftests/bpf: Add veristat tool for mass-verifying BPF object files") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/392d8bf5559f85fa37926c1494e62312ef252c3d.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/testing_helpers.c | 4 ++-- tools/testing/selftests/bpf/veristat.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index ac7c66f4fc7b..c217e12bd9da 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -221,13 +221,13 @@ int parse_test_list(const char *s, bool is_glob_pattern) { char *input, *state = NULL, *test_spec; - int err = 0; + int err = 0, cnt = 0; input = strdup(s); if (!input) return -ENOMEM; - while ((test_spec = strtok_r(state ? NULL : input, ",", &state))) { + while ((test_spec = strtok_r(cnt++ ? NULL : input, ",", &state))) { err = insert_test(set, test_spec, is_glob_pattern); if (err) break; diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 11ec1190d582..1ec5c4c47235 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -785,13 +785,13 @@ static int parse_stat(const char *stat_name, struct stat_specs *specs) static int parse_stats(const char *stats_str, struct stat_specs *specs) { char *input, *state = NULL, *next; - int err; + int err, cnt = 0; input = strdup(stats_str); if (!input) return -ENOMEM; - while ((next = strtok_r(state ? NULL : input, ",", &state))) { + while ((next = strtok_r(cnt++ ? NULL : input, ",", &state))) { err = parse_stat(next, specs); if (err) { free(input); @@ -1495,7 +1495,7 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs, while (fgets(line, sizeof(line), f)) { char *input = line, *state = NULL, *next; struct verif_stats *st = NULL; - int col = 0; + int col = 0, cnt = 0; if (!header) { void *tmp; @@ -1513,7 +1513,7 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs, *stat_cntp += 1; } - while ((next = strtok_r(state ? NULL : input, ",\n", &state))) { + while ((next = strtok_r(cnt++ ? NULL : input, ",\n", &state))) { if (header) { /* for the first line, set up spec stats */ err = parse_stat(next, specs); From cacf2a5a78cd1f5f616eae043ebc6f024104b721 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:19 -0700 Subject: [PATCH 0316/1052] selftests/bpf: Fix error compiling test_lru_map.c Although the post-increment in macro 'CPU_SET(next++, &cpuset)' seems safe, the sequencing can raise compile errors, so move the increment outside the macro. 
This avoids an error seen using gcc 12.3.0 for mips64el/musl-libc: In file included from test_lru_map.c:11: test_lru_map.c: In function 'sched_next_online': test_lru_map.c:129:29: error: operation on 'next' may be undefined [-Werror=sequence-point] 129 | CPU_SET(next++, &cpuset); | ^ cc1: all warnings being treated as errors Fixes: 3fbfadce6012 ("bpf: Fix test_lru_sanity5() in test_lru_map.c") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/22993dfb11ccf27925a626b32672fd3324cb76c4.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_lru_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index 4d0650cfb5cd..fda7589c5023 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -126,7 +126,8 @@ static int sched_next_online(int pid, int *next_to_try) while (next < nr_cpus) { CPU_ZERO(&cpuset); - CPU_SET(next++, &cpuset); + CPU_SET(next, &cpuset); + next++; if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset)) { ret = 0; break; From aa95073fd290b5b3e45f067fa22bb25e59e1ff7c Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:20 -0700 Subject: [PATCH 0317/1052] selftests/bpf: Fix C++ compile error from missing _Bool type While building, bpftool makes a skeleton from test_core_extern.c, which itself includes and uses the 'bool' type. However, the skeleton test_core_extern.skel.h generated *does not* include or use the 'bool' type, instead using the C-only '_Bool' type. Compiling test_cpp.cpp with g++ 12.3 for mips64el/musl-libc then fails with error: In file included from test_cpp.cpp:9: test_core_extern.skel.h:45:17: error: '_Bool' does not name a type 45 | _Bool CONFIG_BOOL; | ^~~~~ This was likely missed previously because glibc uses a GNU extension for with C++ (#define _Bool bool), not supported by musl libc. Normally, a C fragment would include and use the 'bool' type, and thus cleanly work after import by C++. The ideal fix would be for 'bpftool gen skeleton' to output the correct type/include supporting C++, but in the meantime add a conditional define as above. 
Fixes: 7c8dce4b1661 ("bpftool: Make skeleton C code compilable with C++ compiler") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/6fc1dd28b8bda49e51e4f610bdc9d22f4455632d.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_cpp.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index dde0bb16e782..abc2a56ab261 100644 --- a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -6,6 +6,10 @@ #include #include #include + +#ifndef _Bool +#define _Bool bool +#endif #include "test_core_extern.skel.h" #include "struct_ops_module.skel.h" From 16b795cc59528cf280abc79af3c70bda42f715b9 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:21 -0700 Subject: [PATCH 0318/1052] selftests/bpf: Fix redefinition errors compiling lwt_reroute.c Compiling lwt_reroute.c with GCC 12.3 for mips64el/musl-libc yields errors: In file included from .../include/arpa/inet.h:9, from ./test_progs.h:18, from tools/testing/selftests/bpf/prog_tests/lwt_helpers.h:11, from tools/testing/selftests/bpf/prog_tests/lwt_reroute.c:52: .../include/netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from .../include/linux/icmp.h:24, from tools/testing/selftests/bpf/prog_tests/lwt_helpers.h:9: .../include/linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../include/netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../include/linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../include/netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../include/linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ These errors occur because is included before , bypassing the Linux uapi/libc compat mechanism's partial musl support. As described in [1] and [2], fix these errors by including in lwt_reroute.c before any uapi headers. [1]: commit c0bace798436 ("uapi libc compat: add fallback for unsupported libcs") [2]: https://git.musl-libc.org/cgit/musl/commit/?id=04983f227238 Fixes: 6c77997bc639 ("selftests/bpf: Add lwt_xmit tests for BPF_REROUTE") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/bd2908aec0755ba8b75f5dc41848b00585f5c73e.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/lwt_reroute.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c index 03825d2b45a8..6c50c0f63f43 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c @@ -49,6 +49,7 @@ * is not crashed, it is considered successful. */ #define NETNS "ns_lwt_reroute" +#include #include "lwt_helpers.h" #include "network_helpers.h" #include From c9a83e76b5a96801a2c7ea0a79ca77c356d8b38d Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:22 -0700 Subject: [PATCH 0319/1052] selftests/bpf: Fix compile if backtrace support missing in libc Include GNU header only with glibc and provide weak, stubbed backtrace functions as a fallback in test_progs.c. This allows for non-GNU replacements while avoiding compile errors (e.g. 
with musl libc) like: test_progs.c:13:10: fatal error: execinfo.h: No such file or directory 13 | #include /* backtrace */ | ^~~~~~~~~~~~ test_progs.c: In function 'crash_handler': test_progs.c:1034:14: error: implicit declaration of function 'backtrace' [-Werror=implicit-function-declaration] 1034 | sz = backtrace(bt, ARRAY_SIZE(bt)); | ^~~~~~~~~ test_progs.c:1045:9: error: implicit declaration of function 'backtrace_symbols_fd' [-Werror=implicit-function-declaration] 1045 | backtrace_symbols_fd(bt, sz, STDERR_FILENO); | ^~~~~~~~~~~~~~~~~~~~ Fixes: 9fb156bb82a3 ("selftests/bpf: Print backtrace on SIGSEGV in test_progs") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/aa6dc8e23710cb457b278039d0081de7e7b4847d.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_progs.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 60c5ec0f6abf..d5d0cb4eb197 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -10,7 +10,6 @@ #include #include #include -#include /* backtrace */ #include /* get_nprocs */ #include #include @@ -19,6 +18,21 @@ #include #include "json_writer.h" +#ifdef __GLIBC__ +#include /* backtrace */ +#endif + +/* Default backtrace funcs if missing at link */ +__weak int backtrace(void **buffer, int size) +{ + return 0; +} + +__weak void backtrace_symbols_fd(void *const *buffer, int size, int fd) +{ + dprintf(fd, "\n"); +} + static bool verbose(void) { return env.verbosity > VERBOSE_NONE; From 06eeca1217a8655975f340e0ead6a396df74a2a4 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:23 -0700 Subject: [PATCH 0320/1052] selftests/bpf: Fix using stdout, stderr as struct field names Typically stdin, stdout, stderr are treated as reserved identifiers under ISO/ANSI C and libc implementations further define these as macros, both in glibc and musl . However, while glibc defines: ... /* Standard streams. */ extern FILE *stdin; /* Standard input stream. */ extern FILE *stdout; /* Standard output stream. */ extern FILE *stderr; /* Standard error output stream. */ /* C89/C99 say they're macros. Make them happy. */ #define stdin stdin #define stdout stdout #define stderr stderr ... musl instead uses (legally): ... extern FILE *const stdin; extern FILE *const stdout; extern FILE *const stderr; #define stdin (stdin) #define stdout (stdout) #define stderr (stderr) ... The latter results in compile errors when the names are reused as fields of 'struct test_env' and elsewhere in test_progs.[ch] and reg_bounds.c. 
Rename the fields to stdout_saved and stderr_saved to avoid many errors seen building against musl, e.g.: In file included from test_progs.h:6, from test_progs.c:5: test_progs.c: In function 'print_test_result': test_progs.c:237:21: error: expected identifier before '(' token 237 | fprintf(env.stdout, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); | ^~~~~~ test_progs.c:237:9: error: too few arguments to function 'fprintf' 237 | fprintf(env.stdout, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); | ^~~~~~~ Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/ZqR2DuHdBXPX%2Fyx8@kodidev-ubuntu/ Link: https://lore.kernel.org/bpf/684ea17548e237f39dfb3f7a3d33450069015b21.1722244708.git.tony.ambardar@gmail.com --- .../selftests/bpf/prog_tests/reg_bounds.c | 2 +- tools/testing/selftests/bpf/test_progs.c | 66 +++++++++---------- tools/testing/selftests/bpf/test_progs.h | 8 +-- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index 0da4225749bd..467027236d30 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -1487,7 +1487,7 @@ static int verify_case_opt(struct ctx *ctx, enum num_t init_t, enum num_t cond_t u64 elapsed_ns = get_time_ns() - ctx->start_ns; double remain_ns = elapsed_ns / progress * (1 - progress); - fprintf(env.stderr, "PROGRESS (%s): %d/%d (%.2lf%%), " + fprintf(env.stderr_saved, "PROGRESS (%s): %d/%d (%.2lf%%), " "elapsed %llu mins (%.2lf hrs), " "ETA %.0lf mins (%.2lf hrs)\n", ctx->progress_ctx, diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index d5d0cb4eb197..60fafa2f1ed7 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -51,15 +51,15 @@ static void stdio_hijack_init(char **log_buf, size_t *log_cnt) stdout = open_memstream(log_buf, log_cnt); if (!stdout) { - stdout = env.stdout; + stdout = env.stdout_saved; perror("open_memstream"); return; } if (env.subtest_state) - env.subtest_state->stdout = stdout; + env.subtest_state->stdout_saved = stdout; else - env.test_state->stdout = stdout; + env.test_state->stdout_saved = stdout; stderr = stdout; #endif @@ -73,8 +73,8 @@ static void stdio_hijack(char **log_buf, size_t *log_cnt) return; } - env.stdout = stdout; - env.stderr = stderr; + env.stdout_saved = stdout; + env.stderr_saved = stderr; stdio_hijack_init(log_buf, log_cnt); #endif @@ -91,13 +91,13 @@ static void stdio_restore_cleanup(void) fflush(stdout); if (env.subtest_state) { - fclose(env.subtest_state->stdout); - env.subtest_state->stdout = NULL; - stdout = env.test_state->stdout; - stderr = env.test_state->stdout; + fclose(env.subtest_state->stdout_saved); + env.subtest_state->stdout_saved = NULL; + stdout = env.test_state->stdout_saved; + stderr = env.test_state->stdout_saved; } else { - fclose(env.test_state->stdout); - env.test_state->stdout = NULL; + fclose(env.test_state->stdout_saved); + env.test_state->stdout_saved = NULL; } #endif } @@ -110,13 +110,13 @@ static void stdio_restore(void) return; } - if (stdout == env.stdout) + if (stdout == env.stdout_saved) return; stdio_restore_cleanup(); - stdout = env.stdout; - stderr = env.stderr; + stdout = env.stdout_saved; + stderr = env.stderr_saved; #endif } @@ -244,25 +244,25 @@ static void print_test_result(const struct prog_test_def *test, const struct tes int 
skipped_cnt = test_state->skip_cnt; int subtests_cnt = test_state->subtest_num; - fprintf(env.stdout, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); + fprintf(env.stdout_saved, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); if (test_state->error_cnt) - fprintf(env.stdout, "FAIL"); + fprintf(env.stdout_saved, "FAIL"); else if (!skipped_cnt) - fprintf(env.stdout, "OK"); + fprintf(env.stdout_saved, "OK"); else if (skipped_cnt == subtests_cnt || !subtests_cnt) - fprintf(env.stdout, "SKIP"); + fprintf(env.stdout_saved, "SKIP"); else - fprintf(env.stdout, "OK (SKIP: %d/%d)", skipped_cnt, subtests_cnt); + fprintf(env.stdout_saved, "OK (SKIP: %d/%d)", skipped_cnt, subtests_cnt); - fprintf(env.stdout, "\n"); + fprintf(env.stdout_saved, "\n"); } static void print_test_log(char *log_buf, size_t log_cnt) { log_buf[log_cnt] = '\0'; - fprintf(env.stdout, "%s", log_buf); + fprintf(env.stdout_saved, "%s", log_buf); if (log_buf[log_cnt - 1] != '\n') - fprintf(env.stdout, "\n"); + fprintf(env.stdout_saved, "\n"); } static void print_subtest_name(int test_num, int subtest_num, @@ -273,14 +273,14 @@ static void print_subtest_name(int test_num, int subtest_num, snprintf(test_num_str, sizeof(test_num_str), "%d/%d", test_num, subtest_num); - fprintf(env.stdout, "#%-*s %s/%s", + fprintf(env.stdout_saved, "#%-*s %s/%s", TEST_NUM_WIDTH, test_num_str, test_name, subtest_name); if (result) - fprintf(env.stdout, ":%s", result); + fprintf(env.stdout_saved, ":%s", result); - fprintf(env.stdout, "\n"); + fprintf(env.stdout_saved, "\n"); } static void jsonw_write_log_message(json_writer_t *w, char *log_buf, size_t log_cnt) @@ -465,7 +465,7 @@ bool test__start_subtest(const char *subtest_name) memset(subtest_state, 0, sub_state_size); if (!subtest_name || !subtest_name[0]) { - fprintf(env.stderr, + fprintf(env.stderr_saved, "Subtest #%d didn't provide sub-test name!\n", state->subtest_num); return false; @@ -473,7 +473,7 @@ bool test__start_subtest(const char *subtest_name) subtest_state->name = strdup(subtest_name); if (!subtest_state->name) { - fprintf(env.stderr, + fprintf(env.stderr_saved, "Subtest #%d: failed to copy subtest name!\n", state->subtest_num); return false; @@ -1043,7 +1043,7 @@ void crash_handler(int signum) sz = backtrace(bt, ARRAY_SIZE(bt)); - if (env.stdout) + if (env.stdout_saved) stdio_restore(); if (env.test) { env.test_state->error_cnt++; @@ -1359,7 +1359,7 @@ static void calculate_summary_and_print_errors(struct test_env *env) if (env->json) { w = jsonw_new(env->json); if (!w) - fprintf(env->stderr, "Failed to create new JSON stream."); + fprintf(env->stderr_saved, "Failed to create new JSON stream."); } if (w) { @@ -1708,8 +1708,8 @@ int main(int argc, char **argv) return -1; } - env.stdout = stdout; - env.stderr = stderr; + env.stdout_saved = stdout; + env.stderr_saved = stderr; env.has_testmod = true; if (!env.list_test_names) { @@ -1717,7 +1717,7 @@ int main(int argc, char **argv) unload_bpf_testmod(verbose()); if (load_bpf_testmod(verbose())) { - fprintf(env.stderr, "WARNING! Selftests relying on bpf_testmod.ko will be skipped.\n"); + fprintf(env.stderr_saved, "WARNING! 
Selftests relying on bpf_testmod.ko will be skipped.\n"); env.has_testmod = false; } } @@ -1795,7 +1795,7 @@ int main(int argc, char **argv) } if (env.list_test_names) { - fprintf(env.stdout, "%s\n", test->test_name); + fprintf(env.stdout_saved, "%s\n", test->test_name); env.succ_cnt++; continue; } diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index b1e949fb16cf..cb9d6d46826b 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -75,7 +75,7 @@ struct subtest_state { bool skipped; bool filtered; - FILE *stdout; + FILE *stdout_saved; }; struct test_state { @@ -92,7 +92,7 @@ struct test_state { size_t log_cnt; char *log_buf; - FILE *stdout; + FILE *stdout_saved; }; struct test_env { @@ -111,8 +111,8 @@ struct test_env { struct test_state *test_state; /* current running test state */ struct subtest_state *subtest_state; /* current running subtest state */ - FILE *stdout; - FILE *stderr; + FILE *stdout_saved; + FILE *stderr_saved; int nr_cpus; FILE *json; From 21c5f4f55da759c7444a1ef13e90b6e6f674eeeb Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:24 -0700 Subject: [PATCH 0321/1052] selftests/bpf: Fix error compiling tc_redirect.c with musl libc Linux 5.1 implemented 64-bit time types and related syscalls to address the Y2038 problem generally across archs. Userspace handling of Y2038 varies with the libc however. While musl libc uses 64-bit time across all 32-bit and 64-bit platforms, GNU glibc uses 64-bit time on 64-bit platforms but defaults to 32-bit time on 32-bit platforms unless they "opt-in" to 64-bit time or explicitly use 64-bit syscalls and time structures. One specific area is the standard setsockopt() call, SO_TIMESTAMPNS option used for timestamping, and the related output 'struct timespec'. GNU glibc defaults as above, also exposing the SO_TIMESTAMPNS_NEW flag to explicitly use a 64-bit call and 'struct __kernel_timespec'. Since these are not exposed or needed with musl libc, their use in tc_redirect.c leads to compile errors building for mips64el/musl: tc_redirect.c: In function 'rcv_tstamp': tc_redirect.c:425:32: error: 'SO_TIMESTAMPNS_NEW' undeclared (first use in this function); did you mean 'SO_TIMESTAMPNS'? 425 | cmsg->cmsg_type == SO_TIMESTAMPNS_NEW) | ^~~~~~~~~~~~~~~~~~ | SO_TIMESTAMPNS tc_redirect.c:425:32: note: each undeclared identifier is reported only once for each function it appears in tc_redirect.c: In function 'test_inet_dtime': tc_redirect.c:491:49: error: 'SO_TIMESTAMPNS_NEW' undeclared (first use in this function); did you mean 'SO_TIMESTAMPNS'? 491 | err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, | ^~~~~~~~~~~~~~~~~~ | SO_TIMESTAMPNS However, using SO_TIMESTAMPNS_NEW isn't strictly needed, nor is Y2038 being explicitly tested. The timestamp checks in tc_redirect.c are simple: the packet receive timestamp is non-zero and processed/handled in less than 5 seconds. Switch to using the standard setsockopt() call and SO_TIMESTAMPNS option to ensure compatibility across glibc and musl libc. In the worst-case, there is a 5-second window 14 years from now where tc_redirect tests may fail on 32-bit systems. However, we should reasonably expect glibc to adopt a 64-bit mandate rather than the current "opt-in" policy before the Y2038 roll-over. 
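For illustration only (not part of the patch), a minimal sketch of the portable pattern: enable plain SO_TIMESTAMPNS and read the receive timestamp as a standard struct timespec from the control message. The helper name is made up and 'fd' is assumed to be an already-connected datagram socket:

    #include <stddef.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <time.h>

    static ssize_t rcv_with_tstamp(int fd, char *buf, size_t len, struct timespec *ts)
    {
        char ctl[CMSG_SPACE(sizeof(struct timespec))] = {0};
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct msghdr msg = {
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = ctl, .msg_controllen = sizeof(ctl),
        };
        struct cmsghdr *cmsg;
        int one = 1;
        ssize_t ret;

        if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS, &one, sizeof(one)))
            return -1;

        ret = recvmsg(fd, &msg, 0);
        if (ret < 0)
            return ret;

        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
            if (cmsg->cmsg_level == SOL_SOCKET &&
                cmsg->cmsg_type == SO_TIMESTAMPNS)
                memcpy(ts, CMSG_DATA(cmsg), sizeof(*ts));

        return ret;
    }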
Fixes: ce6f6cffaeaa ("selftests/bpf: Wait for the netstamp_needed_key static key to be turned on") Fixes: c803475fd8dd ("bpf: selftests: test skb->tstamp in redirect_neigh") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/031d656c058b4e55ceae56ef49c4e1729b5090f3.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 327d51f59142..53b8ffc943dc 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -471,7 +471,7 @@ static int set_forwarding(bool enable) static int __rcv_tstamp(int fd, const char *expected, size_t s, __u64 *tstamp) { - struct __kernel_timespec pkt_ts = {}; + struct timespec pkt_ts = {}; char ctl[CMSG_SPACE(sizeof(pkt_ts))]; struct timespec now_ts; struct msghdr msg = {}; @@ -495,7 +495,7 @@ static int __rcv_tstamp(int fd, const char *expected, size_t s, __u64 *tstamp) cmsg = CMSG_FIRSTHDR(&msg); if (cmsg && cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SO_TIMESTAMPNS_NEW) + cmsg->cmsg_type == SO_TIMESTAMPNS) memcpy(&pkt_ts, CMSG_DATA(cmsg), sizeof(pkt_ts)); pkt_ns = pkt_ts.tv_sec * NSEC_PER_SEC + pkt_ts.tv_nsec; @@ -537,9 +537,9 @@ static int wait_netstamp_needed_key(void) if (!ASSERT_GE(srv_fd, 0, "start_server")) goto done; - err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS, &opt, sizeof(opt)); - if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS_NEW)")) + if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)")) goto done; cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS); @@ -621,9 +621,9 @@ static void test_inet_dtime(int family, int type, const char *addr, __u16 port) return; /* Ensure the kernel puts the (rcv) timestamp for all skb */ - err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS, &opt, sizeof(opt)); - if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS_NEW)")) + if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)")) goto done; if (type == SOCK_STREAM) { From 1cbe8143fd2f588031a5f157d15ae15ce12215e2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 30 Jul 2024 13:37:33 +0800 Subject: [PATCH 0322/1052] bpf: kprobe: Remove unused declaring of bpf_kprobe_override After the commit 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one"), "bpf_kprobe_override" is not used anywhere anymore, and we can remove it now. 
Fixes: 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one") Signed-off-by: Menglong Dong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240730053733.885785-1-dongml2@chinatelecom.cn --- include/linux/trace_events.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9df3e2973626..9435185c10ef 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -880,7 +880,6 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); -DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); From 84383b5ef4cd21b4a67de92afdc05a03b5247db9 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Thu, 25 Jul 2024 12:41:25 +0530 Subject: [PATCH 0323/1052] net: phy: micrel: Fix the KSZ9131 MDI-X status issue The MDIX status is not accurately reflecting the current state after the link partner has manually altered its MDIX configuration while operating in forced mode. Access information about Auto mdix completion and pair selection from the KSZ9131's Auto/MDI/MDI-X status register Fixes: b64e6a8794d9 ("net: phy: micrel: Add PHY Auto/MDI/MDI-X set driver for KSZ9131") Signed-off-by: Raju Lakkaraju Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20240725071125.13960-1-Raju.Lakkaraju@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index dd519805deee..65b0a3115e14 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1389,6 +1389,8 @@ static int ksz9131_config_init(struct phy_device *phydev) const struct device *dev_walker; int ret; + phydev->mdix_ctrl = ETH_TP_MDI_AUTO; + dev_walker = &phydev->mdio.dev; do { of_node = dev_walker->of_node; @@ -1438,28 +1440,30 @@ static int ksz9131_config_init(struct phy_device *phydev) #define MII_KSZ9131_AUTO_MDIX 0x1C #define MII_KSZ9131_AUTO_MDI_SET BIT(7) #define MII_KSZ9131_AUTO_MDIX_SWAP_OFF BIT(6) +#define MII_KSZ9131_DIG_AXAN_STS 0x14 +#define MII_KSZ9131_DIG_AXAN_STS_LINK_DET BIT(14) +#define MII_KSZ9131_DIG_AXAN_STS_A_SELECT BIT(12) static int ksz9131_mdix_update(struct phy_device *phydev) { int ret; - ret = phy_read(phydev, MII_KSZ9131_AUTO_MDIX); - if (ret < 0) - return ret; - - if (ret & MII_KSZ9131_AUTO_MDIX_SWAP_OFF) { - if (ret & MII_KSZ9131_AUTO_MDI_SET) - phydev->mdix_ctrl = ETH_TP_MDI; - else - phydev->mdix_ctrl = ETH_TP_MDI_X; + if (phydev->mdix_ctrl != ETH_TP_MDI_AUTO) { + phydev->mdix = phydev->mdix_ctrl; } else { - phydev->mdix_ctrl = ETH_TP_MDI_AUTO; - } + ret = phy_read(phydev, MII_KSZ9131_DIG_AXAN_STS); + if (ret < 0) + return ret; - if (ret & MII_KSZ9131_AUTO_MDI_SET) - phydev->mdix = ETH_TP_MDI; - else - phydev->mdix = ETH_TP_MDI_X; + if (ret & MII_KSZ9131_DIG_AXAN_STS_LINK_DET) { + if (ret & MII_KSZ9131_DIG_AXAN_STS_A_SELECT) + phydev->mdix = ETH_TP_MDI; + else + phydev->mdix = ETH_TP_MDI_X; + } else { + phydev->mdix = ETH_TP_MDI_INVALID; + } + } return 0; } From 32654bbd6313b4cfc82297e6634fa9725c3c900f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 26 Jul 2024 15:20:48 -0700 Subject: [PATCH 0324/1052] xsk: Try to make xdp_umem_reg extension a bit more future-proof We recently found out that extending xsk_umem_reg might be a bit complicated due to not enforcing padding to be zero [0]. 
Add a couple of things to make it less error-prone: 1. Remove xdp_umem_reg_v2 since its sizeof is the same as xdp_umem_reg 2. Add BUILD_BUG_ON that checks that the size of xdp_umem_reg_v1 is less than xdp_umem_reg; presumably, when we get to v2, there is gonna be a similar line to enforce that sizeof(v2) > sizeof(v1) 3. Add BUILD_BUG_ON to make sure the last field plus its size matches the overall struct size. The intent is to demonstrate that we don't have any lingering padding. 0: https://lore.kernel.org/bpf/ZqI29QE+5JnkdPmE@boxer/T/#me03113f7c2458fd08f3c4114a7a9472ac3646c98 Reported-by: Julian Schindel Cc: Magnus Karlsson Reviewed-by: Maciej Fijalkowski Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20240726222048.1397869-1-sdf@fomichev.me Signed-off-by: Martin KaFai Lau --- net/xdp/xsk.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7e16336044b2..1140b2a120ca 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1320,14 +1320,6 @@ struct xdp_umem_reg_v1 { __u32 headroom; }; -struct xdp_umem_reg_v2 { - __u64 addr; /* Start of packet data area */ - __u64 len; /* Length of packet data area */ - __u32 chunk_size; - __u32 headroom; - __u32 flags; -}; - static int xsk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -1371,10 +1363,19 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(struct xdp_umem_reg_v1)) return -EINVAL; - else if (optlen < sizeof(struct xdp_umem_reg_v2)) - mr_size = sizeof(struct xdp_umem_reg_v1); else if (optlen < sizeof(mr)) - mr_size = sizeof(struct xdp_umem_reg_v2); + mr_size = sizeof(struct xdp_umem_reg_v1); + + BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg)); + + /* Make sure the last field of the struct doesn't have + * uninitialized padding. All padding has to be explicit + * and has to be set to zero by the userspace to make + * struct xdp_umem_reg extensible in the future. + */ + BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) + + sizeof_field(struct xdp_umem_reg, tx_metadata_len) != + sizeof(struct xdp_umem_reg)); if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; From 92cc2456e9775dc4333fb4aa430763ae4ac2f2d9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Jul 2024 16:18:05 -0700 Subject: [PATCH 0325/1052] selftests/bpf: fix RELEASE=1 compilation for sock_addr.c When building selftests with RELEASE=1 using GCC compiler, it complaints about uninitialized err. Fix the problem. 
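For illustration only (not part of the patch), a made-up reduction of the warning class: -Wmaybe-uninitialized only fires in optimizing builds (which RELEASE=1 produces), and assigning in the default: branch silences it:

    #include <errno.h>
    #include <stdio.h>

    static int run_test(int type)
    {
        int err;

        switch (type) {
        case 0:
            err = 0;
            break;
        default:
            /* Without this assignment, "gcc -O2 -Wall" may report that
             * 'err' is used uninitialized on the default path. */
            err = -EINVAL;
            break;
        }
        return err;
    }

    int main(void)
    {
        printf("%d\n", run_test(1));
        return 0;
    }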
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240730231805.1933923-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index b880c564a204..a6ee7f8d4f79 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -2642,6 +2642,7 @@ void test_sock_addr(void) break; default: ASSERT_TRUE(false, "Unknown sock addr test type"); + err = -EINVAL; break; } From e61dd678601eac53d501dda1eb1bcffec7b11bd6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2024 18:21:32 -0400 Subject: [PATCH 0326/1052] bcachefs: Fix double free of ca->buckets_nouse Reported-by: Dan Carpenter Fixes: ffcbec6076 ("bcachefs: Kill opts.buckets_nouse") Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0455a1001fec..e7fa2de35014 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1193,7 +1193,6 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); - kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); From a7f3abcf635767b2e19c4c55c4c35756595ebc86 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 29 Jul 2024 17:03:14 +0200 Subject: [PATCH 0327/1052] net: phy: aquantia: only poll GLOBAL_CFG regs on aqr113, aqr113c and aqr115c Commit 708405f3e56e ("net: phy: aquantia: wait for the GLOBAL_CFG to start returning real values") introduced a workaround for an issue observed on aqr115c. However there were never any reports of it happening on other models and the workaround has been reported to cause and issue on aqr113c (and it may cause the same on any other model not supporting 10M mode). Let's limit the impact of the workaround to aqr113, aqr113c and aqr115c and poll the 100M GLOBAL_CFG register instead as both models are known to support it correctly. 
Reported-by: Jon Hunter Closes: https://lore.kernel.org/lkml/7c0140be-4325-4005-9068-7e0fc5ff344d@nvidia.com/ Fixes: 708405f3e56e ("net: phy: aquantia: wait for the GLOBAL_CFG to start returning real values") Tested-by: Jon Hunter Signed-off-by: Bartosz Golaszewski Reviewed-by: Antoine Tenart Link: https://patch.msgid.link/20240729150315.65798-1-brgl@bgdev.pl Signed-off-by: Jakub Kicinski --- drivers/net/phy/aquantia/aquantia_main.c | 29 +++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/net/phy/aquantia/aquantia_main.c b/drivers/net/phy/aquantia/aquantia_main.c index d12e35374231..e982e9ce44a5 100644 --- a/drivers/net/phy/aquantia/aquantia_main.c +++ b/drivers/net/phy/aquantia/aquantia_main.c @@ -653,13 +653,7 @@ static int aqr107_fill_interface_modes(struct phy_device *phydev) unsigned long *possible = phydev->possible_interfaces; unsigned int serdes_mode, rate_adapt; phy_interface_t interface; - int i, val, ret; - - ret = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1, - VEND1_GLOBAL_CFG_10M, val, val != 0, - 1000, 100000, false); - if (ret) - return ret; + int i, val; /* Walk the media-speed configuration registers to determine which * host-side serdes modes may be used by the PHY depending on the @@ -708,6 +702,25 @@ static int aqr107_fill_interface_modes(struct phy_device *phydev) return 0; } +static int aqr113c_fill_interface_modes(struct phy_device *phydev) +{ + int val, ret; + + /* It's been observed on some models that - when coming out of suspend + * - the FW signals that the PHY is ready but the GLOBAL_CFG registers + * continue on returning zeroes for some time. Let's poll the 100M + * register until it returns a real value as both 113c and 115c support + * this mode. + */ + ret = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1, + VEND1_GLOBAL_CFG_100M, val, val != 0, + 1000, 100000, false); + if (ret) + return ret; + + return aqr107_fill_interface_modes(phydev); +} + static int aqr113c_config_init(struct phy_device *phydev) { int ret; @@ -725,7 +738,7 @@ static int aqr113c_config_init(struct phy_device *phydev) if (ret) return ret; - return aqr107_fill_interface_modes(phydev); + return aqr113c_fill_interface_modes(phydev); } static int aqr107_probe(struct phy_device *phydev) From 89add40066f9ed9abe5f7f886fe5789ff7e0c50e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 29 Jul 2024 16:10:12 -0400 Subject: [PATCH 0328/1052] net: drop bad gso csum_start and offset in virtio_net_hdr Tighten csum_start and csum_offset checks in virtio_net_hdr_to_skb for GSO packets. The function already checks that a checksum requested with VIRTIO_NET_HDR_F_NEEDS_CSUM is in skb linear. But for GSO packets this might not hold for segs after segmentation. 
Syzkaller demonstrated to reach this warning in skb_checksum_help offset = skb_checksum_start_offset(skb); ret = -EINVAL; if (WARN_ON_ONCE(offset >= skb_headlen(skb))) By injecting a TSO packet: WARNING: CPU: 1 PID: 3539 at net/core/dev.c:3284 skb_checksum_help+0x3d0/0x5b0 ip_do_fragment+0x209/0x1b20 net/ipv4/ip_output.c:774 ip_finish_output_gso net/ipv4/ip_output.c:279 [inline] __ip_finish_output+0x2bd/0x4b0 net/ipv4/ip_output.c:301 iptunnel_xmit+0x50c/0x930 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x2296/0x2c70 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x759/0xa60 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4850 [inline] netdev_start_xmit include/linux/netdevice.h:4864 [inline] xmit_one net/core/dev.c:3595 [inline] dev_hard_start_xmit+0x261/0x8c0 net/core/dev.c:3611 __dev_queue_xmit+0x1b97/0x3c90 net/core/dev.c:4261 packet_snd net/packet/af_packet.c:3073 [inline] The geometry of the bad input packet at tcp_gso_segment: [ 52.003050][ T8403] skb len=12202 headroom=244 headlen=12093 tailroom=0 [ 52.003050][ T8403] mac=(168,24) mac_len=24 net=(192,52) trans=244 [ 52.003050][ T8403] shinfo(txflags=0 nr_frags=1 gso(size=1552 type=3 segs=0)) [ 52.003050][ T8403] csum(0x60000c7 start=199 offset=1536 ip_summed=3 complete_sw=0 valid=0 level=0) Mitigate with stricter input validation. csum_offset: for GSO packets, deduce the correct value from gso_type. This is already done for USO. Extend it to TSO. Let UFO be: udp[46]_ufo_fragment ignores these fields and always computes the checksum in software. csum_start: finding the real offset requires parsing to the transport header. Do not add a parser, use existing segmentation parsing. Thanks to SKB_GSO_DODGY, that also catches bad packets that are hw offloaded. Again test both TSO and USO. Do not test UFO for the above reason, and do not test UDP tunnel offload. GSO packet are almost always CHECKSUM_PARTIAL. USO packets may be CHECKSUM_NONE since commit 10154dbded6d6 ("udp: Allow GSO transmit from devices with no checksum offload"), but then still these fields are initialized correctly in udp4_hwcsum/udp6_hwcsum_outgoing. So no need to test for ip_summed == CHECKSUM_PARTIAL first. This revises an existing fix mentioned in the Fixes tag, which broke small packets with GSO offload, as detected by kselftests. 
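For illustration only (not part of the patch), a tiny standalone program, using only the uapi headers for offsetof(), that prints the sole csum_offset values a well-formed TSO or USO packet can carry, i.e. what "deduce the correct value from gso_type" pins them to:

    #include <stdio.h>
    #include <stddef.h>
    #include <linux/tcp.h>
    #include <linux/udp.h>

    int main(void)
    {
        /* For SKB_GSO_TCPV4/TCPV6, csum_offset must point at the TCP
         * checksum field; for SKB_GSO_UDP_L4, at the UDP one. */
        printf("TCP check offset: %zu\n", offsetof(struct tcphdr, check)); /* 16 */
        printf("UDP check offset: %zu\n", offsetof(struct udphdr, check)); /*  6 */
        return 0;
    }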
Link: https://syzkaller.appspot.com/bug?extid=e1db31216c789f552871 Link: https://lore.kernel.org/netdev/20240723223109.2196886-1-kuba@kernel.org Fixes: e269d79c7d35 ("net: missing check virtio") Cc: stable@vger.kernel.org Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20240729201108.1615114-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 16 +++++----------- net/ipv4/tcp_offload.c | 3 +++ net/ipv4/udp_offload.c | 4 ++++ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index d1d7825318c3..6c395a2600e8 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -56,7 +56,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, unsigned int thlen = 0; unsigned int p_off = 0; unsigned int ip_proto; - u64 ret, remainder, gso_size; if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { @@ -99,16 +98,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16)); - if (hdr->gso_size) { - gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); - ret = div64_u64_rem(skb->len, gso_size, &remainder); - if (!(ret && (hdr->gso_size > needed) && - ((remainder > needed) || (remainder == 0)))) { - return -EINVAL; - } - skb_shinfo(skb)->tx_flags |= SKBFL_SHARED_FRAG; - } - if (!pskb_may_pull(skb, needed)) return -EINVAL; @@ -182,6 +171,11 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (gso_type != SKB_GSO_UDP_L4) return -EINVAL; break; + case SKB_GSO_TCPV4: + case SKB_GSO_TCPV6: + if (skb->csum_offset != offsetof(struct tcphdr, check)) + return -EINVAL; + break; } /* Kernel has a special handling for GSO_BY_FRAGS. */ diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 4b791e74529e..e4ad3311e148 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -140,6 +140,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, if (thlen < sizeof(*th)) goto out; + if (unlikely(skb_checksum_start(skb) != skb_transport_header(skb))) + goto out; + if (!pskb_may_pull(skb, thlen)) goto out; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index aa2e0a28ca61..bc8a9da750fe 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -278,6 +278,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (gso_skb->len <= sizeof(*uh) + mss) return ERR_PTR(-EINVAL); + if (unlikely(skb_checksum_start(gso_skb) != + skb_transport_header(gso_skb))) + return ERR_PTR(-EINVAL); + if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh), From 16d731890db94e23d5483402494ef378f2271ba1 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Wed, 31 Jul 2024 09:19:50 +0200 Subject: [PATCH 0329/1052] dt-bindings: usb: microchip,usb2514: Add USB2517 compatible USB2517 is a 7-port variant of this USB hub. Add an USB compatible based on USB vendor & product ID. 
Signed-off-by: Alexander Stein Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240731071950.989113-1-alexander.stein@ew.tq-group.com Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/usb/microchip,usb2514.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/usb/microchip,usb2514.yaml b/Documentation/devicetree/bindings/usb/microchip,usb2514.yaml index 783c27591e56..245e8c3ce669 100644 --- a/Documentation/devicetree/bindings/usb/microchip,usb2514.yaml +++ b/Documentation/devicetree/bindings/usb/microchip,usb2514.yaml @@ -18,6 +18,7 @@ properties: - usb424,2412 - usb424,2417 - usb424,2514 + - usb424,2517 reg: true From 228a953e61d6d608a3facc1c3a27b9fb03c99de7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 17 Jul 2024 11:50:53 +0200 Subject: [PATCH 0330/1052] usb: gadget: midi2: Fix the response for FB info with block 0xff When the block number 0xff is given to Function Block Discovery message, the device should return the information of all Function Blocks, but currently the gadget driver treats it as an error. Implement the proper behavior for the block 0xff instead. Fixes: 8b645922b223 ("usb: gadget: Add support for USB MIDI 2.0 function driver") Cc: stable@vger.kernel.org Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240717095102.10493-1-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi2.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c index 38e8ed3144f0..3f63253ad3e0 100644 --- a/drivers/usb/gadget/function/f_midi2.c +++ b/drivers/usb/gadget/function/f_midi2.c @@ -642,12 +642,21 @@ static void process_ump_stream_msg(struct f_midi2_ep *ep, const u32 *data) if (format) return; // invalid blk = (*data >> 8) & 0xff; - if (blk >= ep->num_blks) - return; - if (*data & UMP_STREAM_MSG_REQUEST_FB_INFO) - reply_ump_stream_fb_info(ep, blk); - if (*data & UMP_STREAM_MSG_REQUEST_FB_NAME) - reply_ump_stream_fb_name(ep, blk); + if (blk == 0xff) { + /* inquiry for all blocks */ + for (blk = 0; blk < ep->num_blks; blk++) { + if (*data & UMP_STREAM_MSG_REQUEST_FB_INFO) + reply_ump_stream_fb_info(ep, blk); + if (*data & UMP_STREAM_MSG_REQUEST_FB_NAME) + reply_ump_stream_fb_name(ep, blk); + } + } else if (blk < ep->num_blks) { + /* only the specified block */ + if (*data & UMP_STREAM_MSG_REQUEST_FB_INFO) + reply_ump_stream_fb_info(ep, blk); + if (*data & UMP_STREAM_MSG_REQUEST_FB_NAME) + reply_ump_stream_fb_name(ep, blk); + } return; } } From 76a7bfc445b8e9893c091e24ccfd4f51dfdc0a70 Mon Sep 17 00:00:00 2001 From: Chris Wulff Date: Sun, 21 Jul 2024 15:23:15 -0400 Subject: [PATCH 0331/1052] usb: gadget: u_audio: Check return codes from usb_ep_enable and config_ep_by_speed. These functions can fail if descriptors are malformed, or missing, for the selected USB speed. 
Fixes: eb9fecb9e69b ("usb: gadget: f_uac2: split out audio core") Fixes: 24f779dac8f3 ("usb: gadget: f_uac2/u_audio: add feedback endpoint support") Cc: stable@vger.kernel.org Signed-off-by: Chris Wulff Link: https://lore.kernel.org/r/20240721192314.3532697-2-crwulff@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/u_audio.c | 42 ++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/drivers/usb/gadget/function/u_audio.c b/drivers/usb/gadget/function/u_audio.c index 89af0feb7512..24299576972f 100644 --- a/drivers/usb/gadget/function/u_audio.c +++ b/drivers/usb/gadget/function/u_audio.c @@ -592,16 +592,25 @@ int u_audio_start_capture(struct g_audio *audio_dev) struct usb_ep *ep, *ep_fback; struct uac_rtd_params *prm; struct uac_params *params = &audio_dev->params; - int req_len, i; + int req_len, i, ret; prm = &uac->c_prm; dev_dbg(dev, "start capture with rate %d\n", prm->srate); ep = audio_dev->out_ep; - config_ep_by_speed(gadget, &audio_dev->func, ep); + ret = config_ep_by_speed(gadget, &audio_dev->func, ep); + if (ret < 0) { + dev_err(dev, "config_ep_by_speed for out_ep failed (%d)\n", ret); + return ret; + } + req_len = ep->maxpacket; prm->ep_enabled = true; - usb_ep_enable(ep); + ret = usb_ep_enable(ep); + if (ret < 0) { + dev_err(dev, "usb_ep_enable failed for out_ep (%d)\n", ret); + return ret; + } for (i = 0; i < params->req_number; i++) { if (!prm->reqs[i]) { @@ -629,9 +638,18 @@ int u_audio_start_capture(struct g_audio *audio_dev) return 0; /* Setup feedback endpoint */ - config_ep_by_speed(gadget, &audio_dev->func, ep_fback); + ret = config_ep_by_speed(gadget, &audio_dev->func, ep_fback); + if (ret < 0) { + dev_err(dev, "config_ep_by_speed in_ep_fback failed (%d)\n", ret); + return ret; // TODO: Clean up out_ep + } + prm->fb_ep_enabled = true; - usb_ep_enable(ep_fback); + ret = usb_ep_enable(ep_fback); + if (ret < 0) { + dev_err(dev, "usb_ep_enable failed for in_ep_fback (%d)\n", ret); + return ret; // TODO: Clean up out_ep + } req_len = ep_fback->maxpacket; req_fback = usb_ep_alloc_request(ep_fback, GFP_ATOMIC); @@ -687,13 +705,17 @@ int u_audio_start_playback(struct g_audio *audio_dev) struct uac_params *params = &audio_dev->params; unsigned int factor; const struct usb_endpoint_descriptor *ep_desc; - int req_len, i; + int req_len, i, ret; unsigned int p_pktsize; prm = &uac->p_prm; dev_dbg(dev, "start playback with rate %d\n", prm->srate); ep = audio_dev->in_ep; - config_ep_by_speed(gadget, &audio_dev->func, ep); + ret = config_ep_by_speed(gadget, &audio_dev->func, ep); + if (ret < 0) { + dev_err(dev, "config_ep_by_speed for in_ep failed (%d)\n", ret); + return ret; + } ep_desc = ep->desc; /* @@ -720,7 +742,11 @@ int u_audio_start_playback(struct g_audio *audio_dev) uac->p_residue_mil = 0; prm->ep_enabled = true; - usb_ep_enable(ep); + ret = usb_ep_enable(ep); + if (ret < 0) { + dev_err(dev, "usb_ep_enable failed for in_ep (%d)\n", ret); + return ret; + } for (i = 0; i < params->req_number; i++) { if (!prm->reqs[i]) { From afdcfd3d6fcdeca2735ca8d994c5f2d24a368f0a Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 9 Jul 2024 13:38:41 +0200 Subject: [PATCH 0332/1052] usb: vhci-hcd: Do not drop references before new references are gained At a few places the driver carries stale pointers to references that can still be used. Make sure that does not happen. This strictly speaking closes ZDI-CAN-22273, though there may be similar races in the driver. 
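For illustration only (not part of the patch), a toy refcount sketch of the ordering rule being enforced: take the reference on the new object before dropping the one held through the stored pointer, so the pointer never refers to an object that may already be gone. Names are made up; the driver itself uses usb_get_dev()/usb_put_dev():

    #include <stdlib.h>

    struct obj {
        int refs;
    };

    static struct obj *obj_get(struct obj *o)
    {
        if (o)
            o->refs++;
        return o;
    }

    static void obj_put(struct obj *o)
    {
        if (o && --o->refs == 0)
            free(o);
    }

    static struct obj *stored;

    static void replace_stored(struct obj *new_obj)
    {
        struct obj *old = stored;

        stored = obj_get(new_obj);  /* gain the new reference first */
        obj_put(old);               /* only then release the old one */
    }

    int main(void)
    {
        struct obj *a = calloc(1, sizeof(*a));
        struct obj *b = calloc(1, sizeof(*b));

        if (!a || !b)
            return 1;

        a->refs = b->refs = 1;      /* caller's own references */
        replace_stored(a);
        replace_stored(b);
        replace_stored(NULL);
        obj_put(a);
        obj_put(b);
        return 0;
    }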
Signed-off-by: Oliver Neukum Cc: stable Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20240709113851.14691-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/vhci_hcd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 82650c11e451..302a89aeb258 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -745,6 +745,7 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag * */ if (usb_pipedevice(urb->pipe) == 0) { + struct usb_device *old; __u8 type = usb_pipetype(urb->pipe); struct usb_ctrlrequest *ctrlreq = (struct usb_ctrlrequest *) urb->setup_packet; @@ -755,14 +756,15 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag goto no_need_xmit; } + old = vdev->udev; switch (ctrlreq->bRequest) { case USB_REQ_SET_ADDRESS: /* set_address may come when a device is reset */ dev_info(dev, "SetAddress Request (%d) to port %d\n", ctrlreq->wValue, vdev->rhport); - usb_put_dev(vdev->udev); vdev->udev = usb_get_dev(urb->dev); + usb_put_dev(old); spin_lock(&vdev->ud.lock); vdev->ud.status = VDEV_ST_USED; @@ -781,8 +783,8 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag usbip_dbg_vhci_hc( "Not yet?:Get_Descriptor to device 0 (get max pipe size)\n"); - usb_put_dev(vdev->udev); vdev->udev = usb_get_dev(urb->dev); + usb_put_dev(old); goto out; default: @@ -1067,6 +1069,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud) static void vhci_device_reset(struct usbip_device *ud) { struct vhci_device *vdev = container_of(ud, struct vhci_device, ud); + struct usb_device *old = vdev->udev; unsigned long flags; spin_lock_irqsave(&ud->lock, flags); @@ -1074,8 +1077,8 @@ static void vhci_device_reset(struct usbip_device *ud) vdev->speed = 0; vdev->devid = 0; - usb_put_dev(vdev->udev); vdev->udev = NULL; + usb_put_dev(old); if (ud->tcp_socket) { sockfd_put(ud->tcp_socket); From 973a57891608a98e894db2887f278777f564de18 Mon Sep 17 00:00:00 2001 From: Chris Wulff Date: Wed, 24 Jul 2024 21:04:20 -0400 Subject: [PATCH 0333/1052] usb: gadget: core: Check for unset descriptor Make sure the descriptor has been set before looking at maxpacket. This fixes a null pointer panic in this case. This may happen if the gadget doesn't properly set up the endpoint for the current speed, or the gadget descriptors are malformed and the descriptor for the speed/endpoint are not found. No current gadget driver is known to have this problem, but this may cause a hard-to-find bug during development of new gadgets. Fixes: 54f83b8c8ea9 ("USB: gadget: Reject endpoints with 0 maxpacket value") Cc: stable@vger.kernel.org Signed-off-by: Chris Wulff Link: https://lore.kernel.org/r/20240725010419.314430-2-crwulff@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index b0a613758414..cf6478f97f4a 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -118,12 +118,10 @@ int usb_ep_enable(struct usb_ep *ep) goto out; /* UDC drivers can't handle endpoints with maxpacket size 0 */ - if (usb_endpoint_maxp(ep->desc) == 0) { - /* - * We should log an error message here, but we can't call - * dev_err() because there's no way to find the gadget - * given only ep. 
- */ + if (!ep->desc || usb_endpoint_maxp(ep->desc) == 0) { + WARN_ONCE(1, "%s: ep%d (%s) has %s\n", __func__, ep->address, ep->name, + (!ep->desc) ? "NULL descriptor" : "maxpacket 0"); + ret = -EINVAL; goto out; } From e885f5f1f2b43575aa8e4e31404132d77d6663d1 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Mon, 29 Jul 2024 10:42:58 +0200 Subject: [PATCH 0334/1052] usb: typec: fsa4480: Check if the chip is really there Currently, the driver will happily register the switch/mux devices, and so long as the i2c master doesn't complain, the user would never know there's something wrong. Add a device id check (based on [1]) and return -ENODEV if the read fails or returns nonsense. Checking the value on a Qualcomm SM6115P-based Lenovo Tab P11 tablet, the ID mentioned in the datasheet does indeed show up: fsa4480 1-0042: Found FSA4480 v1.1 (Vendor ID = 0) [1] https://www.onsemi.com/pdf/datasheet/fsa4480-d.pdf Fixes: 1dc246320c6b ("usb: typec: mux: Add On Semi fsa4480 driver") Cc: stable Reviewed-by: Dmitry Baryshkov Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240729-topic-fs4480_check-v3-1-f5bf732d3424@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux/fsa4480.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/usb/typec/mux/fsa4480.c b/drivers/usb/typec/mux/fsa4480.c index cb7cdf90cb0a..cd235339834b 100644 --- a/drivers/usb/typec/mux/fsa4480.c +++ b/drivers/usb/typec/mux/fsa4480.c @@ -13,6 +13,10 @@ #include #include +#define FSA4480_DEVICE_ID 0x00 + #define FSA4480_DEVICE_ID_VENDOR_ID GENMASK(7, 6) + #define FSA4480_DEVICE_ID_VERSION_ID GENMASK(5, 3) + #define FSA4480_DEVICE_ID_REV_ID GENMASK(2, 0) #define FSA4480_SWITCH_ENABLE 0x04 #define FSA4480_SWITCH_SELECT 0x05 #define FSA4480_SWITCH_STATUS1 0x07 @@ -251,6 +255,7 @@ static int fsa4480_probe(struct i2c_client *client) struct typec_switch_desc sw_desc = { }; struct typec_mux_desc mux_desc = { }; struct fsa4480 *fsa; + int val = 0; int ret; fsa = devm_kzalloc(dev, sizeof(*fsa), GFP_KERNEL); @@ -268,6 +273,15 @@ static int fsa4480_probe(struct i2c_client *client) if (IS_ERR(fsa->regmap)) return dev_err_probe(dev, PTR_ERR(fsa->regmap), "failed to initialize regmap\n"); + ret = regmap_read(fsa->regmap, FSA4480_DEVICE_ID, &val); + if (ret || !val) + return dev_err_probe(dev, -ENODEV, "FSA4480 not found\n"); + + dev_dbg(dev, "Found FSA4480 v%lu.%lu (Vendor ID = %lu)\n", + FIELD_GET(FSA4480_DEVICE_ID_VERSION_ID, val), + FIELD_GET(FSA4480_DEVICE_ID_REV_ID, val), + FIELD_GET(FSA4480_DEVICE_ID_VENDOR_ID, val)); + /* Safe mode */ fsa->cur_enable = FSA4480_ENABLE_DEVICE | FSA4480_ENABLE_USB; fsa->mode = TYPEC_STATE_SAFE; From 3c526089a663e25ac78b6a61d84a52a83680d0c3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 12 Jul 2024 09:05:50 -0500 Subject: [PATCH 0335/1052] usb: typec: tcpci: Fix error code in tcpci_check_std_output_cap() The tcpci_check_std_output_cap() function is supposed to return negative error codes but it's declared as type bool so the error handling doesn't work. Declare it as an int instead. 
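For illustration only (not part of the patch), a userspace re-creation of the bitfield-extraction idiom the fix uses to split the device-id register into vendor[7:6], version[5:3] and rev[2:0]. MASK() and GET_FIELD() are simplified stand-ins for the kernel's GENMASK()/FIELD_GET(), and the register value is made up:

    #include <stdio.h>
    #include <stdint.h>

    #define MASK(h, l)      (((1u << ((h) - (l) + 1)) - 1) << (l))
    #define GET_FIELD(m, v) (((v) & (m)) >> __builtin_ctz(m))

    int main(void)
    {
        uint8_t id = 0x09;  /* made-up readback: version 1, rev 1, vendor 0 */

        printf("Found FSA4480 v%u.%u (Vendor ID = %u)\n",
               GET_FIELD(MASK(5, 3), id),
               GET_FIELD(MASK(2, 0), id),
               GET_FIELD(MASK(7, 6), id));
        return 0;
    }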
Fixes: 62ce9ef14797 ("usb: typec: tcpci: add support to set connector orientation") Signed-off-by: Dan Carpenter Reviewed-by: Heikki Krogerus Reviewed-by: Marco Felsch Link: https://lore.kernel.org/r/b0880888-6719-4614-91fc-8ee63b71d304@stanley.mountain Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpci.c b/drivers/usb/typec/tcpm/tcpci.c index b862fdf3fe1d..3e3dcb983dde 100644 --- a/drivers/usb/typec/tcpm/tcpci.c +++ b/drivers/usb/typec/tcpm/tcpci.c @@ -67,7 +67,7 @@ static int tcpci_write16(struct tcpci *tcpci, unsigned int reg, u16 val) return regmap_raw_write(tcpci->regmap, reg, &val, sizeof(u16)); } -static bool tcpci_check_std_output_cap(struct regmap *regmap, u8 mask) +static int tcpci_check_std_output_cap(struct regmap *regmap, u8 mask) { unsigned int reg; int ret; From 5a444bea37e2759549ef72bfe83d1c8712e76b3d Mon Sep 17 00:00:00 2001 From: Prashanth K Date: Tue, 30 Jul 2024 18:27:54 +0530 Subject: [PATCH 0336/1052] usb: gadget: u_serial: Set start_delayed during suspend Upstream commit aba3a8d01d62 ("usb: gadget: u_serial: add suspend resume callbacks") added the start_delayed flag, so that new ports which are opened after USB suspend can start IO while resuming. But if the port was already opened, and gadget suspend kicks in afterwards, start_delayed will never be set. This causes resume to bail out before calling gs_start_io(). Fix this by setting start_delayed during suspend. Fixes: aba3a8d01d62 ("usb: gadget: u_serial: add suspend resume callbacks") Cc: stable@vger.kernel.org Signed-off-by: Prashanth K Link: https://lore.kernel.org/r/20240730125754.576326-1-quic_prashk@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/u_serial.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c index eec7f7a2e40f..b394105e55d6 100644 --- a/drivers/usb/gadget/function/u_serial.c +++ b/drivers/usb/gadget/function/u_serial.c @@ -1441,6 +1441,7 @@ void gserial_suspend(struct gserial *gser) spin_lock(&port->port_lock); spin_unlock(&serial_port_lock); port->suspended = true; + port->start_delayed = true; spin_unlock_irqrestore(&port->port_lock, flags); } EXPORT_SYMBOL_GPL(gserial_suspend); From 8290b567621ba4e3ccf45ec9d67e0507196c5ddc Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Wed, 24 Jul 2024 09:23:50 -0700 Subject: [PATCH 0337/1052] usb: typec: tipd: Fix dereferencing freeing memory in tps6598x_apply_patch() release_firmware() already frees fw, fix this by moving release_firmware() after the dereference.
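The underlying pattern is plain use-after-free ordering. A userspace analogue (illustrative names only, not the tipd driver API) of the buggy error path and the fixed one:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fw_blob {
	size_t size;
	char name[32];
};

static void report_failure(const struct fw_blob *fw)
{
	fprintf(stderr, "failed to write patch %s of %zu bytes\n", fw->name, fw->size);
}

int main(void)
{
	struct fw_blob *fw = malloc(sizeof(*fw));
	int ret = -1;	/* pretend the update failed */

	if (!fw)
		return 1;
	fw->size = 4096;
	strcpy(fw->name, "example-patch.bin");

	/* Buggy order: free(fw); if (ret) report_failure(fw);  -> use after free */

	if (ret)	/* fixed order: last use first ... */
		report_failure(fw);
	free(fw);	/* ... then release the memory */
	return 0;
}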
Fixes: 916b8e5fa73d ("usb: typec: tipd: add error log to provide firmware name and size") Signed-off-by: Harshit Mogalapalli Reviewed-by: Heikki Krogerus Reviewed-by: Javier Carrasco Link: https://lore.kernel.org/r/20240724162356.992763-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tipd/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/tipd/core.c b/drivers/usb/typec/tipd/core.c index ea768b19a7f1..eb5596e3406a 100644 --- a/drivers/usb/typec/tipd/core.c +++ b/drivers/usb/typec/tipd/core.c @@ -1191,11 +1191,11 @@ static int tps6598x_apply_patch(struct tps6598x *tps) dev_info(tps->dev, "Firmware update succeeded\n"); release_fw: - release_firmware(fw); if (ret) { dev_err(tps->dev, "Failed to write patch %s of %zu bytes\n", firmware_name, fw->size); } + release_firmware(fw); return ret; }; From b1dad2f091382b0049c72dab8153779248fa8016 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Wed, 24 Jul 2024 09:23:51 -0700 Subject: [PATCH 0338/1052] usb: typec: tipd: Delete extra semi-colon There shouldn't be a ; at the end of the function, delete it. Signed-off-by: Harshit Mogalapalli Reviewed-by: Javier Carrasco Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240724162356.992763-2-harshit.m.mogalapalli@oracle.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tipd/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/tipd/core.c b/drivers/usb/typec/tipd/core.c index eb5596e3406a..dd51a25480bf 100644 --- a/drivers/usb/typec/tipd/core.c +++ b/drivers/usb/typec/tipd/core.c @@ -1198,7 +1198,7 @@ static int tps6598x_apply_patch(struct tps6598x *tps) release_firmware(fw); return ret; -}; +} static int cd321x_init(struct tps6598x *tps) { From 6ccf9984d6be3c2f804087b736db05c2ec42664b Mon Sep 17 00:00:00 2001 From: Edmund Raile Date: Tue, 30 Jul 2024 19:53:26 +0000 Subject: [PATCH 0339/1052] Revert "ALSA: firewire-lib: obsolete workqueue for period update" prepare resolution of AB/BA deadlock competition for substream lock: restore workqueue previously used for process context: revert commit b5b519965c4c ("ALSA: firewire-lib: obsolete workqueue for period update") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/kwryofzdmjvzkuw6j3clftsxmoolynljztxqwg76hzeo4simnl@jn3eo7pe642q/ Signed-off-by: Edmund Raile Reviewed-by: Takashi Sakamoto Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20240730195318.869840-2-edmund.raile@protonmail.com --- sound/firewire/amdtp-stream.c | 15 +++++++++++++++ sound/firewire/amdtp-stream.h | 1 + 2 files changed, 16 insertions(+) diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c index d35d0a420ee0..31201d506a21 100644 --- a/sound/firewire/amdtp-stream.c +++ b/sound/firewire/amdtp-stream.c @@ -77,6 +77,8 @@ // overrun. Actual device can skip more, then this module stops the packet streaming. 
#define IR_JUMBO_PAYLOAD_MAX_SKIP_CYCLES 5 +static void pcm_period_work(struct work_struct *work); + /** * amdtp_stream_init - initialize an AMDTP stream structure * @s: the AMDTP stream to initialize @@ -105,6 +107,7 @@ int amdtp_stream_init(struct amdtp_stream *s, struct fw_unit *unit, s->flags = flags; s->context = ERR_PTR(-1); mutex_init(&s->mutex); + INIT_WORK(&s->period_work, pcm_period_work); s->packet_index = 0; init_waitqueue_head(&s->ready_wait); @@ -347,6 +350,7 @@ EXPORT_SYMBOL(amdtp_stream_get_max_payload); */ void amdtp_stream_pcm_prepare(struct amdtp_stream *s) { + cancel_work_sync(&s->period_work); s->pcm_buffer_pointer = 0; s->pcm_period_pointer = 0; } @@ -624,6 +628,16 @@ static void update_pcm_pointers(struct amdtp_stream *s, } } +static void pcm_period_work(struct work_struct *work) +{ + struct amdtp_stream *s = container_of(work, struct amdtp_stream, + period_work); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); + + if (pcm) + snd_pcm_period_elapsed(pcm); +} + static int queue_packet(struct amdtp_stream *s, struct fw_iso_packet *params, bool sched_irq) { @@ -1910,6 +1924,7 @@ static void amdtp_stream_stop(struct amdtp_stream *s) return; } + cancel_work_sync(&s->period_work); fw_iso_context_stop(s->context); fw_iso_context_destroy(s->context); s->context = ERR_PTR(-1); diff --git a/sound/firewire/amdtp-stream.h b/sound/firewire/amdtp-stream.h index a1ed2e80f91a..775db3fc4959 100644 --- a/sound/firewire/amdtp-stream.h +++ b/sound/firewire/amdtp-stream.h @@ -191,6 +191,7 @@ struct amdtp_stream { /* For a PCM substream processing. */ struct snd_pcm_substream *pcm; + struct work_struct period_work; snd_pcm_uframes_t pcm_buffer_pointer; unsigned int pcm_period_pointer; unsigned int pcm_frame_multiplier; From 3dab73ab925a51ab05543b491bf17463a48ca323 Mon Sep 17 00:00:00 2001 From: Edmund Raile Date: Tue, 30 Jul 2024 19:53:29 +0000 Subject: [PATCH 0340/1052] Revert "ALSA: firewire-lib: operate for period elapse event in process context" Commit 7ba5ca32fe6e ("ALSA: firewire-lib: operate for period elapse event in process context") removed the process context workqueue from amdtp_domain_stream_pcm_pointer() and update_pcm_pointers() to remove its overhead. With RME Fireface 800, this lead to a regression since Kernels 5.14.0, causing an AB/BA deadlock competition for the substream lock with eventual system freeze under ALSA operation: thread 0: * (lock A) acquire substream lock by snd_pcm_stream_lock_irq() in snd_pcm_status64() * (lock B) wait for tasklet to finish by calling tasklet_unlock_spin_wait() in tasklet_disable_in_atomic() in ohci_flush_iso_completions() of ohci.c thread 1: * (lock B) enter tasklet * (lock A) attempt to acquire substream lock, waiting for it to be released: snd_pcm_stream_lock_irqsave() in snd_pcm_period_elapsed() in update_pcm_pointers() in process_ctx_payloads() in process_rx_packets() of amdtp-stream.c ? tasklet_unlock_spin_wait ohci_flush_iso_completions firewire_ohci amdtp_domain_stream_pcm_pointer snd_firewire_lib snd_pcm_update_hw_ptr0 snd_pcm snd_pcm_status64 snd_pcm ? native_queued_spin_lock_slowpath _raw_spin_lock_irqsave snd_pcm_period_elapsed snd_pcm process_rx_packets snd_firewire_lib irq_target_callback snd_firewire_lib handle_it_packet firewire_ohci context_tasklet firewire_ohci Restore the process context work queue to prevent deadlock AB/BA deadlock competition for ALSA substream lock of snd_pcm_stream_lock_irq() in snd_pcm_status64() and snd_pcm_stream_lock_irqsave() in snd_pcm_period_elapsed(). 
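The same lock-ordering inversion can be reproduced in a few lines of userspace code. This is only a sketch (pthread mutexes standing in for the substream lock and the tasklet serialization); trylock is used so the program reports the conflict instead of hanging:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;	/* "substream lock" */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;	/* "tasklet" side */

static void *thread0(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_a);		/* snd_pcm_status64() path */
	usleep(10000);				/* widen the race window */
	if (pthread_mutex_trylock(&lock_b)) {	/* would spin-wait forever in the real bug */
		printf("thread0: holds A, cannot take B\n");
		pthread_mutex_unlock(&lock_a);
		return NULL;
	}
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

static void *thread1(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_b);		/* tasklet entered */
	usleep(10000);
	if (pthread_mutex_trylock(&lock_a)) {	/* snd_pcm_period_elapsed() path */
		printf("thread1: holds B, cannot take A -> AB/BA deadlock\n");
		pthread_mutex_unlock(&lock_b);
		return NULL;
	}
	pthread_mutex_unlock(&lock_a);
	pthread_mutex_unlock(&lock_b);
	return NULL;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, thread0, NULL);
	pthread_create(&t1, NULL, thread1, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);
	return 0;
}

Deferring the period-elapsed call to a workqueue, as this revert does, breaks the cycle: the tasklet no longer needs the substream lock, so the substream-lock holder can wait for the tasklet safely.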
Revert commit 7ba5ca32fe6e ("ALSA: firewire-lib: operate for period elapse event in process context") Replace the inline description to prevent a future deadlock. Cc: stable@vger.kernel.org Fixes: 7ba5ca32fe6e ("ALSA: firewire-lib: operate for period elapse event in process context") Reported-by: edmund.raile Closes: https://lore.kernel.org/r/kwryofzdmjvzkuw6j3clftsxmoolynljztxqwg76hzeo4simnl@jn3eo7pe642q/ Signed-off-by: Edmund Raile Reviewed-by: Takashi Sakamoto Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20240730195318.869840-3-edmund.raile@protonmail.com --- sound/firewire/amdtp-stream.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c index 31201d506a21..7438999e0510 100644 --- a/sound/firewire/amdtp-stream.c +++ b/sound/firewire/amdtp-stream.c @@ -615,16 +615,8 @@ static void update_pcm_pointers(struct amdtp_stream *s, // The program in user process should periodically check the status of intermediate // buffer associated to PCM substream to process PCM frames in the buffer, instead // of receiving notification of period elapsed by poll wait. - if (!pcm->runtime->no_period_wakeup) { - if (in_softirq()) { - // In software IRQ context for 1394 OHCI. - snd_pcm_period_elapsed(pcm); - } else { - // In process context of ALSA PCM application under acquired lock of - // PCM substream. - snd_pcm_period_elapsed_under_stream_lock(pcm); - } - } + if (!pcm->runtime->no_period_wakeup) + queue_work(system_highpri_wq, &s->period_work); } } @@ -1864,11 +1856,14 @@ unsigned long amdtp_domain_stream_pcm_pointer(struct amdtp_domain *d, { struct amdtp_stream *irq_target = d->irq_target; - // Process isochronous packets queued till recent isochronous cycle to handle PCM frames. if (irq_target && amdtp_stream_running(irq_target)) { - // In software IRQ context, the call causes dead-lock to disable the tasklet - // synchronously. - if (!in_softirq()) + // use wq to prevent AB/BA deadlock competition for + // substream lock: + // fw_iso_context_flush_completions() acquires + // lock by ohci_flush_iso_completions(), + // amdtp-stream process_rx_packets() attempts to + // acquire same lock by snd_pcm_elapsed() + if (current_work() != &s->period_work) fw_iso_context_flush_completions(irq_target->context); } From 6e73c490445ae77c52f62fcf9a49193d17c6f79a Mon Sep 17 00:00:00 2001 From: Luis Felipe Hernandez Date: Tue, 30 Jul 2024 20:15:59 -0400 Subject: [PATCH 0341/1052] platform/x86: msi-wmi-platform: Fix spelling mistakes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There were a few instances of typos that could lead to confusion when reading.
The following words have been corrected: Binay -> Binary singe -> single chaged -> changed Signed-off-by: Luis Felipe Hernandez Reviewed-by: Armin Wolf Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20240731001602.259338-1-luis.hernandez093@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- Documentation/wmi/devices/msi-wmi-platform.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/wmi/devices/msi-wmi-platform.rst b/Documentation/wmi/devices/msi-wmi-platform.rst index 29b1b2e6d42c..31a136942892 100644 --- a/Documentation/wmi/devices/msi-wmi-platform.rst +++ b/Documentation/wmi/devices/msi-wmi-platform.rst @@ -130,12 +130,12 @@ data using the `bmfdec `_ utility: Due to a peculiarity in how Windows handles the ``CreateByteField()`` ACPI operator (errors only happen when a invalid byte field is ultimately accessed), all methods require a 32 byte input -buffer, even if the Binay MOF says otherwise. +buffer, even if the Binary MOF says otherwise. The input buffer contains a single byte to select the subfeature to be accessed and 31 bytes of input data, the meaning of which depends on the subfeature being accessed. -The output buffer contains a singe byte which signals success or failure (``0x00`` on failure) +The output buffer contains a single byte which signals success or failure (``0x00`` on failure) and 31 bytes of output data, the meaning if which depends on the subfeature being accessed. WMI method Get_EC() @@ -147,7 +147,7 @@ data contains a flag byte and a 28 byte controller firmware version string. The first 4 bits of the flag byte contain the minor version of the embedded controller interface, with the next 2 bits containing the major version of the embedded controller interface. -The 7th bit signals if the embedded controller page chaged (exact meaning is unknown), and the +The 7th bit signals if the embedded controller page changed (exact meaning is unknown), and the last bit signals if the platform is a Tigerlake platform. The MSI software seems to only use this interface when the last bit is set. From 3114f77e9453daa292ec0906f313a715c69b5943 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 30 Jul 2024 15:59:30 +0000 Subject: [PATCH 0342/1052] platform/x86/intel/ifs: Initialize union ifs_status to zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the IFS scan test exits prematurely due to a timeout before completing a single run, the union ifs_status remains uninitialized, leading to incorrect test status reporting. To prevent this, always initialize the union ifs_status to zero. 
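This is the usual "result only written on the success path" trap. A userspace sketch (the union layout here is made up, not the real ifs_status bit layout):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

union status {
	uint64_t data;
	uint8_t bytes[8];
};

/* Bails out on a timeout without ever writing *out, like the interrupted scan. */
static int run_scan(int timed_out, union status *out)
{
	if (timed_out)
		return -1;
	out->data = 0x100;	/* only a completed run fills in a result */
	return 0;
}

int main(void)
{
	union status stale;
	union status zeroed = { 0 };	/* defined starting point: the fix */

	memset(&stale, 0xAA, sizeof(stale));	/* simulate leftover stack contents */

	run_scan(1, &stale);
	run_scan(1, &zeroed);

	printf("uninitialized-style result: %#llx (garbage reported as status)\n",
	       (unsigned long long)stale.data);
	printf("zero-initialized result:    %#llx\n",
	       (unsigned long long)zeroed.data);
	return 0;
}

The kernel patch uses the empty-brace form, = {}, which gcc accepts as an extension and C23 standardizes.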
Fixes: 2b40e654b73a ("platform/x86/intel/ifs: Add scan test support") Suggested-by: Ilpo Järvinen Reviewed-by: Jithu Joseph Reviewed-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://lore.kernel.org/r/20240730155930.1754744-1-sathyanarayanan.kuppuswamy@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/ifs/runtest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/ifs/runtest.c b/drivers/platform/x86/intel/ifs/runtest.c index 282e4bfe30da..be3d51ed0e47 100644 --- a/drivers/platform/x86/intel/ifs/runtest.c +++ b/drivers/platform/x86/intel/ifs/runtest.c @@ -221,8 +221,8 @@ static int doscan(void *data) */ static void ifs_test_core(int cpu, struct device *dev) { + union ifs_status status = {}; union ifs_scan activate; - union ifs_status status; unsigned long timeout; struct ifs_data *ifsd; int to_start, to_stop; From 6eabce6608d6f3440f4c03aa3d3ef50a47a3d193 Mon Sep 17 00:00:00 2001 From: George Kennedy Date: Wed, 17 Jul 2024 07:24:38 -0500 Subject: [PATCH 0343/1052] serial: core: check uartclk for zero to avoid divide by zero Calling ioctl TIOCSSERIAL with an invalid baud_base can result in uartclk being zero, which will result in a divide by zero error in uart_get_divisor(). The check for uartclk being zero in uart_set_info() needs to be done before other settings are made as subsequent calls to ioctl TIOCSSERIAL for the same port would be impacted if the uartclk check was done where uartclk gets set. Oops: divide error: 0000 PREEMPT SMP KASAN PTI RIP: 0010:uart_get_divisor (drivers/tty/serial/serial_core.c:580) Call Trace: serial8250_get_divisor (drivers/tty/serial/8250/8250_port.c:2576 drivers/tty/serial/8250/8250_port.c:2589) serial8250_do_set_termios (drivers/tty/serial/8250/8250_port.c:502 drivers/tty/serial/8250/8250_port.c:2741) serial8250_set_termios (drivers/tty/serial/8250/8250_port.c:2862) uart_change_line_settings (./include/linux/spinlock.h:376 ./include/linux/serial_core.h:608 drivers/tty/serial/serial_core.c:222) uart_port_startup (drivers/tty/serial/serial_core.c:342) uart_startup (drivers/tty/serial/serial_core.c:368) uart_set_info (drivers/tty/serial/serial_core.c:1034) uart_set_info_user (drivers/tty/serial/serial_core.c:1059) tty_set_serial (drivers/tty/tty_io.c:2637) tty_ioctl (drivers/tty/tty_io.c:2647 drivers/tty/tty_io.c:2791) __x64_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:907 fs/ioctl.c:893 fs/ioctl.c:893) do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Reported-by: syzkaller Cc: stable@vger.kernel.org Signed-off-by: George Kennedy Rule: add Link: https://lore.kernel.org/stable/1721148848-9784-1-git-send-email-george.kennedy%40oracle.com Link: https://lore.kernel.org/r/1721219078-3209-1-git-send-email-george.kennedy@oracle.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 9a18d0b95a41..5bea3af46abc 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -881,6 +881,14 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port, new_flags = (__force upf_t)new_info->flags; old_custom_divisor = uport->custom_divisor; + if (!(uport->flags & UPF_FIXED_PORT)) { + unsigned int uartclk = new_info->baud_base * 16; + /* check needs to be done here 
before other settings made */ + if (uartclk == 0) { + retval = -EINVAL; + goto exit; + } + } if (!capable(CAP_SYS_ADMIN)) { retval = -EPERM; if (change_irq || change_port || From f844793f2d374ffb7f2231f4d36cd4bc52f12de0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 30 Jul 2024 16:41:18 +0200 Subject: [PATCH 0344/1052] thermal: trip: Avoid skipping trips in thermal_zone_set_trips() Say there are 3 trip points A, B, C sorted in ascending temperature order with no hysteresis. If the zone temperature is exactly equal to B, thermal_zone_set_trips() will set the boundaries to A and C and the hardware will not catch any crossing of B (either way) until either A or C is crossed and the boundaries are changed. To avoid that, use non-strict inequalities when comparing the trip threshold to the zone temperature in thermal_zone_set_trips(). In the example above, it will cause both boundaries to be set to B, which is desirable because an interrupt will trigger when the zone temperature becomes different from B regardless of which way it goes. That will allow a new interval to be set depending on the direction of the zone temperature change. Fixes: 893bae92237d ("thermal: trip: Make thermal_zone_set_trips() use trip thresholds") Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/12509184.O9o76ZdvQC@rjwysocki.net --- drivers/thermal/thermal_trip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c index c0b679b846b3..06a0554ddc38 100644 --- a/drivers/thermal/thermal_trip.c +++ b/drivers/thermal/thermal_trip.c @@ -88,10 +88,10 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz) return; for_each_trip_desc(tz, td) { - if (td->threshold < tz->temperature && td->threshold > low) + if (td->threshold <= tz->temperature && td->threshold > low) low = td->threshold; - if (td->threshold > tz->temperature && td->threshold < high) + if (td->threshold >= tz->temperature && td->threshold < high) high = td->threshold; } From 133f4c00b8b2bfcacead9b81e7e8edfceb4b06c4 Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Tue, 23 Jul 2024 08:53:00 -0400 Subject: [PATCH 0345/1052] serial: sc16is7xx: fix TX fifo corruption Sometimes, when a packet is received on channel A at almost the same time as a packet is about to be transmitted on channel B, we observe with a logic analyzer that the received packet on channel A is transmitted on channel B. In other words, the Tx buffer data on channel B is corrupted with data from channel A. The problem appeared since commit 4409df5866b7 ("serial: sc16is7xx: change EFR lock to operate on each channels"), which changed the EFR locking to operate on each channel instead of chip-wise. This commit has introduced a regression, because the EFR lock is used not only to protect the EFR registers access, but also, in a very obscure and undocumented way, to protect access to the data buffer, which is shared by the Tx and Rx handlers, but also by each channel of the IC. Fix this regression first by switching to kfifo_out_linear_ptr() in sc16is7xx_handle_tx() to eliminate the need for a shared Rx/Tx buffer. Secondly, replace the chip-wise Rx buffer with a separate Rx buffer for each channel.
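A single-threaded userspace sketch of that corruption mode (illustrative names, not the sc16is7xx API): with one chip-wide scratch buffer, a channel A receive that lands between channel B staging its TX data and writing it out replaces what channel B transmits.

#include <stdio.h>
#include <string.h>

static char shared_buf[16];	/* one scratch buffer for the whole chip (buggy) */

static void stage_tx_b(const char *msg)	/* channel B prepares its TX data */
{
	strcpy(shared_buf, msg);
}

static void handle_rx_a(const char *pkt)	/* channel A RX reuses the same buffer */
{
	strcpy(shared_buf, pkt);
}

static void write_tx_b(void)	/* channel B finally writes its FIFO */
{
	printf("channel B transmits: \"%s\"\n", shared_buf);
}

int main(void)
{
	stage_tx_b("B: hello");
	handle_rx_a("A: secret");	/* interrupt-like event in between */
	write_tx_b();			/* prints channel A's data */
	return 0;
}

Giving each channel its own RX buffer, and letting TX read straight out of the kfifo, removes the shared state entirely.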
Fixes: 4409df5866b7 ("serial: sc16is7xx: change EFR lock to operate on each channels") Cc: stable@vger.kernel.org Signed-off-by: Hugo Villeneuve Link: https://lore.kernel.org/r/20240723125302.1305372-2-hugo@hugovil.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sc16is7xx.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index c79dcd7c8d1a..58696e05492c 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -327,6 +327,7 @@ struct sc16is7xx_one { struct kthread_work reg_work; struct kthread_delayed_work ms_work; struct sc16is7xx_one_config config; + unsigned char buf[SC16IS7XX_FIFO_SIZE]; /* Rx buffer. */ unsigned int old_mctrl; u8 old_lcr; /* Value before EFR access. */ bool irda_mode; @@ -340,7 +341,6 @@ struct sc16is7xx_port { unsigned long gpio_valid_mask; #endif u8 mctrl_mask; - unsigned char buf[SC16IS7XX_FIFO_SIZE]; struct kthread_worker kworker; struct task_struct *kworker_task; struct sc16is7xx_one p[]; @@ -612,18 +612,18 @@ static int sc16is7xx_set_baud(struct uart_port *port, int baud) static void sc16is7xx_handle_rx(struct uart_port *port, unsigned int rxlen, unsigned int iir) { - struct sc16is7xx_port *s = dev_get_drvdata(port->dev); + struct sc16is7xx_one *one = to_sc16is7xx_one(port, port); unsigned int lsr = 0, bytes_read, i; bool read_lsr = (iir == SC16IS7XX_IIR_RLSE_SRC) ? true : false; u8 ch, flag; - if (unlikely(rxlen >= sizeof(s->buf))) { + if (unlikely(rxlen >= sizeof(one->buf))) { dev_warn_ratelimited(port->dev, "ttySC%i: Possible RX FIFO overrun: %d\n", port->line, rxlen); port->icount.buf_overrun++; /* Ensure sanity of RX level */ - rxlen = sizeof(s->buf); + rxlen = sizeof(one->buf); } while (rxlen) { @@ -636,10 +636,10 @@ static void sc16is7xx_handle_rx(struct uart_port *port, unsigned int rxlen, lsr = 0; if (read_lsr) { - s->buf[0] = sc16is7xx_port_read(port, SC16IS7XX_RHR_REG); + one->buf[0] = sc16is7xx_port_read(port, SC16IS7XX_RHR_REG); bytes_read = 1; } else { - sc16is7xx_fifo_read(port, s->buf, rxlen); + sc16is7xx_fifo_read(port, one->buf, rxlen); bytes_read = rxlen; } @@ -672,7 +672,7 @@ static void sc16is7xx_handle_rx(struct uart_port *port, unsigned int rxlen, } for (i = 0; i < bytes_read; ++i) { - ch = s->buf[i]; + ch = one->buf[i]; if (uart_handle_sysrq_char(port, ch)) continue; @@ -690,10 +690,10 @@ static void sc16is7xx_handle_rx(struct uart_port *port, unsigned int rxlen, static void sc16is7xx_handle_tx(struct uart_port *port) { - struct sc16is7xx_port *s = dev_get_drvdata(port->dev); struct tty_port *tport = &port->state->port; unsigned long flags; unsigned int txlen; + unsigned char *tail; if (unlikely(port->x_char)) { sc16is7xx_port_write(port, SC16IS7XX_THR_REG, port->x_char); @@ -718,8 +718,9 @@ static void sc16is7xx_handle_tx(struct uart_port *port) txlen = 0; } - txlen = uart_fifo_out(port, s->buf, txlen); - sc16is7xx_fifo_write(port, s->buf, txlen); + txlen = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail, txlen); + sc16is7xx_fifo_write(port, tail, txlen); + uart_xmit_advance(port, txlen); uart_port_lock_irqsave(port, &flags); if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS) From 7d3b793faaab1305994ce568b59d61927235f57b Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Tue, 23 Jul 2024 08:53:01 -0400 Subject: [PATCH 0346/1052] serial: sc16is7xx: fix invalid FIFO access with special register set When enabling access to the special register set, Receiver time-out and RHR interrupts can happen. 
In this case, the IRQ handler will try to read from the FIFO thru the RHR register at address 0x00, but address 0x00 is mapped to DLL register, resulting in erroneous FIFO reading. Call graph example: sc16is7xx_startup(): entry sc16is7xx_ms_proc(): entry sc16is7xx_set_termios(): entry sc16is7xx_set_baud(): DLH/DLL = $009C --> access special register set sc16is7xx_port_irq() entry --> IIR is 0x0C sc16is7xx_handle_rx() entry sc16is7xx_fifo_read(): --> unable to access FIFO (RHR) because it is mapped to DLL (LCR=LCR_CONF_MODE_A) sc16is7xx_set_baud(): exit --> Restore access to general register set Fix the problem by claiming the efr_lock mutex when accessing the Special register set. Fixes: dfeae619d781 ("serial: sc16is7xx") Cc: stable@vger.kernel.org Signed-off-by: Hugo Villeneuve Link: https://lore.kernel.org/r/20240723125302.1305372-3-hugo@hugovil.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sc16is7xx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index 58696e05492c..b4c1798a1df2 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -592,6 +592,8 @@ static int sc16is7xx_set_baud(struct uart_port *port, int baud) SC16IS7XX_MCR_CLKSEL_BIT, prescaler == 1 ? 0 : SC16IS7XX_MCR_CLKSEL_BIT); + mutex_lock(&one->efr_lock); + /* Backup LCR and access special register set (DLL/DLH) */ lcr = sc16is7xx_port_read(port, SC16IS7XX_LCR_REG); sc16is7xx_port_write(port, SC16IS7XX_LCR_REG, @@ -606,6 +608,8 @@ static int sc16is7xx_set_baud(struct uart_port *port, int baud) /* Restore LCR and access to general register set */ sc16is7xx_port_write(port, SC16IS7XX_LCR_REG, lcr); + mutex_unlock(&one->efr_lock); + return DIV_ROUND_CLOSEST((clk / prescaler) / 16, div); } From 6e20753da6bc651e02378a0cdb78f16c42098c88 Mon Sep 17 00:00:00 2001 From: Max Krummenacher Date: Thu, 25 Jul 2024 15:20:45 +0200 Subject: [PATCH 0347/1052] tty: vt: conmakehash: cope with abs_srctree no longer in env conmakehash uses getenv("abs_srctree") from the environment to strip the absolute path from the generated sources. However since commit e2bad142bb3d ("kbuild: unexport abs_srctree and abs_objtree") this environment variable no longer gets set. Instead use basename() to indicate the used file in a comment of the generated source file. Fixes: 3bd85c6c97b2 ("tty: vt: conmakehash: Don't mention the full path of the input in output") Cc: stable Signed-off-by: Max Krummenacher Link: https://lore.kernel.org/stable/20240725132056.9151-1-max.oss.09%40gmail.com Link: https://lore.kernel.org/r/20240725132056.9151-1-max.oss.09@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/conmakehash.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/tty/vt/conmakehash.c b/drivers/tty/vt/conmakehash.c index dc2177fec715..82d9db68b2ce 100644 --- a/drivers/tty/vt/conmakehash.c +++ b/drivers/tty/vt/conmakehash.c @@ -11,6 +11,8 @@ * Copyright (C) 1995-1997 H. 
Peter Anvin */ +#include +#include #include #include #include @@ -76,8 +78,8 @@ static void addpair(int fp, int un) int main(int argc, char *argv[]) { FILE *ctbl; - const char *tblname, *rel_tblname; - const char *abs_srctree; + const char *tblname; + char base_tblname[PATH_MAX]; char buffer[65536]; int fontlen; int i, nuni, nent; @@ -102,16 +104,6 @@ int main(int argc, char *argv[]) } } - abs_srctree = getenv("abs_srctree"); - if (abs_srctree && !strncmp(abs_srctree, tblname, strlen(abs_srctree))) - { - rel_tblname = tblname + strlen(abs_srctree); - while (*rel_tblname == '/') - ++rel_tblname; - } - else - rel_tblname = tblname; - /* For now we assume the default font is always 256 characters. */ fontlen = 256; @@ -253,6 +245,8 @@ int main(int argc, char *argv[]) for ( i = 0 ; i < fontlen ; i++ ) nuni += unicount[i]; + strncpy(base_tblname, tblname, PATH_MAX); + base_tblname[PATH_MAX - 1] = 0; printf("\ /*\n\ * Do not edit this file; it was automatically generated by\n\ @@ -264,7 +258,7 @@ int main(int argc, char *argv[]) #include \n\ \n\ u8 dfont_unicount[%d] = \n\ -{\n\t", rel_tblname, fontlen); +{\n\t", basename(base_tblname), fontlen); for ( i = 0 ; i < fontlen ; i++ ) { From 6881e75237a84093d0986f56223db3724619f26e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jul 2024 12:23:51 +0200 Subject: [PATCH 0348/1052] tick/broadcast: Move per CPU pointer access into the atomic section The recent fix for making the take over of the broadcast timer more reliable retrieves a per CPU pointer in preemptible context. This went unnoticed as compilers hoist the access into the non-preemptible region where the pointer is actually used. But of course it's valid that the compiler keeps it at the place where the code puts it which rightfully triggers: BUG: using smp_processor_id() in preemptible [00000000] code: caller is hotplug_cpu__broadcast_tick_pull+0x1c/0xc0 Move it to the actual usage site which is in a non-preemptible region. Fixes: f7d43dd206e7 ("tick/broadcast: Make takeover of broadcast hrtimer reliable") Reported-by: David Wang <00107082@163.com> Signed-off-by: Thomas Gleixner Tested-by: Yu Liao Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/87ttg56ers.ffs@tglx --- kernel/time/tick-broadcast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b4843099a8da..ed58eebb4e8f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1141,7 +1141,6 @@ void tick_broadcast_switch_to_oneshot(void) #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { - struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *bc; unsigned long flags; @@ -1167,6 +1166,8 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) * device to avoid the starvation. */ if (tick_check_broadcast_expired()) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); tick_program_event(td->evtdev->next_event, 1); } From 224fa3552029a3d14bec7acf72ded8171d551b88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 Jul 2024 12:43:21 +0200 Subject: [PATCH 0349/1052] jump_label: Fix the fix, brown paper bags galore Per the example of: !atomic_cmpxchg(&key->enabled, 0, 1) the inverse was written as: atomic_cmpxchg(&key->enabled, 1, 0) except of course, that while !old is only true for old == 0, old is true for everything except old == 0. 
Fix it to read: atomic_cmpxchg(&key->enabled, 1, 0) == 1 such that only the 1->0 transition returns true and goes on to disable the keys. Fixes: 83ab38ef0a0b ("jump_label: Fix concurrency issues in static_key_slow_dec()") Reported-by: Darrick J. Wong Signed-off-by: Peter Zijlstra (Intel) Tested-by: Darrick J. Wong Link: https://lkml.kernel.org/r/20240731105557.GY33588@noisy.programming.kicks-ass.net --- kernel/jump_label.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 4ad5ed8adf96..6dc76b590703 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -236,7 +236,7 @@ void static_key_disable_cpuslocked(struct static_key *key) } jump_label_lock(); - if (atomic_cmpxchg(&key->enabled, 1, 0)) + if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); jump_label_unlock(); } @@ -289,7 +289,7 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key) return; guard(mutex)(&jump_label_mutex); - if (atomic_cmpxchg(&key->enabled, 1, 0)) + if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); else WARN_ON_ONCE(!static_key_slow_try_dec(key)); From f73cefa3b72eaa90abfc43bf6d68137ba059d4b1 Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Tue, 30 Jul 2024 06:09:28 +0800 Subject: [PATCH 0350/1052] perf/x86: Fix smp_processor_id()-in-preemptible warnings The following bug was triggered on a system built with CONFIG_DEBUG_PREEMPT=y: # echo p > /proc/sysrq-trigger BUG: using smp_processor_id() in preemptible [00000000] code: sh/117 caller is perf_event_print_debug+0x1a/0x4c0 CPU: 3 UID: 0 PID: 117 Comm: sh Not tainted 6.11.0-rc1 #109 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack_lvl+0x4f/0x60 check_preemption_disabled+0xc8/0xd0 perf_event_print_debug+0x1a/0x4c0 __handle_sysrq+0x140/0x180 write_sysrq_trigger+0x61/0x70 proc_reg_write+0x4e/0x70 vfs_write+0xd0/0x430 ? handle_mm_fault+0xc8/0x240 ksys_write+0x9c/0xd0 do_syscall_64+0x96/0x190 entry_SYSCALL_64_after_hwframe+0x4b/0x53 This is because the commit d4b294bf84db ("perf/x86: Hybrid PMU support for counters") took smp_processor_id() outside the irq critical section. If a preemption occurs in perf_event_print_debug() and the task is migrated to another cpu, we may get incorrect pmu debug information. Move smp_processor_id() back inside the irq critical section to fix this issue. 
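A userspace analogue of the hazard, with sched_getcpu() standing in for smp_processor_id() and thread migration standing in for preemption: a CPU id read outside the protected region may describe a CPU the code is no longer running on by the time it is used.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int cpu_before = sched_getcpu();	/* id read in "preemptible" context */

	usleep(100 * 1000);			/* plenty of time to be migrated */

	int cpu_after = sched_getcpu();		/* where the work actually runs */

	if (cpu_before != cpu_after)
		printf("migrated: read id %d, but now running on %d\n", cpu_before, cpu_after);
	else
		printf("still on CPU %d (migration is possible, not guaranteed)\n", cpu_after);
	return 0;
}

In the kernel the fix is simply to take the id inside the irq-disabled region, so it cannot go stale before the per-CPU data it selects is used.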
Fixes: d4b294bf84db ("perf/x86: Hybrid PMU support for counters") Signed-off-by: Li Huafei Reviewed-and-tested-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20240729220928.325449-1-lihuafei1@huawei.com --- arch/x86/events/core.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 12f2a0c14d33..be01823b1bb4 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1520,20 +1520,23 @@ static void x86_pmu_start(struct perf_event *event, int flags) void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + unsigned long *cntr_mask, *fixed_cntr_mask; + struct event_constraint *pebs_constraints; + struct cpu_hw_events *cpuc; u64 pebs, debugctl; - int cpu = smp_processor_id(); - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - unsigned long *cntr_mask = hybrid(cpuc->pmu, cntr_mask); - unsigned long *fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); - struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); - unsigned long flags; - int idx; + int cpu, idx; + + guard(irqsave)(); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_events, cpu); + cntr_mask = hybrid(cpuc->pmu, cntr_mask); + fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); + pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); if (!*(u64 *)cntr_mask) return; - local_irq_save(flags); - if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -1577,7 +1580,6 @@ void perf_event_print_debug(void) pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } - local_irq_restore(flags); } void x86_pmu_stop(struct perf_event *event, int flags) From cd04d50979502a1a965869dcd246d44db1bf0153 Mon Sep 17 00:00:00 2001 From: Andrew Ballance Date: Mon, 8 Jul 2024 19:44:26 -0500 Subject: [PATCH 0351/1052] rust: firmware: fix invalid rustdoc link Remove an extra quote from the doc comment so that rustdoc no longer generates a link to a nonexistent file. Signed-off-by: Andrew Ballance Reviewed-by: Danilo Krummrich Acked-by: Miguel Ojeda Fixes: de6582833db0 ("rust: add firmware abstractions") Link: https://lore.kernel.org/r/20240709004426.44854-1-andrewjballance@gmail.com Signed-off-by: Greg Kroah-Hartman --- rust/kernel/firmware.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/firmware.rs b/rust/kernel/firmware.rs index 2ba03af9f036..dee5b4b18aec 100644 --- a/rust/kernel/firmware.rs +++ b/rust/kernel/firmware.rs @@ -2,7 +2,7 @@ //! Firmware abstraction //! -//! C header: [`include/linux/firmware.h`](srctree/include/linux/firmware.h") +//! C header: [`include/linux/firmware.h`](srctree/include/linux/firmware.h) use crate::{bindings, device::Device, error::Error, error::Result, str::CStr}; use core::ptr::NonNull; From a2e4bdca2c361260609d47dff6c0e36ef2b41d4c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 30 Jul 2024 18:09:31 +0200 Subject: [PATCH 0352/1052] Documentation: embargoed-hardware-issues.rst: minor cleanups and fixes The embargoed-hardware-issues.rst file needed a bunch of minor grammar, punctuation, and syntax cleanups based on feedback we have gotten over the past few years. The main change here is the term "silicon" being used over "hardware" to differentiate between companies that make a chip (i.e. a CPU) and those that take the chip and put it into their system.
No process changes are made here at all, only clarification for the way the current process works. All of these changes have been approved by a review from a large number of different open source legal members, representing the companies involved in this process. Acked-by: Jonathan Corbet Link: https://lore.kernel.org/r/2024073032-outsource-sniff-e8ea@gregkh Co-developed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner Co-developed-by: Michael Dolan Signed-off-by: Michael Dolan Co-developed-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- .../process/embargoed-hardware-issues.rst | 122 ++++++++++-------- 1 file changed, 65 insertions(+), 57 deletions(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index 6e9a4597bf2c..2b34bb6b7cda 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -13,9 +13,9 @@ kernel. Hardware issues like Meltdown, Spectre, L1TF etc. must be treated differently because they usually affect all Operating Systems ("OS") and therefore need coordination across different OS vendors, distributions, -hardware vendors and other parties. For some of the issues, software -mitigations can depend on microcode or firmware updates, which need further -coordination. +silicon vendors, hardware integrators, and other parties. For some of the +issues, software mitigations can depend on microcode or firmware updates, +which need further coordination. .. _Contact: @@ -32,8 +32,8 @@ Linux kernel security team (:ref:`Documentation/admin-guide/ `) instead. The team can be contacted by email at . This -is a private list of security officers who will help you to coordinate a -fix according to our documented process. +is a private list of security officers who will help you coordinate a fix +according to our documented process. The list is encrypted and email to the list can be sent by either PGP or S/MIME encrypted and must be signed with the reporter's PGP key or S/MIME @@ -43,7 +43,7 @@ the following URLs: - PGP: https://www.kernel.org/static/files/hardware-security.asc - S/MIME: https://www.kernel.org/static/files/hardware-security.crt -While hardware security issues are often handled by the affected hardware +While hardware security issues are often handled by the affected silicon vendor, we welcome contact from researchers or individuals who have identified a potential hardware flaw. @@ -65,7 +65,7 @@ of Linux Foundation's IT operations personnel technically have the ability to access the embargoed information, but are obliged to confidentiality by their employment contract. Linux Foundation IT personnel are also responsible for operating and managing the rest of -kernel.org infrastructure. +kernel.org's infrastructure. The Linux Foundation's current director of IT Project infrastructure is Konstantin Ryabitsev. @@ -85,7 +85,7 @@ Memorandum of Understanding The Linux kernel community has a deep understanding of the requirement to keep hardware security issues under embargo for coordination between -different OS vendors, distributors, hardware vendors and other parties. +different OS vendors, distributors, silicon vendors, and other parties. The Linux kernel community has successfully handled hardware security issues in the past and has the necessary mechanisms in place to allow @@ -103,11 +103,11 @@ the issue in the best technical way. 
All involved developers pledge to adhere to the embargo rules and to keep the received information confidential. Violation of the pledge will lead to immediate exclusion from the current issue and removal from all related -mailing-lists. In addition, the hardware security team will also exclude +mailing lists. In addition, the hardware security team will also exclude the offender from future issues. The impact of this consequence is a highly effective deterrent in our community. In case a violation happens the hardware security team will inform the involved parties immediately. If you -or anyone becomes aware of a potential violation, please report it +or anyone else becomes aware of a potential violation, please report it immediately to the Hardware security officers. @@ -124,14 +124,16 @@ method for these types of issues. Start of Disclosure """"""""""""""""""" -Disclosure starts by contacting the Linux kernel hardware security team by -email. This initial contact should contain a description of the problem and -a list of any known affected hardware. If your organization builds or -distributes the affected hardware, we encourage you to also consider what -other hardware could be affected. +Disclosure starts by emailing the Linux kernel hardware security team per +the Contact section above. This initial contact should contain a +description of the problem and a list of any known affected silicon. If +your organization builds or distributes the affected hardware, we encourage +you to also consider what other hardware could be affected. The disclosing +party is responsible for contacting the affected silicon vendors in a +timely manner. The hardware security team will provide an incident-specific encrypted -mailing-list which will be used for initial discussion with the reporter, +mailing list which will be used for initial discussion with the reporter, further disclosure, and coordination of fixes. The hardware security team will provide the disclosing party a list of @@ -158,8 +160,8 @@ This serves several purposes: - The disclosed entities can be contacted to name experts who should participate in the mitigation development. - - If an expert which is required to handle an issue is employed by an - listed entity or member of an listed entity, then the response teams can + - If an expert who is required to handle an issue is employed by a listed + entity or member of an listed entity, then the response teams can request the disclosure of that expert from that entity. This ensures that the expert is also part of the entity's response team. @@ -169,8 +171,8 @@ Disclosure The disclosing party provides detailed information to the initial response team via the specific encrypted mailing-list. -From our experience the technical documentation of these issues is usually -a sufficient starting point and further technical clarification is best +From our experience, the technical documentation of these issues is usually +a sufficient starting point, and further technical clarification is best done via email. Mitigation development @@ -179,35 +181,39 @@ Mitigation development The initial response team sets up an encrypted mailing-list or repurposes an existing one if appropriate. -Using a mailing-list is close to the normal Linux development process and -has been successfully used in developing mitigations for various hardware +Using a mailing list is close to the normal Linux development process and +has been successfully used to develop mitigations for various hardware security issues in the past. 
-The mailing-list operates in the same way as normal Linux development. -Patches are posted, discussed and reviewed and if agreed on applied to a -non-public git repository which is only accessible to the participating +The mailing list operates in the same way as normal Linux development. +Patches are posted, discussed, and reviewed and if agreed upon, applied to +a non-public git repository which is only accessible to the participating developers via a secure connection. The repository contains the main development branch against the mainline kernel and backport branches for stable kernel versions as necessary. The initial response team will identify further experts from the Linux -kernel developer community as needed. Bringing in experts can happen at any -time of the development process and needs to be handled in a timely manner. +kernel developer community as needed. Any involved party can suggest +further experts to be included, each of which will be subject to the same +requirements outlined above. -If an expert is employed by or member of an entity on the disclosure list +Bringing in experts can happen at any time in the development process and +needs to be handled in a timely manner. + +If an expert is employed by or a member of an entity on the disclosure list provided by the disclosing party, then participation will be requested from the relevant entity. -If not, then the disclosing party will be informed about the experts +If not, then the disclosing party will be informed about the experts' participation. The experts are covered by the Memorandum of Understanding -and the disclosing party is requested to acknowledge the participation. In -case that the disclosing party has a compelling reason to object, then this -objection has to be raised within five work days and resolved with the -incident team immediately. If the disclosing party does not react within -five work days this is taken as silent acknowledgement. +and the disclosing party is requested to acknowledge their participation. +In the case where the disclosing party has a compelling reason to object, +any objection must to be raised within five working days and resolved with +the incident team immediately. If the disclosing party does not react +within five working days this is taken as silent acknowledgment. -After acknowledgement or resolution of an objection the expert is disclosed -by the incident team and brought into the development process. +After the incident team acknowledges or resolves an objection, the expert +is disclosed and brought into the development process. List participants may not communicate about the issue outside of the private mailing list. List participants may not use any shared resources @@ -217,19 +223,20 @@ private mailing list. List participants may not use any shared resources Coordinated release """"""""""""""""""" -The involved parties will negotiate the date and time where the embargo -ends. At that point the prepared mitigations are integrated into the -relevant kernel trees and published. There is no pre-notification process: -fixes are published in public and available to everyone at the same time. +The involved parties will negotiate the date and time when the embargo +ends. At that point, the prepared mitigations are published into the +relevant kernel trees. There is no pre-notification process: the +mitigations are published in public and available to everyone at the same +time. 
While we understand that hardware security issues need coordinated embargo -time, the embargo time should be constrained to the minimum time which is -required for all involved parties to develop, test and prepare the +time, the embargo time should be constrained to the minimum time that is +required for all involved parties to develop, test, and prepare their mitigations. Extending embargo time artificially to meet conference talk -dates or other non-technical reasons is creating more work and burden for -the involved developers and response teams as the patches need to be kept -up to date in order to follow the ongoing upstream kernel development, -which might create conflicting changes. +dates or other non-technical reasons creates more work and burden for the +involved developers and response teams as the patches need to be kept up to +date in order to follow the ongoing upstream kernel development, which +might create conflicting changes. CVE assignment """""""""""""" @@ -275,34 +282,35 @@ an involved disclosed party. The current ambassadors list: If you want your organization to be added to the ambassadors list, please contact the hardware security team. The nominated ambassador has to -understand and support our process fully and is ideally well connected in +understand and support our process fully and is ideally well-connected in the Linux kernel community. Encrypted mailing-lists ----------------------- -We use encrypted mailing-lists for communication. The operating principle +We use encrypted mailing lists for communication. The operating principle of these lists is that email sent to the list is encrypted either with the -list's PGP key or with the list's S/MIME certificate. The mailing-list +list's PGP key or with the list's S/MIME certificate. The mailing list software decrypts the email and re-encrypts it individually for each subscriber with the subscriber's PGP key or S/MIME certificate. Details -about the mailing-list software and the setup which is used to ensure the +about the mailing list software and the setup that is used to ensure the security of the lists and protection of the data can be found here: https://korg.wiki.kernel.org/userdoc/remail. List keys ^^^^^^^^^ -For initial contact see :ref:`Contact`. For incident specific mailing-lists -the key and S/MIME certificate are conveyed to the subscribers by email -sent from the specific list. +For initial contact see the :ref:`Contact` section above. For incident +specific mailing lists, the key and S/MIME certificate are conveyed to the +subscribers by email sent from the specific list. -Subscription to incident specific lists +Subscription to incident-specific lists ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Subscription is handled by the response teams. Disclosed parties who want -to participate in the communication send a list of potential subscribers to -the response team so the response team can validate subscription requests. +Subscription to incident-specific lists is handled by the response teams. +Disclosed parties who want to participate in the communication send a list +of potential experts to the response team so the response team can validate +subscription requests. Each subscriber needs to send a subscription request to the response team by email. 
The email must be signed with the subscriber's PGP key or S/MIME From 86fee2877f3427df5876159a182aa70d10964cdf Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 30 Jul 2024 18:09:32 +0200 Subject: [PATCH 0353/1052] Documentation: embargoed-hardware-issues.rst: add a section documenting the "early access" process Over the past years there have been many "misunderstandings" and "confusion" as to who is, and is not, allowed early access to the changes created by the members of the embargoed hardware issue teams working on a specific problem. The current process, while it does work, is "difficult" for many companies to understand and agree with. Because of this, there has been numerous attempts by many companies to work around the process by lies, subterfuge, and other side channels sometimes involving unsuspecting lawyers. Cut all of that out, and put the responsibility of distributing code on the silicon vendor affected, as they already have legal agreements in place that cover this type of distribution. When this distribution happens, the developers involved MUST be notified of this happening, to be kept aware of the situation at all times. The wording here has been hashed out by many different companies and lawyers involved in the process, as well as community members and everyone now agrees that the proposed change here should work better than what is currently happening. This change has been approved by a review from a large number of different open source legal members, representing the companies involved in this process. Link: https://lore.kernel.org/r/2024073035-bagel-vertigo-e0dd@gregkh Co-developed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner Co-developed-by: Michael Dolan Signed-off-by: Michael Dolan Co-developed-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- .../process/embargoed-hardware-issues.rst | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index 2b34bb6b7cda..daebce49cfdf 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -219,6 +219,37 @@ List participants may not communicate about the issue outside of the private mailing list. List participants may not use any shared resources (e.g. employer build farms, CI systems, etc) when working on patches. +Early access +"""""""""""" + +The patches discussed and developed on the list can neither be distributed +to any individual who is not a member of the response team nor to any other +organization. + +To allow the affected silicon vendors to work with their internal teams and +industry partners on testing, validation, and logistics, the following +exception is provided: + + Designated representatives of the affected silicon vendors are + allowed to hand over the patches at any time to the silicon + vendor’s response team. The representative must notify the kernel + response team about the handover. The affected silicon vendor must + have and maintain their own documented security process for any + patches shared with their response team that is consistent with + this policy. + + The silicon vendor’s response team can distribute these patches to + their industry partners and to their internal teams under the + silicon vendor’s documented security process. Feedback from the + industry partners goes back to the silicon vendor and is + communicated by the silicon vendor to the kernel response team. 
+ + The handover to the silicon vendor’s response team removes any + responsibility or liability from the kernel response team regarding + premature disclosure, which happens due to the involvement of the + silicon vendor’s internal teams or industry partners. The silicon + vendor guarantees this release of liability by agreeing to this + process. Coordinated release """"""""""""""""""" From be62f1289df01b7083f9ee5daf2a27d81355d666 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 30 Jul 2024 07:43:21 -0700 Subject: [PATCH 0354/1052] fsi: add missing MODULE_DESCRIPTION() macros make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/fsi/fsi-core.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/fsi/fsi-master-hub.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/fsi/fsi-master-aspeed.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/fsi/fsi-master-gpio.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/fsi/fsi-master-ast-cf.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/fsi/fsi-scom.o Add the missing invocations of the MODULE_DESCRIPTION() macro, and fix the copy/paste of the module description comment in fsi-master-ast-cf.c. Reviewed-by: Eddie James Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240730-module_description_orphans-v1-4-7094088076c8@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/fsi/fsi-core.c | 1 + drivers/fsi/fsi-master-aspeed.c | 1 + drivers/fsi/fsi-master-ast-cf.c | 3 ++- drivers/fsi/fsi-master-gpio.c | 1 + drivers/fsi/fsi-master-hub.c | 1 + drivers/fsi/fsi-scom.c | 1 + 6 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/fsi/fsi-core.c b/drivers/fsi/fsi-core.c index 46ac5a8beab7..e2e1e9df6115 100644 --- a/drivers/fsi/fsi-core.c +++ b/drivers/fsi/fsi-core.c @@ -1444,5 +1444,6 @@ static void fsi_exit(void) } module_exit(fsi_exit); module_param(discard_errors, int, 0664); +MODULE_DESCRIPTION("FSI core driver"); MODULE_LICENSE("GPL"); MODULE_PARM_DESC(discard_errors, "Don't invoke error handling on bus accesses"); diff --git a/drivers/fsi/fsi-master-aspeed.c b/drivers/fsi/fsi-master-aspeed.c index b0b624c3717b..6f5e1bdf7e40 100644 --- a/drivers/fsi/fsi-master-aspeed.c +++ b/drivers/fsi/fsi-master-aspeed.c @@ -670,4 +670,5 @@ static struct platform_driver fsi_master_aspeed_driver = { }; module_platform_driver(fsi_master_aspeed_driver); +MODULE_DESCRIPTION("FSI master driver for AST2600"); MODULE_LICENSE("GPL"); diff --git a/drivers/fsi/fsi-master-ast-cf.c b/drivers/fsi/fsi-master-ast-cf.c index f8c776ce1b56..a4c37ff8edd6 100644 --- a/drivers/fsi/fsi-master-ast-cf.c +++ b/drivers/fsi/fsi-master-ast-cf.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0+ // Copyright 2018 IBM Corp /* - * A FSI master controller, using a simple GPIO bit-banging interface + * A FSI master based on Aspeed ColdFire coprocessor */ #include @@ -1438,5 +1438,6 @@ static struct platform_driver fsi_master_acf = { }; module_platform_driver(fsi_master_acf); +MODULE_DESCRIPTION("A FSI master based on Aspeed ColdFire coprocessor"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE(FW_FILE_NAME); diff --git a/drivers/fsi/fsi-master-gpio.c b/drivers/fsi/fsi-master-gpio.c index 10fc344b6b22..f761344f4873 100644 --- a/drivers/fsi/fsi-master-gpio.c +++ b/drivers/fsi/fsi-master-gpio.c @@ -892,4 +892,5 @@ static struct platform_driver fsi_master_gpio_driver = { }; module_platform_driver(fsi_master_gpio_driver); +MODULE_DESCRIPTION("A FSI master controller, using a simple GPIO 
bit-banging interface"); MODULE_LICENSE("GPL"); diff --git a/drivers/fsi/fsi-master-hub.c b/drivers/fsi/fsi-master-hub.c index 6d8b6e8854e5..6568fed7db3c 100644 --- a/drivers/fsi/fsi-master-hub.c +++ b/drivers/fsi/fsi-master-hub.c @@ -295,4 +295,5 @@ static struct fsi_driver hub_master_driver = { }; module_fsi_driver(hub_master_driver); +MODULE_DESCRIPTION("FSI hub master driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/fsi/fsi-scom.c b/drivers/fsi/fsi-scom.c index 61dbda9dbe2b..411ddc018cd8 100644 --- a/drivers/fsi/fsi-scom.c +++ b/drivers/fsi/fsi-scom.c @@ -625,4 +625,5 @@ static void scom_exit(void) module_init(scom_init); module_exit(scom_exit); +MODULE_DESCRIPTION("SCOM FSI Client device driver"); MODULE_LICENSE("GPL"); From e6cd0dc91ef9d24edda553343e64eb6b542c21dd Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 14 Jul 2024 01:48:13 +0200 Subject: [PATCH 0355/1052] eeprom: ee1004: Fix locking issues in ee1004_probe() Currently, the devres-based management of ee1004_bus_data has several issues when it comes to locking: 1. It does not call mutex_unlock() before returning an error. 2. When encountering an error, it deadlocks when trying to recursively lock a mutex. Fix this by moving the mutex-protected bus data initialization into a separate function so that devm_add_action_or_reset() is called without the mutex being held. Reported-by: Dan Carpenter Fixes: 55d57ef6fa97 ("eeprom: ee1004: Use devres for bus data cleanup") Signed-off-by: Armin Wolf Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20240713234813.21746-1-W_Armin@gmx.de Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/ee1004.c | 85 +++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/drivers/misc/eeprom/ee1004.c b/drivers/misc/eeprom/ee1004.c index d4aeeb2b2169..89224d4af4a2 100644 --- a/drivers/misc/eeprom/ee1004.c +++ b/drivers/misc/eeprom/ee1004.c @@ -233,6 +233,49 @@ static void ee1004_cleanup_bus_data(void *data) mutex_unlock(&ee1004_bus_lock); } +static int ee1004_init_bus_data(struct i2c_client *client) +{ + struct ee1004_bus_data *bd; + int err, cnr = 0; + + bd = ee1004_get_bus_data(client->adapter); + if (!bd) + return dev_err_probe(&client->dev, -ENOSPC, "Only %d busses supported", + EE1004_MAX_BUSSES); + + i2c_set_clientdata(client, bd); + + if (++bd->dev_count == 1) { + /* Use 2 dummy devices for page select command */ + for (cnr = 0; cnr < EE1004_NUM_PAGES; cnr++) { + struct i2c_client *cl; + + cl = i2c_new_dummy_device(client->adapter, EE1004_ADDR_SET_PAGE + cnr); + if (IS_ERR(cl)) { + err = PTR_ERR(cl); + goto err_out; + } + + bd->set_page[cnr] = cl; + } + + /* Remember current page to avoid unneeded page select */ + err = ee1004_get_current_page(bd); + if (err < 0) + goto err_out; + + dev_dbg(&client->dev, "Currently selected page: %d\n", err); + bd->current_page = err; + } + + return 0; + +err_out: + ee1004_cleanup(cnr, bd); + + return err; +} + static int ee1004_probe(struct i2c_client *client) { struct nvmem_config config = { @@ -251,9 +294,8 @@ static int ee1004_probe(struct i2c_client *client) .compat = true, .base_dev = &client->dev, }; - struct ee1004_bus_data *bd; struct nvmem_device *ndev; - int err, cnr = 0; + int err; /* Make sure we can operate on this adapter */ if (!i2c_check_functionality(client->adapter, @@ -264,46 +306,21 @@ static int ee1004_probe(struct i2c_client *client) mutex_lock(&ee1004_bus_lock); - bd = ee1004_get_bus_data(client->adapter); - if (!bd) { + err = ee1004_init_bus_data(client); + if (err < 0) { 
mutex_unlock(&ee1004_bus_lock); - return dev_err_probe(&client->dev, -ENOSPC, - "Only %d busses supported", EE1004_MAX_BUSSES); - } - - err = devm_add_action_or_reset(&client->dev, ee1004_cleanup_bus_data, bd); - if (err < 0) return err; - - i2c_set_clientdata(client, bd); - - if (++bd->dev_count == 1) { - /* Use 2 dummy devices for page select command */ - for (cnr = 0; cnr < EE1004_NUM_PAGES; cnr++) { - struct i2c_client *cl; - - cl = i2c_new_dummy_device(client->adapter, EE1004_ADDR_SET_PAGE + cnr); - if (IS_ERR(cl)) { - mutex_unlock(&ee1004_bus_lock); - return PTR_ERR(cl); - } - bd->set_page[cnr] = cl; - } - - /* Remember current page to avoid unneeded page select */ - err = ee1004_get_current_page(bd); - if (err < 0) { - mutex_unlock(&ee1004_bus_lock); - return err; - } - dev_dbg(&client->dev, "Currently selected page: %d\n", err); - bd->current_page = err; } ee1004_probe_temp_sensor(client); mutex_unlock(&ee1004_bus_lock); + err = devm_add_action_or_reset(&client->dev, ee1004_cleanup_bus_data, + i2c_get_clientdata(client)); + if (err < 0) + return err; + ndev = devm_nvmem_register(&client->dev, &config); if (IS_ERR(ndev)) return PTR_ERR(ndev); From f528cd55853968db8e959ff0e4c2c43e561e7b83 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 12:38:23 +0200 Subject: [PATCH 0356/1052] misc: mrvl-cn10k-dpi: add PCI_IOV dependency I found one more missing dependency in the new driver: when building without CONFIG_PCI_IOV, pci_sriov_configure_simple() cannot be called directly: drivers/misc/mrvl_cn10k_dpi.c: In function 'dpi_remove': include/linux/stddef.h:9:14: error: called object is not a function or function pointer 9 | #define NULL ((void *)0) | ^ include/linux/pci.h:2416:41: note: in expansion of macro 'NULL' 2416 | #define pci_sriov_configure_simple NULL | ^~~~ drivers/misc/mrvl_cn10k_dpi.c:652:9: note: in expansion of macro 'pci_sriov_configure_simple' 652 | pci_sriov_configure_simple(pdev, 0); Add this to the Kconfig file as well. Fixes: 5f67eef6dff3 ("misc: mrvl-cn10k-dpi: add Octeon CN10K DPI administrative driver") Signed-off-by: Arnd Bergmann Tested-by: Vamsi Attunuru Link: https://lore.kernel.org/r/20240719103858.1292094-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 41c3d2821a78..41c54051347a 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -587,7 +587,7 @@ config NSM config MARVELL_CN10K_DPI tristate "Octeon CN10K DPI driver" - depends on PCI + depends on PCI && PCI_IOV depends on ARCH_THUNDER || (COMPILE_TEST && 64BIT) help Enables Octeon CN10K DMA packet interface (DPI) driver which From d1009d04a0fefe4df86285cbb37c78aa0b7ab852 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Wed, 17 Jul 2024 09:17:07 -0700 Subject: [PATCH 0357/1052] char: add missing NetWinder MODULE_DESCRIPTION() macros Since commit 1fffe7a34c89 ("script: modpost: emit a warning when the description is missing"), a module without a MODULE_DESCRIPTION() will result in a warning with make W=1. The following warnings are being observed in drivers/char when CONFIG_ARCH_NETWINDER is enabled: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/char/ds1620.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/char/nwbutton.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/char/nwflash.o Add the missing invocations of the MODULE_DESCRIPTION() macro. 
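For reference, the pattern being added is the standard module metadata block. A minimal, hypothetical example module (not one of the NetWinder drivers touched here), shown only to illustrate where MODULE_DESCRIPTION() sits:

#include <linux/init.h>
#include <linux/module.h>

static int __init example_init(void)
{
	return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);

/* Without this line, "make W=1" produces the modpost warning quoted above. */
MODULE_DESCRIPTION("Example driver illustrating module metadata");
MODULE_LICENSE("GPL");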
Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240717-md-arm-drivers-char-nw-v1-1-fee7a8505e9e@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/ds1620.c | 1 + drivers/char/nwbutton.c | 1 + drivers/char/nwflash.c | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/char/ds1620.c b/drivers/char/ds1620.c index cf89a9631107..a4f4291b4492 100644 --- a/drivers/char/ds1620.c +++ b/drivers/char/ds1620.c @@ -421,4 +421,5 @@ static void __exit ds1620_exit(void) module_init(ds1620_init); module_exit(ds1620_exit); +MODULE_DESCRIPTION("Dallas Semiconductor DS1620 thermometer driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c index ea378c0ed549..92cee5717237 100644 --- a/drivers/char/nwbutton.c +++ b/drivers/char/nwbutton.c @@ -241,6 +241,7 @@ static void __exit nwbutton_exit (void) MODULE_AUTHOR("Alex Holden"); +MODULE_DESCRIPTION("NetWinder button driver"); MODULE_LICENSE("GPL"); module_init(nwbutton_init); diff --git a/drivers/char/nwflash.c b/drivers/char/nwflash.c index 0973c2c2b01a..9f52f0306ef7 100644 --- a/drivers/char/nwflash.c +++ b/drivers/char/nwflash.c @@ -618,6 +618,7 @@ static void __exit nwflash_exit(void) iounmap((void *)FLASH_BASE); } +MODULE_DESCRIPTION("NetWinder flash memory driver"); MODULE_LICENSE("GPL"); module_param(flashdebug, bool, 0644); From 11512c197d387b59569d3a93af93de204d3bdaa6 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Mon, 22 Jul 2024 15:05:11 +0000 Subject: [PATCH 0358/1052] binder: fix descriptor lookup for context manager In commit 15d9da3f818c ("binder: use bitmap for faster descriptor lookup"), it was incorrectly assumed that references to the context manager node should always get descriptor zero assigned to them. However, if the context manager dies and a new process takes its place, then assigning descriptor zero to the new context manager might lead to collisions, as there could still be references to the older node. This issue was reported by syzbot with the following trace: kernel BUG at drivers/android/binder.c:1173! Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 447 Comm: binder-util Not tainted 6.10.0-rc6-00348-g31643d84b8c3 #10 Hardware name: linux,dummy-virt (DT) pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : binder_inc_ref_for_node+0x500/0x544 lr : binder_inc_ref_for_node+0x1e4/0x544 sp : ffff80008112b940 x29: ffff80008112b940 x28: ffff0e0e40310780 x27: 0000000000000000 x26: 0000000000000001 x25: ffff0e0e40310738 x24: ffff0e0e4089ba34 x23: ffff0e0e40310b00 x22: ffff80008112bb50 x21: ffffaf7b8f246970 x20: ffffaf7b8f773f08 x19: ffff0e0e4089b800 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 000000002de4aa60 x14: 0000000000000000 x13: 2de4acf000000000 x12: 0000000000000020 x11: 0000000000000018 x10: 0000000000000020 x9 : ffffaf7b90601000 x8 : ffff0e0e48739140 x7 : 0000000000000000 x6 : 000000000000003f x5 : ffff0e0e40310b28 x4 : 0000000000000000 x3 : ffff0e0e40310720 x2 : ffff0e0e40310728 x1 : 0000000000000000 x0 : ffff0e0e40310710 Call trace: binder_inc_ref_for_node+0x500/0x544 binder_transaction+0xf68/0x2620 binder_thread_write+0x5bc/0x139c binder_ioctl+0xef4/0x10c8 [...] This patch adds back the previous behavior of assigning the next non-zero descriptor if references to previous context managers still exist. 
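Conceptually the lookup becomes "first free descriptor at or above a start offset", where the offset is 0 only for the context manager node and 1 otherwise. A standalone userspace sketch of that idea (illustration only; the driver itself uses the kernel bitmap helpers and the rb-tree fallback):

#include <stdio.h>

#define NDESC 64

/* Return the smallest unused descriptor >= offset, or NDESC if none is free. */
static unsigned int next_free_desc(const unsigned char *used, unsigned int offset)
{
	unsigned int i;

	for (i = offset; i < NDESC; i++)
		if (!used[i])
			return i;
	return NDESC;
}

int main(void)
{
	unsigned char used[NDESC] = { [0] = 1, [1] = 1, [2] = 1 };

	printf("%u\n", next_free_desc(used, 1)); /* ordinary reference: 3 */
	printf("%u\n", next_free_desc(used, 0)); /* context manager: 0 if free, here 3 */
	return 0;
}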
It amends both strategies, the newer dbitmap code and also the legacy slow_desc_lookup_olocked(), by allowing them to start looking for available descriptors at a given offset. Fixes: 15d9da3f818c ("binder: use bitmap for faster descriptor lookup") Cc: stable@vger.kernel.org Reported-and-tested-by: syzbot+3dae065ca76952a67257@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000c1c0a0061d1e6979@google.com/ Reviewed-by: Alice Ryhl Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20240722150512.4192473-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 15 ++++++--------- drivers/android/dbitmap.h | 22 +++++++--------------- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f26286e3713e..905290c98c3c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1044,13 +1044,13 @@ static struct binder_ref *binder_get_ref_olocked(struct binder_proc *proc, } /* Find the smallest unused descriptor the "slow way" */ -static u32 slow_desc_lookup_olocked(struct binder_proc *proc) +static u32 slow_desc_lookup_olocked(struct binder_proc *proc, u32 offset) { struct binder_ref *ref; struct rb_node *n; u32 desc; - desc = 1; + desc = offset; for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (ref->data.desc > desc) @@ -1071,21 +1071,18 @@ static int get_ref_desc_olocked(struct binder_proc *proc, u32 *desc) { struct dbitmap *dmap = &proc->dmap; + unsigned int nbits, offset; unsigned long *new, bit; - unsigned int nbits; /* 0 is reserved for the context manager */ - if (node == proc->context->binder_context_mgr_node) { - *desc = 0; - return 0; - } + offset = (node == proc->context->binder_context_mgr_node) ? 0 : 1; if (!dbitmap_enabled(dmap)) { - *desc = slow_desc_lookup_olocked(proc); + *desc = slow_desc_lookup_olocked(proc, offset); return 0; } - if (dbitmap_acquire_first_zero_bit(dmap, &bit) == 0) { + if (dbitmap_acquire_next_zero_bit(dmap, offset, &bit) == 0) { *desc = bit; return 0; } diff --git a/drivers/android/dbitmap.h b/drivers/android/dbitmap.h index b8ac7b4764fd..956f1bd087d1 100644 --- a/drivers/android/dbitmap.h +++ b/drivers/android/dbitmap.h @@ -6,8 +6,7 @@ * * Used by the binder driver to optimize the allocation of the smallest * available descriptor ID. Each bit in the bitmap represents the state - * of an ID, with the exception of BIT(0) which is used exclusively to - * reference binder's context manager. + * of an ID. * * A dbitmap can grow or shrink as needed. This part has been designed * considering that users might need to briefly release their locks in @@ -58,11 +57,7 @@ static inline unsigned int dbitmap_shrink_nbits(struct dbitmap *dmap) if (bit < (dmap->nbits >> 2)) return dmap->nbits >> 1; - /* - * Note that find_last_bit() returns dmap->nbits when no bits - * are set. While this is technically not possible here since - * BIT(0) is always set, this check is left for extra safety. - */ + /* find_last_bit() returns dmap->nbits when no bits are set. */ if (bit == dmap->nbits) return NBITS_MIN; @@ -132,16 +127,17 @@ dbitmap_grow(struct dbitmap *dmap, unsigned long *new, unsigned int nbits) } /* - * Finds and sets the first zero bit in the bitmap. Upon success @bit + * Finds and sets the next zero bit in the bitmap. Upon success @bit * is populated with the index and 0 is returned. Otherwise, -ENOSPC * is returned to indicate that a dbitmap_grow() is needed. 
*/ static inline int -dbitmap_acquire_first_zero_bit(struct dbitmap *dmap, unsigned long *bit) +dbitmap_acquire_next_zero_bit(struct dbitmap *dmap, unsigned long offset, + unsigned long *bit) { unsigned long n; - n = find_first_zero_bit(dmap->map, dmap->nbits); + n = find_next_zero_bit(dmap->map, dmap->nbits, offset); if (n == dmap->nbits) return -ENOSPC; @@ -154,9 +150,7 @@ dbitmap_acquire_first_zero_bit(struct dbitmap *dmap, unsigned long *bit) static inline void dbitmap_clear_bit(struct dbitmap *dmap, unsigned long bit) { - /* BIT(0) should always set for the context manager */ - if (bit) - clear_bit(bit, dmap->map); + clear_bit(bit, dmap->map); } static inline int dbitmap_init(struct dbitmap *dmap) @@ -168,8 +162,6 @@ static inline int dbitmap_init(struct dbitmap *dmap) } dmap->nbits = NBITS_MIN; - /* BIT(0) is reserved for the context manager */ - set_bit(0, dmap->map); return 0; } From 2c10a20f5e84ab777d29ed921d4c78d66de6d0fb Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Thu, 25 Jul 2024 11:55:10 +0530 Subject: [PATCH 0359/1052] binder_alloc: Fix sleeping function called from invalid context 36c55ce8703c ("binder_alloc: Replace kcalloc with kvcalloc to mitigate OOM issues") introduced schedule while atomic issue. [ 2689.152635][ T4275] BUG: sleeping function called from invalid context at mm/vmalloc.c:2847 [ 2689.161291][ T4275] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 4275, name: kworker/1:140 [ 2689.170708][ T4275] preempt_count: 1, expected: 0 [ 2689.175572][ T4275] RCU nest depth: 0, expected: 0 [ 2689.180521][ T4275] INFO: lockdep is turned off. [ 2689.180523][ T4275] Preemption disabled at: [ 2689.180525][ T4275] [] binder_alloc_deferred_release+0x2c/0x388 .. .. [ 2689.213419][ T4275] __might_resched+0x174/0x178 [ 2689.213423][ T4275] __might_sleep+0x48/0x7c [ 2689.213426][ T4275] vfree+0x4c/0x15c [ 2689.213430][ T4275] kvfree+0x24/0x44 [ 2689.213433][ T4275] binder_alloc_deferred_release+0x2c0/0x388 [ 2689.213436][ T4275] binder_proc_dec_tmpref+0x15c/0x2a8 [ 2689.213440][ T4275] binder_deferred_func+0xa8/0x8ec [ 2689.213442][ T4275] process_one_work+0x254/0x59c [ 2689.213447][ T4275] worker_thread+0x274/0x3ec [ 2689.213450][ T4275] kthread+0x110/0x134 [ 2689.213453][ T4275] ret_from_fork+0x10/0x20 Fix it by moving the place of kvfree outside of spinlock context. Fixes: 36c55ce8703c ("binder_alloc: Replace kcalloc with kvcalloc to mitigate OOM issues") Acked-by: Carlos Llamas Signed-off-by: Mukesh Ojha Link: https://lore.kernel.org/r/20240725062510.2856662-1-quic_mojha@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index b00961944ab1..b3acbc4174fb 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -939,9 +939,9 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) __free_page(alloc->pages[i].page_ptr); page_count++; } - kvfree(alloc->pages); } spin_unlock(&alloc->lock); + kvfree(alloc->pages); if (alloc->mm) mmdrop(alloc->mm); From f38ba5459ced3441852f37f20fcfb7bd39d20f62 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 25 Jul 2024 09:46:32 -0700 Subject: [PATCH 0360/1052] spmi: pmic-arb: Pass the correct of_node to irq_domain_add_tree Currently, irqchips for all of the subnodes (which represent a given bus master) point to the parent wrapper node. This is no bueno, as no interrupts arrive, ever (because nothing references that node). 
Fix that by passing a reference to the respective master's of_node. Worth noting, this is a NOP for devices with only a single master described. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240522-topic-spmi_multi_master_irqfix-v2-1-7ec92a862b9f@linaro.org Reviewed-by: Abel Vesa Tested-by: Dmitry Baryshkov Reviewed-by: Dmitry Baryshkov Fixes: 02922ccbb330 ("spmi: pmic-arb: Register controller for bus instead of arbiter") Cc: stable@vger.kernel.org Signed-off-by: Stephen Boyd Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20240725164636.3362690-3-sboyd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/spmi/spmi-pmic-arb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/spmi/spmi-pmic-arb.c b/drivers/spmi/spmi-pmic-arb.c index f240fcc5a4e1..b6880c13163c 100644 --- a/drivers/spmi/spmi-pmic-arb.c +++ b/drivers/spmi/spmi-pmic-arb.c @@ -1737,8 +1737,7 @@ static int spmi_pmic_arb_bus_init(struct platform_device *pdev, dev_dbg(&pdev->dev, "adding irq domain for bus %d\n", bus_index); - bus->domain = irq_domain_add_tree(dev->of_node, - &pmic_arb_irq_domain_ops, bus); + bus->domain = irq_domain_add_tree(node, &pmic_arb_irq_domain_ops, bus); if (!bus->domain) { dev_err(&pdev->dev, "unable to create irq_domain\n"); return -ENOMEM; From ffcf2eb4bfa24f7256de53a95182c3e3e23fdc6c Mon Sep 17 00:00:00 2001 From: David Collins Date: Thu, 25 Jul 2024 09:46:33 -0700 Subject: [PATCH 0361/1052] spmi: pmic-arb: add missing newline in dev_err format strings dev_err() format strings should end with '\n'. Several such format strings in the spmi-pmic-arb driver are missing it. Add newlines where needed. Fixes: 02922ccbb330 ("spmi: pmic-arb: Register controller for bus instead of arbiter") Signed-off-by: David Collins Link: https://lore.kernel.org/r/20240703221248.3640490-1-quic_collinsd@quicinc.com Reviewed-by: Bjorn Andersson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240725164636.3362690-4-sboyd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/spmi/spmi-pmic-arb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/spmi/spmi-pmic-arb.c b/drivers/spmi/spmi-pmic-arb.c index b6880c13163c..9ba9495fcc4b 100644 --- a/drivers/spmi/spmi-pmic-arb.c +++ b/drivers/spmi/spmi-pmic-arb.c @@ -398,7 +398,7 @@ static int pmic_arb_fmt_read_cmd(struct spmi_pmic_arb_bus *bus, u8 opc, u8 sid, *offset = rc; if (bc >= PMIC_ARB_MAX_TRANS_BYTES) { - dev_err(&bus->spmic->dev, "pmic-arb supports 1..%d bytes per trans, but:%zu requested", + dev_err(&bus->spmic->dev, "pmic-arb supports 1..%d bytes per trans, but:%zu requested\n", PMIC_ARB_MAX_TRANS_BYTES, len); return -EINVAL; } @@ -477,7 +477,7 @@ static int pmic_arb_fmt_write_cmd(struct spmi_pmic_arb_bus *bus, u8 opc, *offset = rc; if (bc >= PMIC_ARB_MAX_TRANS_BYTES) { - dev_err(&bus->spmic->dev, "pmic-arb supports 1..%d bytes per trans, but:%zu requested", + dev_err(&bus->spmic->dev, "pmic-arb supports 1..%d bytes per trans, but:%zu requested\n", PMIC_ARB_MAX_TRANS_BYTES, len); return -EINVAL; } @@ -1702,7 +1702,7 @@ static int spmi_pmic_arb_bus_init(struct platform_device *pdev, index = of_property_match_string(node, "reg-names", "cnfg"); if (index < 0) { - dev_err(dev, "cnfg reg region missing"); + dev_err(dev, "cnfg reg region missing\n"); return -EINVAL; } @@ -1712,7 +1712,7 @@ static int spmi_pmic_arb_bus_init(struct platform_device *pdev, index = of_property_match_string(node, "reg-names", "intr"); if (index < 0) { - dev_err(dev, "intr reg region missing"); + 
dev_err(dev, "intr reg region missing\n"); return -EINVAL; } From 15fffc6a5624b13b428bb1c6e9088e32a55eb82c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 12 Jul 2024 12:42:09 -0700 Subject: [PATCH 0362/1052] driver core: Fix uevent_show() vs driver detach race uevent_show() wants to de-reference dev->driver->name. There is no clean way for a device attribute to de-reference dev->driver unless that attribute is defined via (struct device_driver).dev_groups. Instead, the anti-pattern of taking the device_lock() in the attribute handler risks deadlocks with code paths that remove device attributes while holding the lock. This deadlock is typically invisible to lockdep given the device_lock() is marked lockdep_set_novalidate_class(), but some subsystems allocate a local lockdep key for @dev->mutex to reveal reports of the form: ====================================================== WARNING: possible circular locking dependency detected 6.10.0-rc7+ #275 Tainted: G OE N ------------------------------------------------------ modprobe/2374 is trying to acquire lock: ffff8c2270070de0 (kn->active#6){++++}-{0:0}, at: __kernfs_remove+0xde/0x220 but task is already holding lock: ffff8c22016e88f8 (&cxl_root_key){+.+.}-{3:3}, at: device_release_driver_internal+0x39/0x210 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&cxl_root_key){+.+.}-{3:3}: __mutex_lock+0x99/0xc30 uevent_show+0xac/0x130 dev_attr_show+0x18/0x40 sysfs_kf_seq_show+0xac/0xf0 seq_read_iter+0x110/0x450 vfs_read+0x25b/0x340 ksys_read+0x67/0xf0 do_syscall_64+0x75/0x190 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #0 (kn->active#6){++++}-{0:0}: __lock_acquire+0x121a/0x1fa0 lock_acquire+0xd6/0x2e0 kernfs_drain+0x1e9/0x200 __kernfs_remove+0xde/0x220 kernfs_remove_by_name_ns+0x5e/0xa0 device_del+0x168/0x410 device_unregister+0x13/0x60 devres_release_all+0xb8/0x110 device_unbind_cleanup+0xe/0x70 device_release_driver_internal+0x1c7/0x210 driver_detach+0x47/0x90 bus_remove_driver+0x6c/0xf0 cxl_acpi_exit+0xc/0x11 [cxl_acpi] __do_sys_delete_module.isra.0+0x181/0x260 do_syscall_64+0x75/0x190 entry_SYSCALL_64_after_hwframe+0x76/0x7e The observation though is that driver objects are typically much longer lived than device objects. It is reasonable to perform lockless de-reference of a @driver pointer even if it is racing detach from a device. Given the infrequency of driver unregistration, use synchronize_rcu() in module_remove_driver() to close any potential races. It is potentially overkill to suffer synchronize_rcu() just to handle the rare module removal racing uevent_show() event. Thanks to Tetsuo Handa for the debug analysis of the syzbot report [1]. Fixes: c0a40097f0bc ("drivers: core: synchronize really_probe() and dev_uevent()") Reported-by: syzbot+4762dd74e32532cda5ff@syzkaller.appspotmail.com Reported-by: Tetsuo Handa Closes: http://lore.kernel.org/5aa5558f-90a4-4864-b1b1-5d6784c5607d@I-love.SAKURA.ne.jp [1] Link: http://lore.kernel.org/669073b8ea479_5fffa294c1@dwillia2-xfh.jf.intel.com.notmuch Cc: stable@vger.kernel.org Cc: Ashish Sangwan Cc: Namjae Jeon Cc: Dirk Behme Cc: Greg Kroah-Hartman Cc: Rafael J. 
Wysocki Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/172081332794.577428.9738802016494057132.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 13 ++++++++----- drivers/base/module.c | 4 ++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 730cae66607c..8c0733d3aad8 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -2640,6 +2641,7 @@ static const char *dev_uevent_name(const struct kobject *kobj) static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { const struct device *dev = kobj_to_dev(kobj); + struct device_driver *driver; int retval = 0; /* add device node properties if present */ @@ -2668,8 +2670,12 @@ static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) if (dev->type && dev->type->name) add_uevent_var(env, "DEVTYPE=%s", dev->type->name); - if (dev->driver) - add_uevent_var(env, "DRIVER=%s", dev->driver->name); + /* Synchronize with module_remove_driver() */ + rcu_read_lock(); + driver = READ_ONCE(dev->driver); + if (driver) + add_uevent_var(env, "DRIVER=%s", driver->name); + rcu_read_unlock(); /* Add common DT information about the device */ of_device_uevent(dev, env); @@ -2739,11 +2745,8 @@ static ssize_t uevent_show(struct device *dev, struct device_attribute *attr, if (!env) return -ENOMEM; - /* Synchronize with really_probe() */ - device_lock(dev); /* let the kset specific function add its keys */ retval = kset->uevent_ops->uevent(&dev->kobj, env); - device_unlock(dev); if (retval) goto out; diff --git a/drivers/base/module.c b/drivers/base/module.c index 7af224e6914a..f742ad2a21da 100644 --- a/drivers/base/module.c +++ b/drivers/base/module.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "base.h" static char *make_driver_name(const struct device_driver *drv) @@ -97,6 +98,9 @@ void module_remove_driver(const struct device_driver *drv) if (!drv) return; + /* Synchronize with dev_uevent() */ + synchronize_rcu(); + sysfs_remove_link(&drv->p->kobj, "module"); if (drv->owner) From e6ce8a28c768dbbad3f818db286cd0f4c7a921a8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 31 Jul 2024 15:05:22 +0200 Subject: [PATCH 0363/1052] ALSA: ump: Transmit RPN/NRPN message at each MSB/LSB data reception The UMP 1.1 spec says that an RPN/NRPN should be sent when one of the following occurs: * a CC 38 is received * a subsequent CC 6 is received * a CC 98, 99, 100, and 101 is received, indicating the last RPN/NRPN message has ended and a new one has started That said, we should send a partial data even if it's not fully filled. Let's change the UMP conversion helper code to follow that rule. 
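The data entry itself is the usual 7-bit MSB/LSB pair (CC#6 and CC#38) combined into a 14-bit value before being upscaled. A standalone sketch of that pairing (illustration only, not the kernel helper):

#include <stdio.h>

static unsigned int combine_14bit(unsigned char msb, unsigned char lsb)
{
	return ((msb & 0x7f) << 7) | (lsb & 0x7f);
}

int main(void)
{
	/* CC#6 alone: only the MSB is known, the LSB still reads as 0. */
	printf("msb only: %u\n", combine_14bit(0x40, 0x00));	/* 8192 */
	/* A following CC#38 completes the 14-bit value. */
	printf("msb+lsb : %u\n", combine_14bit(0x40, 0x01));	/* 8193 */
	return 0;
}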
Link: https://patch.msgid.link/20240731130528.12600-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- include/sound/ump_convert.h | 1 + sound/core/ump_convert.c | 49 ++++++++++++++++++++++++------------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/include/sound/ump_convert.h b/include/sound/ump_convert.h index 28c364c63245..d099ae27f849 100644 --- a/include/sound/ump_convert.h +++ b/include/sound/ump_convert.h @@ -13,6 +13,7 @@ struct ump_cvt_to_ump_bank { unsigned char cc_nrpn_msb, cc_nrpn_lsb; unsigned char cc_data_msb, cc_data_lsb; unsigned char cc_bank_msb, cc_bank_lsb; + bool cc_data_msb_set, cc_data_lsb_set; }; /* context for converting from MIDI1 byte stream to UMP packet */ diff --git a/sound/core/ump_convert.c b/sound/core/ump_convert.c index f67c44c83fde..5d1b85e7ac16 100644 --- a/sound/core/ump_convert.c +++ b/sound/core/ump_convert.c @@ -287,25 +287,37 @@ static int cvt_legacy_system_to_ump(struct ump_cvt_to_ump *cvt, return 4; } -static void fill_rpn(struct ump_cvt_to_ump_bank *cc, - union snd_ump_midi2_msg *midi2) +static int fill_rpn(struct ump_cvt_to_ump_bank *cc, + union snd_ump_midi2_msg *midi2, + bool flush) { + if (!(cc->cc_data_lsb_set || cc->cc_data_msb_set)) + return 0; // skip + /* when not flushing, wait for complete data set */ + if (!flush && (!cc->cc_data_lsb_set || !cc->cc_data_msb_set)) + return 0; // skip + if (cc->rpn_set) { midi2->rpn.status = UMP_MSG_STATUS_RPN; midi2->rpn.bank = cc->cc_rpn_msb; midi2->rpn.index = cc->cc_rpn_lsb; - cc->rpn_set = 0; - cc->cc_rpn_msb = cc->cc_rpn_lsb = 0; - } else { + } else if (cc->nrpn_set) { midi2->rpn.status = UMP_MSG_STATUS_NRPN; midi2->rpn.bank = cc->cc_nrpn_msb; midi2->rpn.index = cc->cc_nrpn_lsb; - cc->nrpn_set = 0; - cc->cc_nrpn_msb = cc->cc_nrpn_lsb = 0; + } else { + return 0; // skip } + midi2->rpn.data = upscale_14_to_32bit((cc->cc_data_msb << 7) | cc->cc_data_lsb); + + cc->rpn_set = 0; + cc->nrpn_set = 0; + cc->cc_rpn_msb = cc->cc_rpn_lsb = 0; cc->cc_data_msb = cc->cc_data_lsb = 0; + cc->cc_data_msb_set = cc->cc_data_lsb_set = 0; + return 1; } /* convert to a MIDI 1.0 Channel Voice message */ @@ -318,6 +330,7 @@ static int cvt_legacy_cmd_to_ump(struct ump_cvt_to_ump *cvt, struct ump_cvt_to_ump_bank *cc; union snd_ump_midi2_msg *midi2 = (union snd_ump_midi2_msg *)data; unsigned char status, channel; + int ret; BUILD_BUG_ON(sizeof(union snd_ump_midi1_msg) != 4); BUILD_BUG_ON(sizeof(union snd_ump_midi2_msg) != 8); @@ -358,24 +371,29 @@ static int cvt_legacy_cmd_to_ump(struct ump_cvt_to_ump *cvt, case UMP_MSG_STATUS_CC: switch (buf[1]) { case UMP_CC_RPN_MSB: + ret = fill_rpn(cc, midi2, true); cc->rpn_set = 1; cc->cc_rpn_msb = buf[2]; - return 0; // skip + return ret; case UMP_CC_RPN_LSB: + ret = fill_rpn(cc, midi2, true); cc->rpn_set = 1; cc->cc_rpn_lsb = buf[2]; - return 0; // skip + return ret; case UMP_CC_NRPN_MSB: + ret = fill_rpn(cc, midi2, true); cc->nrpn_set = 1; cc->cc_nrpn_msb = buf[2]; - return 0; // skip + return ret; case UMP_CC_NRPN_LSB: + ret = fill_rpn(cc, midi2, true); cc->nrpn_set = 1; cc->cc_nrpn_lsb = buf[2]; - return 0; // skip + return ret; case UMP_CC_DATA: + cc->cc_data_msb_set = 1; cc->cc_data_msb = buf[2]; - return 0; // skip + return fill_rpn(cc, midi2, false); case UMP_CC_BANK_SELECT: cc->bank_set = 1; cc->cc_bank_msb = buf[2]; @@ -385,12 +403,9 @@ static int cvt_legacy_cmd_to_ump(struct ump_cvt_to_ump *cvt, cc->cc_bank_lsb = buf[2]; return 0; // skip case UMP_CC_DATA_LSB: + cc->cc_data_lsb_set = 1; cc->cc_data_lsb = buf[2]; - if (cc->rpn_set || cc->nrpn_set) - 
fill_rpn(cc, midi2); - else - return 0; // skip - break; + return fill_rpn(cc, midi2, false); default: midi2->cc.index = buf[1]; midi2->cc.data = upscale_7_to_32bit(buf[2]); From 50a6dd19dca9446475f023eaa652016bfe5b1cbe Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 31 Jul 2024 15:05:23 +0200 Subject: [PATCH 0364/1052] ALSA: ump: Explicitly reset RPN with Null RPN RPN with 127:127 is treated as a Null RPN, just to reset the parameters, and it's not translated to MIDI2. Although the current code can work as is in most cases, better to implement the RPN reset explicitly for Null message. Link: https://patch.msgid.link/20240731130528.12600-3-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/ump_convert.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/sound/core/ump_convert.c b/sound/core/ump_convert.c index 5d1b85e7ac16..0fe13d031656 100644 --- a/sound/core/ump_convert.c +++ b/sound/core/ump_convert.c @@ -287,6 +287,15 @@ static int cvt_legacy_system_to_ump(struct ump_cvt_to_ump *cvt, return 4; } +static void reset_rpn(struct ump_cvt_to_ump_bank *cc) +{ + cc->rpn_set = 0; + cc->nrpn_set = 0; + cc->cc_rpn_msb = cc->cc_rpn_lsb = 0; + cc->cc_data_msb = cc->cc_data_lsb = 0; + cc->cc_data_msb_set = cc->cc_data_lsb_set = 0; +} + static int fill_rpn(struct ump_cvt_to_ump_bank *cc, union snd_ump_midi2_msg *midi2, bool flush) @@ -312,11 +321,7 @@ static int fill_rpn(struct ump_cvt_to_ump_bank *cc, midi2->rpn.data = upscale_14_to_32bit((cc->cc_data_msb << 7) | cc->cc_data_lsb); - cc->rpn_set = 0; - cc->nrpn_set = 0; - cc->cc_rpn_msb = cc->cc_rpn_lsb = 0; - cc->cc_data_msb = cc->cc_data_lsb = 0; - cc->cc_data_msb_set = cc->cc_data_lsb_set = 0; + reset_rpn(cc); return 1; } @@ -374,11 +379,15 @@ static int cvt_legacy_cmd_to_ump(struct ump_cvt_to_ump *cvt, ret = fill_rpn(cc, midi2, true); cc->rpn_set = 1; cc->cc_rpn_msb = buf[2]; + if (cc->cc_rpn_msb == 0x7f && cc->cc_rpn_lsb == 0x7f) + reset_rpn(cc); return ret; case UMP_CC_RPN_LSB: ret = fill_rpn(cc, midi2, true); cc->rpn_set = 1; cc->cc_rpn_lsb = buf[2]; + if (cc->cc_rpn_msb == 0x7f && cc->cc_rpn_lsb == 0x7f) + reset_rpn(cc); return ret; case UMP_CC_NRPN_MSB: ret = fill_rpn(cc, midi2, true); From a683030606fa5ff8b722a5e28839d19288011ede Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 31 Jul 2024 15:05:24 +0200 Subject: [PATCH 0365/1052] ALSA: seq: ump: Use the common RPN/bank conversion context The UMP core conversion helper API already defines the context needed to record the bank and RPN/NRPN values, and we can simply re-use the same struct instead of re-defining the same content as a different name. 
Link: https://patch.msgid.link/20240731130528.12600-4-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ports.h | 14 ++------------ sound/core/seq/seq_ump_convert.c | 10 +++++----- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/sound/core/seq/seq_ports.h b/sound/core/seq/seq_ports.h index b111382f697a..9e36738c0dd0 100644 --- a/sound/core/seq/seq_ports.h +++ b/sound/core/seq/seq_ports.h @@ -7,6 +7,7 @@ #define __SND_SEQ_PORTS_H #include +#include #include "seq_lock.h" /* list of 'exported' ports */ @@ -42,17 +43,6 @@ struct snd_seq_port_subs_info { int (*close)(void *private_data, struct snd_seq_port_subscribe *info); }; -/* context for converting from legacy control event to UMP packet */ -struct snd_seq_ump_midi2_bank { - bool rpn_set; - bool nrpn_set; - bool bank_set; - unsigned char cc_rpn_msb, cc_rpn_lsb; - unsigned char cc_nrpn_msb, cc_nrpn_lsb; - unsigned char cc_data_msb, cc_data_lsb; - unsigned char cc_bank_msb, cc_bank_lsb; -}; - struct snd_seq_client_port { struct snd_seq_addr addr; /* client/port number */ @@ -88,7 +78,7 @@ struct snd_seq_client_port { unsigned char ump_group; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) - struct snd_seq_ump_midi2_bank midi2_bank[16]; /* per channel */ + struct ump_cvt_to_ump_bank midi2_bank[16]; /* per channel */ #endif }; diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index d9dacfbe4a9a..b1bc6d122d92 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -368,7 +368,7 @@ static int cvt_ump_midi1_to_midi2(struct snd_seq_client *dest, struct snd_seq_ump_event ev_cvt; const union snd_ump_midi1_msg *midi1 = (const union snd_ump_midi1_msg *)event->ump; union snd_ump_midi2_msg *midi2 = (union snd_ump_midi2_msg *)ev_cvt.ump; - struct snd_seq_ump_midi2_bank *cc; + struct ump_cvt_to_ump_bank *cc; ev_cvt = *event; memset(&ev_cvt.ump, 0, sizeof(ev_cvt.ump)); @@ -790,7 +790,7 @@ static int paf_ev_to_ump_midi2(const struct snd_seq_event *event, } /* set up the MIDI2 RPN/NRPN packet data from the parsed info */ -static void fill_rpn(struct snd_seq_ump_midi2_bank *cc, +static void fill_rpn(struct ump_cvt_to_ump_bank *cc, union snd_ump_midi2_msg *data, unsigned char channel) { @@ -822,7 +822,7 @@ static int cc_ev_to_ump_midi2(const struct snd_seq_event *event, unsigned char channel = event->data.control.channel & 0x0f; unsigned char index = event->data.control.param & 0x7f; unsigned char val = event->data.control.value & 0x7f; - struct snd_seq_ump_midi2_bank *cc = &dest_port->midi2_bank[channel]; + struct ump_cvt_to_ump_bank *cc = &dest_port->midi2_bank[channel]; /* process special CC's (bank/rpn/nrpn) */ switch (index) { @@ -887,7 +887,7 @@ static int pgm_ev_to_ump_midi2(const struct snd_seq_event *event, unsigned char status) { unsigned char channel = event->data.control.channel & 0x0f; - struct snd_seq_ump_midi2_bank *cc = &dest_port->midi2_bank[channel]; + struct ump_cvt_to_ump_bank *cc = &dest_port->midi2_bank[channel]; data->pg.status = status; data->pg.channel = channel; @@ -924,7 +924,7 @@ static int ctrl14_ev_to_ump_midi2(const struct snd_seq_event *event, { unsigned char channel = event->data.control.channel & 0x0f; unsigned char index = event->data.control.param & 0x7f; - struct snd_seq_ump_midi2_bank *cc = &dest_port->midi2_bank[channel]; + struct ump_cvt_to_ump_bank *cc = &dest_port->midi2_bank[channel]; unsigned char msb, lsb; msb = (event->data.control.value >> 7) & 0x7f; From a4ff92ff0bdd731eca9f0b50b1cbb5aba89be4b2 Mon Sep 17 00:00:00 2001 From: Takashi 
Iwai Date: Wed, 31 Jul 2024 15:05:25 +0200 Subject: [PATCH 0366/1052] ALSA: seq: ump: Transmit RPN/NRPN message at each MSB/LSB data reception Just like the core UMP conversion helper, we need to deal with the partially-filled RPN/NRPN data in the sequencer UMP converter as well. Link: https://patch.msgid.link/20240731130528.12600-5-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 74 +++++++++++++++++++------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index b1bc6d122d92..7ca62667f28d 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -790,27 +790,39 @@ static int paf_ev_to_ump_midi2(const struct snd_seq_event *event, } /* set up the MIDI2 RPN/NRPN packet data from the parsed info */ -static void fill_rpn(struct ump_cvt_to_ump_bank *cc, - union snd_ump_midi2_msg *data, - unsigned char channel) +static int fill_rpn(struct ump_cvt_to_ump_bank *cc, + union snd_ump_midi2_msg *data, + unsigned char channel, + bool flush) { + if (!(cc->cc_data_lsb_set || cc->cc_data_msb_set)) + return 0; // skip + /* when not flushing, wait for complete data set */ + if (!flush && (!cc->cc_data_lsb_set || !cc->cc_data_msb_set)) + return 0; // skip + if (cc->rpn_set) { data->rpn.status = UMP_MSG_STATUS_RPN; data->rpn.bank = cc->cc_rpn_msb; data->rpn.index = cc->cc_rpn_lsb; - cc->rpn_set = 0; - cc->cc_rpn_msb = cc->cc_rpn_lsb = 0; - } else { + } else if (cc->nrpn_set) { data->rpn.status = UMP_MSG_STATUS_NRPN; data->rpn.bank = cc->cc_nrpn_msb; data->rpn.index = cc->cc_nrpn_lsb; - cc->nrpn_set = 0; - cc->cc_nrpn_msb = cc->cc_nrpn_lsb = 0; + } else { + return 0; // skip } + data->rpn.data = upscale_14_to_32bit((cc->cc_data_msb << 7) | cc->cc_data_lsb); data->rpn.channel = channel; + + cc->rpn_set = 0; + cc->nrpn_set = 0; + cc->cc_rpn_msb = cc->cc_rpn_lsb = 0; cc->cc_data_msb = cc->cc_data_lsb = 0; + cc->cc_data_msb_set = cc->cc_data_lsb_set = 0; + return 1; } /* convert CC event to MIDI 2.0 UMP */ @@ -823,28 +835,34 @@ static int cc_ev_to_ump_midi2(const struct snd_seq_event *event, unsigned char index = event->data.control.param & 0x7f; unsigned char val = event->data.control.value & 0x7f; struct ump_cvt_to_ump_bank *cc = &dest_port->midi2_bank[channel]; + int ret; /* process special CC's (bank/rpn/nrpn) */ switch (index) { case UMP_CC_RPN_MSB: + ret = fill_rpn(cc, data, channel, true); cc->rpn_set = 1; cc->cc_rpn_msb = val; - return 0; // skip + return ret; case UMP_CC_RPN_LSB: + ret = fill_rpn(cc, data, channel, true); cc->rpn_set = 1; cc->cc_rpn_lsb = val; - return 0; // skip + return ret; case UMP_CC_NRPN_MSB: + ret = fill_rpn(cc, data, channel, true); cc->nrpn_set = 1; cc->cc_nrpn_msb = val; - return 0; // skip + return ret; case UMP_CC_NRPN_LSB: + ret = fill_rpn(cc, data, channel, true); cc->nrpn_set = 1; cc->cc_nrpn_lsb = val; - return 0; // skip + return ret; case UMP_CC_DATA: + cc->cc_data_msb_set = 1; cc->cc_data_msb = val; - return 0; // skip + return fill_rpn(cc, data, channel, false); case UMP_CC_BANK_SELECT: cc->bank_set = 1; cc->cc_bank_msb = val; @@ -854,11 +872,9 @@ static int cc_ev_to_ump_midi2(const struct snd_seq_event *event, cc->cc_bank_lsb = val; return 0; // skip case UMP_CC_DATA_LSB: + cc->cc_data_lsb_set = 1; cc->cc_data_lsb = val; - if (!(cc->rpn_set || cc->nrpn_set)) - return 0; // skip - fill_rpn(cc, data, channel); - return 1; + return fill_rpn(cc, data, channel, false); } data->cc.status = status; @@ -926,6 +942,7 @@ static 
int ctrl14_ev_to_ump_midi2(const struct snd_seq_event *event, unsigned char index = event->data.control.param & 0x7f; struct ump_cvt_to_ump_bank *cc = &dest_port->midi2_bank[channel]; unsigned char msb, lsb; + int ret; msb = (event->data.control.value >> 7) & 0x7f; lsb = event->data.control.value & 0x7f; @@ -939,28 +956,25 @@ static int ctrl14_ev_to_ump_midi2(const struct snd_seq_event *event, cc->cc_bank_lsb = lsb; return 0; // skip case UMP_CC_RPN_MSB: - cc->cc_rpn_msb = msb; - fallthrough; case UMP_CC_RPN_LSB: - cc->rpn_set = 1; + ret = fill_rpn(cc, data, channel, true); + cc->cc_rpn_msb = msb; cc->cc_rpn_lsb = lsb; - return 0; // skip + cc->rpn_set = 1; + return ret; case UMP_CC_NRPN_MSB: - cc->cc_nrpn_msb = msb; - fallthrough; case UMP_CC_NRPN_LSB: + ret = fill_rpn(cc, data, channel, true); + cc->cc_nrpn_msb = msb; cc->nrpn_set = 1; cc->cc_nrpn_lsb = lsb; - return 0; // skip + return ret; case UMP_CC_DATA: - cc->cc_data_msb = msb; - fallthrough; case UMP_CC_DATA_LSB: + cc->cc_data_msb_set = cc->cc_data_lsb_set = 1; + cc->cc_data_msb = msb; cc->cc_data_lsb = lsb; - if (!(cc->rpn_set || cc->nrpn_set)) - return 0; // skip - fill_rpn(cc, data, channel); - return 1; + return fill_rpn(cc, data, channel, false); } data->cc.status = UMP_MSG_STATUS_CC; From 98ea612dd1150adb61cd2a0e93875e1cc77e6b87 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 31 Jul 2024 15:05:26 +0200 Subject: [PATCH 0367/1052] ALSA: seq: ump: Explicitly reset RPN with Null RPN RPN with 127:127 is treated as a Null RPN, just to reset the parameters, and it's not translated to MIDI2. Although the current code can work as is in most cases, better to implement the RPN reset explicitly for Null message. Link: https://patch.msgid.link/20240731130528.12600-6-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index 7ca62667f28d..4dd540cbb1cb 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -789,6 +789,15 @@ static int paf_ev_to_ump_midi2(const struct snd_seq_event *event, return 1; } +static void reset_rpn(struct ump_cvt_to_ump_bank *cc) +{ + cc->rpn_set = 0; + cc->nrpn_set = 0; + cc->cc_rpn_msb = cc->cc_rpn_lsb = 0; + cc->cc_data_msb = cc->cc_data_lsb = 0; + cc->cc_data_msb_set = cc->cc_data_lsb_set = 0; +} + /* set up the MIDI2 RPN/NRPN packet data from the parsed info */ static int fill_rpn(struct ump_cvt_to_ump_bank *cc, union snd_ump_midi2_msg *data, @@ -817,11 +826,7 @@ static int fill_rpn(struct ump_cvt_to_ump_bank *cc, cc->cc_data_lsb); data->rpn.channel = channel; - cc->rpn_set = 0; - cc->nrpn_set = 0; - cc->cc_rpn_msb = cc->cc_rpn_lsb = 0; - cc->cc_data_msb = cc->cc_data_lsb = 0; - cc->cc_data_msb_set = cc->cc_data_lsb_set = 0; + reset_rpn(cc); return 1; } @@ -843,11 +848,15 @@ static int cc_ev_to_ump_midi2(const struct snd_seq_event *event, ret = fill_rpn(cc, data, channel, true); cc->rpn_set = 1; cc->cc_rpn_msb = val; + if (cc->cc_rpn_msb == 0x7f && cc->cc_rpn_lsb == 0x7f) + reset_rpn(cc); return ret; case UMP_CC_RPN_LSB: ret = fill_rpn(cc, data, channel, true); cc->rpn_set = 1; cc->cc_rpn_lsb = val; + if (cc->cc_rpn_msb == 0x7f && cc->cc_rpn_lsb == 0x7f) + reset_rpn(cc); return ret; case UMP_CC_NRPN_MSB: ret = fill_rpn(cc, data, channel, true); @@ -961,6 +970,8 @@ static int ctrl14_ev_to_ump_midi2(const struct snd_seq_event *event, cc->cc_rpn_msb = msb; cc->cc_rpn_lsb = lsb; cc->rpn_set = 1; 
+ if (cc->cc_rpn_msb == 0x7f && cc->cc_rpn_lsb == 0x7f) + reset_rpn(cc); return ret; case UMP_CC_NRPN_MSB: case UMP_CC_NRPN_LSB: From 00af4f3dda1461ec90d892edc10bec6d3c50c554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Mon, 15 Jul 2024 12:44:53 +0200 Subject: [PATCH 0368/1052] USB: serial: debug: do not echo input by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This driver is intended as a "client" end of the console connection. When connected to a host it's supposed to receive debug logs, and possibly allow to interact with whatever debug console is available there. Feeding messages back, depending on a configuration may cause log messages be executed as shell commands (which can be really bad if one is unlucky, imagine a log message like "prevented running `rm -rf /home`"). In case of Xen, it exposes sysrq-like debug interface, and feeding it its own logs will pretty quickly hit 'R' for "instant reboot". Contrary to a classic serial console, the USB one cannot be configured ahead of time, as the device shows up only when target OS is up. And at the time device is opened to execute relevant ioctl, it's already too late, especially when logs start flowing shortly after device is initialized. Avoid the issue by changing default to no echo for this type of devices. Signed-off-by: Marek Marczykowski-Górecki [ johan: amend summary; disable also ECHONL ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/usb_debug.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/serial/usb_debug.c b/drivers/usb/serial/usb_debug.c index 6934970f180d..5a8869cd95d5 100644 --- a/drivers/usb/serial/usb_debug.c +++ b/drivers/usb/serial/usb_debug.c @@ -76,6 +76,11 @@ static void usb_debug_process_read_urb(struct urb *urb) usb_serial_generic_process_read_urb(urb); } +static void usb_debug_init_termios(struct tty_struct *tty) +{ + tty->termios.c_lflag &= ~(ECHO | ECHONL); +} + static struct usb_serial_driver debug_device = { .driver = { .owner = THIS_MODULE, @@ -85,6 +90,7 @@ static struct usb_serial_driver debug_device = { .num_ports = 1, .bulk_out_size = USB_DEBUG_MAX_PACKET_SIZE, .break_ctl = usb_debug_break_ctl, + .init_termios = usb_debug_init_termios, .process_read_urb = usb_debug_process_read_urb, }; @@ -96,6 +102,7 @@ static struct usb_serial_driver dbc_device = { .id_table = dbc_id_table, .num_ports = 1, .break_ctl = usb_debug_break_ctl, + .init_termios = usb_debug_init_termios, .process_read_urb = usb_debug_process_read_urb, }; From 9da8aa3b3ca05b22be5ba312771e6df4366e56cc Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Wed, 31 Jul 2024 13:48:28 +0200 Subject: [PATCH 0369/1052] ASoC: nau8822: Lower debug print priority NAU8822 codec PLL parameters are not an information that the general user should care about, this print is supposed to be used for debugging, adjust the debug print priority accordingly. 
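The change is the usual downgrade from an always-on info print to a debug-only one. A hypothetical sketch of the two levels (not the codec driver itself):

#include <linux/device.h>

static void report_pll(struct device *dev, unsigned int pll_int)
{
	dev_info(dev, "pll_int=%x\n", pll_int);	/* logged on every call */
	dev_dbg(dev, "pll_int=%x\n", pll_int);	/* only with DEBUG or dynamic debug enabled */
}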
Signed-off-by: Francesco Dolcini Link: https://patch.msgid.link/20240731114828.61238-1-francesco@dolcini.it Signed-off-by: Mark Brown --- sound/soc/codecs/nau8822.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/nau8822.c b/sound/soc/codecs/nau8822.c index e1cbaf8a944d..fd4a96a12060 100644 --- a/sound/soc/codecs/nau8822.c +++ b/sound/soc/codecs/nau8822.c @@ -736,7 +736,7 @@ static int nau8822_set_pll(struct snd_soc_dai *dai, int pll_id, int source, return ret; } - dev_info(component->dev, + dev_dbg(component->dev, "pll_int=%x pll_frac=%x mclk_scaler=%x pre_factor=%x\n", pll_param->pll_int, pll_param->pll_frac, pll_param->mclk_scaler, pll_param->pre_factor); From b7b7e1ab7619deb3b299b5e5c619c3e6f183a12d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 31 Jul 2024 16:19:41 +0200 Subject: [PATCH 0370/1052] ALSA: usb-audio: Correct surround channels in UAC1 channel map USB-audio driver puts SNDRV_CHMAP_SL and _SR as left and right surround channels for UAC1 channel map, respectively. But they should have been SNDRV_CHMAP_RL and _RR; the current value *_SL and _SR are rather "side" channels, not "surround". I guess I took those mistakenly when I read the spec mentioning "surround left". This patch corrects those entries to be the right channels. Suggested-by: Sylvain BERTRAND Closes: https://lore.kernel.orgZ/qIyJD8lhd8hFhlC@freedom Fixes: 04324ccc75f9 ("ALSA: usb-audio: add channel map support") Cc: Link: https://patch.msgid.link/20240731142018.24750-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/stream.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/usb/stream.c b/sound/usb/stream.c index d5409f387945..e14c725acebf 100644 --- a/sound/usb/stream.c +++ b/sound/usb/stream.c @@ -244,8 +244,8 @@ static struct snd_pcm_chmap_elem *convert_chmap(int channels, unsigned int bits, SNDRV_CHMAP_FR, /* right front */ SNDRV_CHMAP_FC, /* center front */ SNDRV_CHMAP_LFE, /* LFE */ - SNDRV_CHMAP_SL, /* left surround */ - SNDRV_CHMAP_SR, /* right surround */ + SNDRV_CHMAP_RL, /* left surround */ + SNDRV_CHMAP_RR, /* right surround */ SNDRV_CHMAP_FLC, /* left of center */ SNDRV_CHMAP_FRC, /* right of center */ SNDRV_CHMAP_RC, /* surround */ From 4734406c39238cbeafe66f0060084caa3247ff53 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 25 Jul 2024 11:31:52 +0200 Subject: [PATCH 0371/1052] s390/fpu: Re-add exception handling in load_fpu_state() With the recent rewrite of the fpu code exception handling for the lfpc instruction within load_fpu_state() was erroneously removed. Add it again to prevent that loading invalid floating point register values cause an unhandled specification exception. 
Fixes: 8c09871a950a ("s390/fpu: limit save and restore to used registers") Cc: stable@vger.kernel.org Reported-by: Aristeu Rozanski Tested-by: Aristeu Rozanski Reviewed-by: Vasily Gorbik Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/fpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index fa90bbdc5ef9..6f2e87920288 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -113,7 +113,7 @@ void load_fpu_state(struct fpu *state, int flags) int mask; if (flags & KERNEL_FPC) - fpu_lfpc(&state->fpc); + fpu_lfpc_safe(&state->fpc); if (!cpu_has_vx()) { if (flags & KERNEL_VXR_V0V7) load_fp_regs_vx(state->vxrs); From 0a34c027a3dd756bdc17762db3acaeacdabacaf0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 27 Jul 2024 19:34:27 +0200 Subject: [PATCH 0372/1052] s390/alternatives: Remove unused empty header file Remove the unused and empty arch/s390/kernel/alternative.h header file which was added by mistake. Fixes: 5ade5be4edf8 ("s390: Add infrastructure to patch lowcore accesses") Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/alternative.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 arch/s390/kernel/alternative.h diff --git a/arch/s390/kernel/alternative.h b/arch/s390/kernel/alternative.h deleted file mode 100644 index e69de29bb2d1..000000000000 From c8e4d73eae835a1b9f6fc29f2577770ea8ca0c03 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 15 Jul 2024 08:58:51 -0700 Subject: [PATCH 0373/1052] s390/cio: Add missing MODULE_DESCRIPTION() macros With ARCH=s390, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/s390/cio/ccwgroup.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/s390/cio/vfio_ccw.o Add the missing invocations of the MODULE_DESCRIPTION() macro. Reviewed-by: Eric Farman Reviewed-by: Vineeth Vijayan Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240715-md-s390-drivers-s390-cio-v2-1-97eaa6971124@quicinc.com Signed-off-by: Vasily Gorbik --- drivers/s390/cio/ccwgroup.c | 1 + drivers/s390/cio/vfio_ccw_drv.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index b72f672a7720..66b1bdc63284 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -550,4 +550,5 @@ void ccwgroup_remove_ccwdev(struct ccw_device *cdev) put_device(&gdev->dev); } EXPORT_SYMBOL(ccwgroup_remove_ccwdev); +MODULE_DESCRIPTION("ccwgroup bus driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 8ad49030a7bf..914dde041675 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -488,4 +488,5 @@ static void __exit vfio_ccw_sch_exit(void) module_init(vfio_ccw_sch_init); module_exit(vfio_ccw_sch_exit); +MODULE_DESCRIPTION("VFIO based Subchannel device driver"); MODULE_LICENSE("GPL v2"); From 373953444ce542db43535861fb8ebf3a1e05669c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Jul 2024 20:49:53 +0200 Subject: [PATCH 0374/1052] s390/mm/ptdump: Fix handling of identity mapping area Since virtual and real addresses are not the same anymore the assumption that the kernel image is contained within the identity mapping is also not true anymore. Fix this by adding two explicit areas and at the correct locations: one for the 8kb lowcore area, and one for the identity mapping. 
Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/dump_pagetables.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 98dab3e049de..ea1e4b92a1c9 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -20,8 +20,8 @@ struct addr_marker { }; enum address_markers_idx { - IDENTITY_BEFORE_NR = 0, - IDENTITY_BEFORE_END_NR, + LOWCORE_START_NR = 0, + LOWCORE_END_NR, AMODE31_START_NR, AMODE31_END_NR, KERNEL_START_NR, @@ -30,8 +30,8 @@ enum address_markers_idx { KFENCE_START_NR, KFENCE_END_NR, #endif - IDENTITY_AFTER_NR, - IDENTITY_AFTER_END_NR, + IDENTITY_START_NR, + IDENTITY_END_NR, VMEMMAP_NR, VMEMMAP_END_NR, VMALLOC_NR, @@ -59,8 +59,10 @@ enum address_markers_idx { }; static struct addr_marker address_markers[] = { - [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, - [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, + [LOWCORE_START_NR] = {0, "Lowcore Start"}, + [LOWCORE_END_NR] = {0, "Lowcore End"}, + [IDENTITY_START_NR] = {0, "Identity Mapping Start"}, + [IDENTITY_END_NR] = {0, "Identity Mapping End"}, [AMODE31_START_NR] = {0, "Amode31 Area Start"}, [AMODE31_END_NR] = {0, "Amode31 Area End"}, [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, @@ -69,8 +71,6 @@ static struct addr_marker address_markers[] = { [KFENCE_START_NR] = {0, "KFence Pool Start"}, [KFENCE_END_NR] = {0, "KFence Pool End"}, #endif - [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, - [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, [VMEMMAP_NR] = {0, "vmemmap Area Start"}, [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, [VMALLOC_NR] = {0, "vmalloc Area Start"}, @@ -310,7 +310,10 @@ static int pt_dump_init(void) */ max_addr = (get_lowcore()->kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); - address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; + address_markers[LOWCORE_START_NR].start_address = 0; + address_markers[LOWCORE_END_NR].start_address = sizeof(struct lowcore); + address_markers[IDENTITY_START_NR].start_address = __identity_base; + address_markers[IDENTITY_END_NR].start_address = __identity_base + ident_map_size; address_markers[AMODE31_START_NR].start_address = (unsigned long)__samode31; address_markers[AMODE31_END_NR].start_address = (unsigned long)__eamode31; address_markers[MODULES_NR].start_address = MODULES_VADDR; From 7e4d4cfed6483303436687f1db9a28a377c6ab2a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Jul 2024 20:49:54 +0200 Subject: [PATCH 0375/1052] s390/mm/ptdump: Add support for relocated lowcore mapping The page table dumper contains a hard coded assumption that the first mapped area starts at address zero. With a relocated lowcore this is not true anymore. Subsequently the first entry (lowcore) is printed as if it would contain everything from address zero until the end of the location of the lowcore area. Fix this by adding a single "Kernel Virtual Address Space" entry, which always starts at address zero. It ends when the lowcore area starts which is either address zero, or its relocated address. 
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/dump_pagetables.c | 35 +++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index ea1e4b92a1c9..91e0cf7e0f94 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -20,7 +20,8 @@ struct addr_marker { }; enum address_markers_idx { - LOWCORE_START_NR = 0, + KVA_NR = 0, + LOWCORE_START_NR, LOWCORE_END_NR, AMODE31_START_NR, AMODE31_END_NR, @@ -59,6 +60,7 @@ enum address_markers_idx { }; static struct addr_marker address_markers[] = { + [KVA_NR] = {0, "Kernel Virtual Address Space"}, [LOWCORE_START_NR] = {0, "Lowcore Start"}, [LOWCORE_END_NR] = {0, "Lowcore End"}, [IDENTITY_START_NR] = {0, "Identity Mapping Start"}, @@ -163,6 +165,19 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } +static void note_page_update_state(struct pg_state *st, unsigned long addr, unsigned int prot, int level) +{ + struct seq_file *m = st->seq; + + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); + } + st->start_address = addr; + st->current_prot = prot; + st->level = level; +} + static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { int width = sizeof(unsigned long) * 2; @@ -186,9 +201,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, addr = max_addr; if (st->level == -1) { pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); - st->start_address = addr; - st->current_prot = prot; - st->level = level; + note_page_update_state(st, addr, prot, level); } else if (prot != st->current_prot || level != st->level || addr >= st->marker[1].start_address) { note_prot_wx(st, addr); @@ -202,13 +215,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - while (addr >= st->marker[1].start_address) { - st->marker++; - pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); - } - st->start_address = addr; - st->current_prot = prot; - st->level = level; + note_page_update_state(st, addr, prot, level); } } @@ -303,6 +310,8 @@ static int pt_dump_init(void) #ifdef CONFIG_KFENCE unsigned long kfence_start = (unsigned long)__kfence_pool; #endif + unsigned long lowcore = (unsigned long)get_lowcore(); + /* * Figure out the maximum virtual address being accessible with the * kernel ASCE. 
We need this to keep the page table walker functions @@ -310,8 +319,8 @@ static int pt_dump_init(void) */ max_addr = (get_lowcore()->kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); - address_markers[LOWCORE_START_NR].start_address = 0; - address_markers[LOWCORE_END_NR].start_address = sizeof(struct lowcore); + address_markers[LOWCORE_START_NR].start_address = lowcore; + address_markers[LOWCORE_END_NR].start_address = lowcore + sizeof(struct lowcore); address_markers[IDENTITY_START_NR].start_address = __identity_base; address_markers[IDENTITY_END_NR].start_address = __identity_base + ident_map_size; address_markers[AMODE31_START_NR].start_address = (unsigned long)__samode31; From 7e12284c523be33f34d43b2bf8627ab0c8af2388 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Jul 2024 20:49:55 +0200 Subject: [PATCH 0376/1052] s390/mm/ptdump: Improve sorting of markers Use the sort() from lib/sort.c to sort markers instead of the private implementation. The current implementation does not sort markers properly if they have to be moved downwards: ---[ Real Memory Copy Area Start ]--- 0x0000035b903ff000-0x0000035b90400000 4K PTE I ---[ vmalloc Area Start ]--- ---[ Real Memory Copy Area End ]--- Add a new member to each marker which indicates if a marker is start of an area. If addresses of areas are equal consider an address which defines the start of an area higher than the address which defines the end of an area. In result the output is sorted as intended: ---[ Real Memory Copy Area Start ]--- 0x0000019cedcff000-0x0000019cedd00000 4K PTE I ---[ Real Memory Copy Area End ]--- ---[ vmalloc Area Start ]--- Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/dump_pagetables.c | 100 +++++++++++++++++---------------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 91e0cf7e0f94..0a67fcee4414 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,7 @@ static unsigned long max_addr; struct addr_marker { + int is_start; unsigned long start_address; const char *name; }; @@ -60,44 +62,44 @@ enum address_markers_idx { }; static struct addr_marker address_markers[] = { - [KVA_NR] = {0, "Kernel Virtual Address Space"}, - [LOWCORE_START_NR] = {0, "Lowcore Start"}, - [LOWCORE_END_NR] = {0, "Lowcore End"}, - [IDENTITY_START_NR] = {0, "Identity Mapping Start"}, - [IDENTITY_END_NR] = {0, "Identity Mapping End"}, - [AMODE31_START_NR] = {0, "Amode31 Area Start"}, - [AMODE31_END_NR] = {0, "Amode31 Area End"}, - [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, - [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, + [KVA_NR] = {0, 0, "Kernel Virtual Address Space"}, + [LOWCORE_START_NR] = {1, 0, "Lowcore Start"}, + [LOWCORE_END_NR] = {0, 0, "Lowcore End"}, + [IDENTITY_START_NR] = {1, 0, "Identity Mapping Start"}, + [IDENTITY_END_NR] = {0, 0, "Identity Mapping End"}, + [AMODE31_START_NR] = {1, 0, "Amode31 Area Start"}, + [AMODE31_END_NR] = {0, 0, "Amode31 Area End"}, + [KERNEL_START_NR] = {1, (unsigned long)_stext, "Kernel Image Start"}, + [KERNEL_END_NR] = {0, (unsigned long)_end, "Kernel Image End"}, #ifdef CONFIG_KFENCE - [KFENCE_START_NR] = {0, "KFence Pool Start"}, - [KFENCE_END_NR] = {0, "KFence Pool End"}, + [KFENCE_START_NR] = {1, 0, "KFence Pool Start"}, + [KFENCE_END_NR] = {0, 0, 
"KFence Pool End"}, #endif - [VMEMMAP_NR] = {0, "vmemmap Area Start"}, - [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, - [VMALLOC_NR] = {0, "vmalloc Area Start"}, - [VMALLOC_END_NR] = {0, "vmalloc Area End"}, + [VMEMMAP_NR] = {1, 0, "vmemmap Area Start"}, + [VMEMMAP_END_NR] = {0, 0, "vmemmap Area End"}, + [VMALLOC_NR] = {1, 0, "vmalloc Area Start"}, + [VMALLOC_END_NR] = {0, 0, "vmalloc Area End"}, #ifdef CONFIG_KMSAN - [KMSAN_VMALLOC_SHADOW_START_NR] = {0, "Kmsan vmalloc Shadow Start"}, - [KMSAN_VMALLOC_SHADOW_END_NR] = {0, "Kmsan vmalloc Shadow End"}, - [KMSAN_VMALLOC_ORIGIN_START_NR] = {0, "Kmsan vmalloc Origins Start"}, - [KMSAN_VMALLOC_ORIGIN_END_NR] = {0, "Kmsan vmalloc Origins End"}, - [KMSAN_MODULES_SHADOW_START_NR] = {0, "Kmsan Modules Shadow Start"}, - [KMSAN_MODULES_SHADOW_END_NR] = {0, "Kmsan Modules Shadow End"}, - [KMSAN_MODULES_ORIGIN_START_NR] = {0, "Kmsan Modules Origins Start"}, - [KMSAN_MODULES_ORIGIN_END_NR] = {0, "Kmsan Modules Origins End"}, + [KMSAN_VMALLOC_SHADOW_START_NR] = {1, 0, "Kmsan vmalloc Shadow Start"}, + [KMSAN_VMALLOC_SHADOW_END_NR] = {0, 0, "Kmsan vmalloc Shadow End"}, + [KMSAN_VMALLOC_ORIGIN_START_NR] = {1, 0, "Kmsan vmalloc Origins Start"}, + [KMSAN_VMALLOC_ORIGIN_END_NR] = {0, 0, "Kmsan vmalloc Origins End"}, + [KMSAN_MODULES_SHADOW_START_NR] = {1, 0, "Kmsan Modules Shadow Start"}, + [KMSAN_MODULES_SHADOW_END_NR] = {0, 0, "Kmsan Modules Shadow End"}, + [KMSAN_MODULES_ORIGIN_START_NR] = {1, 0, "Kmsan Modules Origins Start"}, + [KMSAN_MODULES_ORIGIN_END_NR] = {0, 0, "Kmsan Modules Origins End"}, #endif - [MODULES_NR] = {0, "Modules Area Start"}, - [MODULES_END_NR] = {0, "Modules Area End"}, - [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, - [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, - [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, - [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, + [MODULES_NR] = {1, 0, "Modules Area Start"}, + [MODULES_END_NR] = {0, 0, "Modules Area End"}, + [ABS_LOWCORE_NR] = {1, 0, "Lowcore Area Start"}, + [ABS_LOWCORE_END_NR] = {0, 0, "Lowcore Area End"}, + [MEMCPY_REAL_NR] = {1, 0, "Real Memory Copy Area Start"}, + [MEMCPY_REAL_END_NR] = {0, 0, "Real Memory Copy Area End"}, #ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, - [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, + [KASAN_SHADOW_START_NR] = {1, KASAN_SHADOW_START, "Kasan Shadow Start"}, + [KASAN_SHADOW_END_NR] = {0, KASAN_SHADOW_END, "Kasan Shadow End"}, #endif - { -1, NULL } + {1, -1UL, NULL} }; struct pg_state { @@ -287,22 +289,25 @@ static int ptdump_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump); #endif /* CONFIG_PTDUMP_DEBUGFS */ -/* - * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple - * insertion sort to preserve the original order of markers with the same - * start address. 
- */ -static void sort_address_markers(void) +static int ptdump_cmp(const void *a, const void *b) { - struct addr_marker tmp; - int i, j; + const struct addr_marker *ama = a; + const struct addr_marker *amb = b; - for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) { - tmp = address_markers[i]; - for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--) - address_markers[j + 1] = address_markers[j]; - address_markers[j + 1] = tmp; - } + if (ama->start_address > amb->start_address) + return 1; + if (ama->start_address < amb->start_address) + return -1; + /* + * If the start addresses of two markers are identical consider the + * marker which defines the start of an area higher than the one which + * defines the end of an area. This keeps pairs of markers sorted. + */ + if (ama->is_start) + return 1; + if (amb->is_start) + return -1; + return 0; } static int pt_dump_init(void) @@ -349,7 +354,8 @@ static int pt_dump_init(void) address_markers[KMSAN_MODULES_ORIGIN_START_NR].start_address = KMSAN_MODULES_ORIGIN_START; address_markers[KMSAN_MODULES_ORIGIN_END_NR].start_address = KMSAN_MODULES_ORIGIN_END; #endif - sort_address_markers(); + sort(address_markers, ARRAY_SIZE(address_markers) - 1, + sizeof(address_markers[0]), ptdump_cmp, NULL); #ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); #endif /* CONFIG_PTDUMP_DEBUGFS */ From 1e72ba5566d90a668c1c0fbde319cec03454fd20 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 29 Jul 2024 15:45:58 +0200 Subject: [PATCH 0377/1052] s390/mm: Get rid of RELOC_HIDE() Since __va(0) does not translate to NULL anymore remove RELOC_HIDE() which was only added to get rid of a compile warning with clang W=1: arch/s390/mm/vmem.c:666:36: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic] 666 | __set_memory_4k(__va(0), __va(0) + ident_map_size); | ~~~~~~~ ^ Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/vmem.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 41c714e21292..4edb39e2adb7 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -670,14 +670,8 @@ void __init vmem_map_init(void) */ if (!static_key_enabled(&cpu_has_bear)) set_memory_x(0, 1); - if (debug_pagealloc_enabled()) { - /* - * Use RELOC_HIDE() as long as __va(0) translates to NULL, - * since performing pointer arithmetic on a NULL pointer - * has undefined behavior and generates compiler warnings. 
- */ - __set_memory_4k(__va(0), RELOC_HIDE(__va(0), ident_map_size)); - } + if (debug_pagealloc_enabled()) + __set_memory_4k(__va(0), __va(0) + ident_map_size); if (MACHINE_HAS_NX) system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); pr_info("Write protected kernel read-only data: %luk\n", From 75c10d5377d8821efafed32e4d72068d9c1f8ec0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 29 Jul 2024 13:06:43 +0200 Subject: [PATCH 0378/1052] s390/vmlinux.lds.S: Move ro_after_init section behind rodata section The .data.rel.ro and .got section were added between the rodata and ro_after_init data section, which adds an RW mapping in between all RO mapping of the kernel image: ---[ Kernel Image Start ]--- 0x000003ffe0000000-0x000003ffe0e00000 14M PMD RO X 0x000003ffe0e00000-0x000003ffe0ec7000 796K PTE RO X 0x000003ffe0ec7000-0x000003ffe0f00000 228K PTE RO NX 0x000003ffe0f00000-0x000003ffe1300000 4M PMD RO NX 0x000003ffe1300000-0x000003ffe1331000 196K PTE RO NX 0x000003ffe1331000-0x000003ffe13b3000 520K PTE RW NX <--- 0x000003ffe13b3000-0x000003ffe13d5000 136K PTE RO NX 0x000003ffe13d5000-0x000003ffe1400000 172K PTE RW NX 0x000003ffe1400000-0x000003ffe1500000 1M PMD RW NX 0x000003ffe1500000-0x000003ffe1700000 2M PTE RW NX 0x000003ffe1700000-0x000003ffe1800000 1M PMD RW NX 0x000003ffe1800000-0x000003ffe187e000 504K PTE RW NX ---[ Kernel Image End ]--- Move the ro_after_init data section again right behind the rodata section to prevent interleaving RO and RW mappings: ---[ Kernel Image Start ]--- 0x000003ffe0000000-0x000003ffe0e00000 14M PMD RO X 0x000003ffe0e00000-0x000003ffe0ec7000 796K PTE RO X 0x000003ffe0ec7000-0x000003ffe0f00000 228K PTE RO NX 0x000003ffe0f00000-0x000003ffe1300000 4M PMD RO NX 0x000003ffe1300000-0x000003ffe1353000 332K PTE RO NX 0x000003ffe1353000-0x000003ffe1400000 692K PTE RW NX 0x000003ffe1400000-0x000003ffe1500000 1M PMD RW NX 0x000003ffe1500000-0x000003ffe1700000 2M PTE RW NX 0x000003ffe1700000-0x000003ffe1800000 1M PMD RW NX 0x000003ffe1800000-0x000003ffe187e000 504K PTE RW NX ---[ Kernel Image End ]--- Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vmlinux.lds.S | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 975c654cf5a5..e67cd409b858 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -59,14 +59,6 @@ SECTIONS } :text = 0x0700 RO_DATA(PAGE_SIZE) - .data.rel.ro : { - *(.data.rel.ro .data.rel.ro.*) - } - .got : { - __got_start = .; - *(.got) - __got_end = .; - } . = ALIGN(PAGE_SIZE); _sdata = .; /* Start of data section */ @@ -80,6 +72,15 @@ SECTIONS . 
= ALIGN(PAGE_SIZE); __end_ro_after_init = .; + .data.rel.ro : { + *(.data.rel.ro .data.rel.ro.*) + } + .got : { + __got_start = .; + *(.got) + __got_end = .; + } + RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE) .data.rel : { *(.data.rel*) From 33bd8d153c337ba9b30a2e5994437ca703ab4ed8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 29 Jul 2024 13:06:44 +0200 Subject: [PATCH 0379/1052] s390: Keep inittext section writable There is no added security by making the inittext section non-writable, however it does split part of the kernel mapping into 4K mappings instead of 1M mappings: ---[ Kernel Image Start ]--- 0x000003ffe0000000-0x000003ffe0e00000 14M PMD RO X 0x000003ffe0e00000-0x000003ffe0ec7000 796K PTE RO X 0x000003ffe0ec7000-0x000003ffe0f00000 228K PTE RO NX 0x000003ffe0f00000-0x000003ffe1300000 4M PMD RO NX 0x000003ffe1300000-0x000003ffe1353000 332K PTE RO NX 0x000003ffe1353000-0x000003ffe1400000 692K PTE RW NX 0x000003ffe1400000-0x000003ffe1500000 1M PMD RW NX 0x000003ffe1500000-0x000003ffe1700000 2M PTE RW NX <--- 0x000003ffe1700000-0x000003ffe1800000 1M PMD RW NX 0x000003ffe1800000-0x000003ffe187e000 504K PTE RW NX ---[ Kernel Image End ]--- Keep the inittext writable and enable instruction execution protection (aka noexec) later to prevent this. This also allows to use the generic free_initmem() implementation. ---[ Kernel Image Start ]--- 0x000003ffe0000000-0x000003ffe0e00000 14M PMD RO X 0x000003ffe0e00000-0x000003ffe0ec7000 796K PTE RO X 0x000003ffe0ec7000-0x000003ffe0f00000 228K PTE RO NX 0x000003ffe0f00000-0x000003ffe1300000 4M PMD RO NX 0x000003ffe1300000-0x000003ffe1353000 332K PTE RO NX 0x000003ffe1353000-0x000003ffe1400000 692K PTE RW NX 0x000003ffe1400000-0x000003ffe1800000 4M PMD RW NX <--- 0x000003ffe1800000-0x000003ffe187e000 504K PTE RW NX ---[ Kernel Image End ]--- Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/init.c | 9 ++------- arch/s390/mm/vmem.c | 3 --- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ddcd39ef4346..e3d258f9e726 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -108,6 +108,8 @@ void mark_rodata_ro(void) { unsigned long size = __end_ro_after_init - __start_ro_after_init; + if (MACHINE_HAS_NX) + system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); } @@ -170,13 +172,6 @@ void __init mem_init(void) setup_zero_pages(); /* Setup zeroed pages. 
*/ } -void free_initmem(void) -{ - set_memory_rwnx((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT); - free_initmem_default(POISON_FREE_INITMEM); -} - unsigned long memory_block_size_bytes(void) { /* diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 4edb39e2adb7..665b8228afeb 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -661,7 +661,6 @@ void __init vmem_map_init(void) { __set_memory_rox(_stext, _etext); __set_memory_ro(_etext, __end_rodata); - __set_memory_rox(_sinittext, _einittext); __set_memory_rox(__stext_amode31, __etext_amode31); /* * If the BEAR-enhancement facility is not installed the first @@ -672,8 +671,6 @@ void __init vmem_map_init(void) set_memory_x(0, 1); if (debug_pagealloc_enabled()) __set_memory_4k(__va(0), __va(0) + ident_map_size); - if (MACHINE_HAS_NX) - system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } From e1cf752ede8e82c2d084868c50a1ca6cdb07c9c4 Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Wed, 12 Jun 2024 11:29:34 +0200 Subject: [PATCH 0380/1052] dt-bindings: eeprom: at25: add fujitsu,mb85rs256 compatible The fujitsu,mb85rs256 is a 256 Kbit SPI memory FRAM in the same family as the two existing fujitsu,mb85rs* compatibles and at25 compatible. Signed-off-by: Francesco Dolcini Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240612092934.12282-1-francesco@dolcini.it Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/eeprom/at25.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/eeprom/at25.yaml b/Documentation/devicetree/bindings/eeprom/at25.yaml index 1715b0c9feea..c31e5e719525 100644 --- a/Documentation/devicetree/bindings/eeprom/at25.yaml +++ b/Documentation/devicetree/bindings/eeprom/at25.yaml @@ -28,6 +28,7 @@ properties: - anvo,anv32e61w - atmel,at25256B - fujitsu,mb85rs1mt + - fujitsu,mb85rs256 - fujitsu,mb85rs64 - microchip,at25160bn - microchip,25lc040 From 6339b7edada2d0c4661bc4200f1adfc80f2e24aa Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 30 Jul 2024 16:01:36 +0530 Subject: [PATCH 0381/1052] nvme: remove a field from nvme_ns_head pi_offset field is not required to be present in nvme_ns_head. 
Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 16 ++++++++-------- drivers/nvme/host/nvme.h | 1 - 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e8afb5a0f3a3..33fa01c599ad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -36,6 +36,7 @@ struct nvme_ns_info { struct nvme_ns_ids ids; u32 nsid; __le32 anagrpid; + u8 pi_offset; bool is_shared; bool is_readonly; bool is_ready; @@ -1758,7 +1759,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } static bool nvme_init_integrity(struct nvme_ns_head *head, - struct queue_limits *lim) + struct queue_limits *lim, struct nvme_ns_info *info) { struct blk_integrity *bi = &lim->integrity; @@ -1816,7 +1817,7 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, } bi->tuple_size = head->ms; - bi->pi_offset = head->pi_offset; + bi->pi_offset = info->pi_offset; return true; } @@ -1902,12 +1903,11 @@ static void nvme_configure_pi_elbas(struct nvme_ns_head *head, static void nvme_configure_metadata(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, struct nvme_id_ns *id, - struct nvme_id_ns_nvm *nvm) + struct nvme_id_ns_nvm *nvm, struct nvme_ns_info *info) { head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); head->pi_type = 0; head->pi_size = 0; - head->pi_offset = 0; head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms); if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) return; @@ -1922,7 +1922,7 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl, if (head->pi_size && head->ms >= head->pi_size) head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; if (!(id->dps & NVME_NS_DPS_PI_FIRST)) - head->pi_offset = head->ms - head->pi_size; + info->pi_offset = head->ms - head->pi_size; if (ctrl->ops->flags & NVME_F_FABRICS) { /* @@ -2156,7 +2156,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, lim = queue_limits_start_update(ns->disk->queue); nvme_set_ctrl_limits(ns->ctrl, &lim); - nvme_configure_metadata(ns->ctrl, ns->head, id, nvm); + nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info); nvme_set_chunk_sectors(ns, id, &lim); if (!nvme_update_disk_info(ns, id, &lim)) capacity = 0; @@ -2176,7 +2176,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, * I/O to namespaces with metadata except when the namespace supports * PI, as it can strip/insert in that case. 
*/ - if (!nvme_init_integrity(ns->head, &lim)) + if (!nvme_init_integrity(ns->head, &lim, info)) capacity = 0; ret = queue_limits_commit_update(ns->disk->queue, &lim); @@ -2280,7 +2280,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) if (unsupported) ns->head->disk->flags |= GENHD_FL_HIDDEN; else - nvme_init_integrity(ns->head, &lim); + nvme_init_integrity(ns->head, &lim, info); ret = queue_limits_commit_update(ns->head->disk->queue, &lim); set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f900e44243ae..c6386af76d24 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -474,7 +474,6 @@ struct nvme_ns_head { u16 ms; u16 pi_size; u8 pi_type; - u8 pi_offset; u8 guard_type; #ifdef CONFIG_BLK_DEV_ZONED u64 zsze; From 73d148ccb9e1b62cdcb65e1c6a461229446a55a2 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 30 Jul 2024 16:01:37 +0530 Subject: [PATCH 0382/1052] nvme: change data type of lba_shift u8 fits the need, so stop using int for it. Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c6386af76d24..6039dc78b36e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -470,7 +470,7 @@ struct nvme_ns_head { struct nvme_effects_log *effects; u64 nuse; unsigned ns_id; - int lba_shift; + u8 lba_shift; u16 ms; u16 pi_size; u8 pi_type; From b4c1f33a5d59f577814d87704c45a745a35d8bd9 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 30 Jul 2024 16:01:38 +0530 Subject: [PATCH 0383/1052] nvme: reorganize nvme_ns_head fields shuffle few fields to reduce the holes within nvme_ns_head. On x86_64, the size is reduced to 1104 bytes from 1120 bytes. Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6039dc78b36e..ae5314d32943 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -462,19 +462,19 @@ struct nvme_ns_head { struct srcu_struct srcu; struct nvme_subsystem *subsys; struct nvme_ns_ids ids; - struct list_head entry; - struct kref ref; - bool shared; - bool passthru_err_log_enabled; - int instance; - struct nvme_effects_log *effects; - u64 nuse; - unsigned ns_id; u8 lba_shift; u16 ms; u16 pi_size; u8 pi_type; u8 guard_type; + struct list_head entry; + struct kref ref; + bool shared; + bool passthru_err_log_enabled; + struct nvme_effects_log *effects; + u64 nuse; + unsigned ns_id; + int instance; #ifdef CONFIG_BLK_DEV_ZONED u64 zsze; #endif From b75a22e7d4f23dcd4f78ed2ff368a3d2a4556c0c Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Thu, 18 Jul 2024 14:29:59 -0700 Subject: [PATCH 0384/1052] riscv: cpufeature: Do not drop Linux-internal extensions The Linux-internal Xlinuxenvcfg ISA extension is omitted from the riscv_isa_ext array because it has no DT binding and should not appear in /proc/cpuinfo. The logic added in commit 625034abd52a ("riscv: add ISA extensions validation callback") assumes all extensions are included in riscv_isa_ext, and so riscv_resolve_isa() wrongly drops Xlinuxenvcfg from the final ISA string. 
Instead, accept such Linux-internal ISA extensions as if they have no validation callback. Fixes: 625034abd52a ("riscv: add ISA extensions validation callback") Signed-off-by: Samuel Holland Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20240718213011.2600150-1-samuel.holland@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpufeature.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 8f20607adb40..b427188b28fc 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -432,28 +432,26 @@ static void __init riscv_resolve_isa(unsigned long *source_isa, bitmap_copy(prev_resolved_isa, resolved_isa, RISCV_ISA_EXT_MAX); for_each_set_bit(bit, source_isa, RISCV_ISA_EXT_MAX) { ext = riscv_get_isa_ext_data(bit); - if (!ext) - continue; - if (ext->validate) { + if (ext && ext->validate) { ret = ext->validate(ext, resolved_isa); if (ret == -EPROBE_DEFER) { loop = true; continue; } else if (ret) { /* Disable the extension entirely */ - clear_bit(ext->id, source_isa); + clear_bit(bit, source_isa); continue; } } - set_bit(ext->id, resolved_isa); + set_bit(bit, resolved_isa); /* No need to keep it in source isa now that it is enabled */ - clear_bit(ext->id, source_isa); + clear_bit(bit, source_isa); /* Single letter extensions get set in hwcap */ - if (ext->id < RISCV_ISA_EXT_BASE) - *this_hwcap |= isa2hwcap[ext->id]; + if (bit < RISCV_ISA_EXT_BASE) + *this_hwcap |= isa2hwcap[bit]; } } while (loop && memcmp(prev_resolved_isa, resolved_isa, sizeof(prev_resolved_isa))); } From 21b136cc63d2a9ddd60d4699552b69c214b32964 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 30 Jul 2024 15:44:16 -0700 Subject: [PATCH 0385/1052] minmax: fix up min3() and max3() too David Laight pointed out that we should deal with the min3() and max3() mess too, which still does excessive expansion. And our current macros are actually rather broken. In particular, the macros did this: #define min3(x, y, z) min((typeof(x))min(x, y), z) #define max3(x, y, z) max((typeof(x))max(x, y), z) and that not only is a nested expansion of possibly very complex arguments with all that involves, the typing with that "typeof()" cast is completely wrong. For example, imagine what happens in max3() if 'x' happens to be a 'unsigned char', but 'y' and 'z' are 'unsigned long'. The types are compatible, and there's no warning - but the result is just random garbage. No, I don't think we've ever hit that issue in practice, but since we now have sane infrastructure for doing this right, let's just use it. It fixes any excessive expansion, and also avoids these kinds of broken type issues. 
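To make the truncation described above concrete, here is a minimal user-space sketch; this is an assumption for illustration only, not kernel code: OLD_MAX()/OLD_MAX3() are simplified stand-ins for the old kernel macros (no side-effect or type checking), and the values are hypothetical.

#include <stdio.h>

#define OLD_MAX(a, b)     ((a) > (b) ? (a) : (b))
#define OLD_MAX3(x, y, z) OLD_MAX((__typeof__(x))OLD_MAX(x, y), z)

int main(void)
{
	unsigned char x = 10;
	unsigned long y = 300, z = 200;

	/* OLD_MAX(x, y) is 300, but the cast to unsigned char truncates
	 * it to 44, so OLD_MAX3() yields 200 instead of 300. */
	printf("old max3 = %lu (expected 300)\n",
	       (unsigned long)OLD_MAX3(x, y, z));
	return 0;
}

With the new __careful_op3() form there is no narrowing cast, so this case evaluates to 300 via the usual arithmetic conversions, and genuinely mixed signedness now trips the build-time check instead of silently producing garbage.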
Requested-by: David Laight Acked-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 41da6f85a407..98008dd92153 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -152,13 +152,20 @@ #define umax(x, y) \ __careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull) +#define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ + __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ + BUILD_BUG_ON_MSG(!__types_ok3(x,y,z,ux,uy,uz), \ + #op"3("#x", "#y", "#z") signedness error"); \ + __cmp(op, ux, __cmp(op, uy, uz)); }) + /** * min3 - return minimum of three values * @x: first value * @y: second value * @z: third value */ -#define min3(x, y, z) min((typeof(x))min(x, y), z) +#define min3(x, y, z) \ + __careful_op3(min, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) /** * max3 - return maximum of three values @@ -166,7 +173,8 @@ * @y: second value * @z: third value */ -#define max3(x, y, z) max((typeof(x))max(x, y), z) +#define max3(x, y, z) \ + __careful_op3(max, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) /** * min_not_zero - return the minimum that is _not_ zero, unless both are zero From ba6a9018502eecb94e67001deeb48406fa71f916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 31 Jul 2024 08:37:25 +0200 Subject: [PATCH 0386/1052] selftests/bpf: do not disable /dev/null device access in cgroup dev test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_dev_cgroup currently loads a small bpf program allowing any access on urandom and zero devices, disabling access to any other device. It makes migrating this test to test_progs impossible, since this one manipulates extensively /dev/null. Allow /dev/null manipulation in dev_cgroup program to make its usage in test_progs framework possible. Update test_dev_cgroup.c as well to match this change while it has not been removed. Reviewed-by: Alan Maguire Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240731-convert_dev_cgroup-v4-1-849425d90de6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/dev_cgroup.c | 4 ++-- tools/testing/selftests/bpf/test_dev_cgroup.c | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/dev_cgroup.c b/tools/testing/selftests/bpf/progs/dev_cgroup.c index 79b54a4fa244..c1dfbd2b56fc 100644 --- a/tools/testing/selftests/bpf/progs/dev_cgroup.c +++ b/tools/testing/selftests/bpf/progs/dev_cgroup.c @@ -41,14 +41,14 @@ int bpf_prog1(struct bpf_cgroup_dev_ctx *ctx) bpf_trace_printk(fmt, sizeof(fmt), ctx->major, ctx->minor); #endif - /* Allow access to /dev/zero and /dev/random. + /* Allow access to /dev/null and /dev/urandom. * Forbid everything else. 
*/ if (ctx->major != 1 || type != BPF_DEVCG_DEV_CHAR) return 0; switch (ctx->minor) { - case 5: /* 1:5 /dev/zero */ + case 3: /* 1:3 /dev/null */ case 9: /* 1:9 /dev/urandom */ return 1; } diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c index adeaf63cb6fa..33f544f0005a 100644 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ b/tools/testing/selftests/bpf/test_dev_cgroup.c @@ -54,25 +54,25 @@ int main(int argc, char **argv) goto err; } - /* All operations with /dev/zero and and /dev/urandom are allowed, + /* All operations with /dev/null and /dev/urandom are allowed, * everything else is forbidden. */ - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - assert(system("mknod /tmp/test_dev_cgroup_null c 1 3")); - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - - /* /dev/zero is whitelisted */ assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5") == 0); + assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5")); assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - assert(system("dd if=/dev/urandom of=/dev/zero count=64") == 0); + /* /dev/null is whitelisted */ + assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); + assert(system("mknod /tmp/test_dev_cgroup_null c 1 3") == 0); + assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); + + assert(system("dd if=/dev/urandom of=/dev/null count=64") == 0); /* src is allowed, target is forbidden */ assert(system("dd if=/dev/urandom of=/dev/full count=64")); /* src is forbidden, target is allowed */ - assert(system("dd if=/dev/random of=/dev/zero count=64")); + assert(system("dd if=/dev/random of=/dev/null count=64")); error = 0; printf("test_dev_cgroup:PASS\n"); From d83d8230e415ecf727f6b88c0ae55193f149d650 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 31 Jul 2024 08:37:26 +0200 Subject: [PATCH 0387/1052] selftests/bpf: convert test_dev_cgroup to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_dev_cgroup is defined as a standalone test program, and so is not executed in CI. Convert it to test_progs framework so it is tested automatically in CI, and remove the old test. In order to be able to run it in test_progs, /dev/null must remain usable, so change the new test to test operations on devices 1:3 as valid, and operations on devices 1:5 (/dev/zero) as invalid. 
Reviewed-by: Alan Maguire Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240731-convert_dev_cgroup-v4-2-849425d90de6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 2 - .../selftests/bpf/prog_tests/cgroup_dev.c | 121 ++++++++++++++++++ tools/testing/selftests/bpf/test_dev_cgroup.c | 85 ------------ 4 files changed, 121 insertions(+), 88 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_dev.c delete mode 100644 tools/testing/selftests/bpf/test_dev_cgroup.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 4e4aae8aa7ec..8f14d8faeb0b 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -9,7 +9,6 @@ test_lpm_map test_tag FEATURE-DUMP.libbpf fixdep -test_dev_cgroup /test_progs /test_progs-no_alu32 /test_progs-bpf_gcc diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 774c6270e377..f54185e96a95 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -67,7 +67,6 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_dev_cgroup \ test_sock test_sockmap get_cgroup_id_user \ test_cgroup_storage \ test_tcpnotify_user test_sysctl \ @@ -292,7 +291,6 @@ JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o NETWORK_HELPERS := $(OUTPUT)/network_helpers.o -$(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c new file mode 100644 index 000000000000..8661e145ba84 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "test_progs.h" +#include "cgroup_helpers.h" +#include "dev_cgroup.skel.h" + +#define TEST_CGROUP "/test-bpf-based-device-cgroup/" +#define TEST_BUFFER_SIZE 64 + +static void test_mknod(const char *path, mode_t mode, int dev_major, + int dev_minor, int expected_ret, int expected_errno) +{ + int ret; + + unlink(path); + ret = mknod(path, mode, makedev(dev_major, dev_minor)); + ASSERT_EQ(ret, expected_ret, "mknod"); + if (expected_ret) + ASSERT_EQ(errno, expected_errno, "mknod errno"); + else + unlink(path); +} + +static void test_read(const char *path, char *buf, int buf_size, + int expected_ret, int expected_errno) +{ + int ret, fd; + + fd = open(path, O_RDONLY); + + /* A bare open on unauthorized device should fail */ + if (expected_ret < 0) { + ASSERT_EQ(fd, expected_ret, "open ret for read"); + ASSERT_EQ(errno, expected_errno, "open errno for read"); + if (fd >= 0) + close(fd); + return; + } + + if (!ASSERT_OK_FD(fd, "open ret for read")) + return; + + ret = read(fd, buf, buf_size); + ASSERT_EQ(ret, expected_ret, "read"); + + close(fd); +} + +static void test_write(const char *path, char *buf, int buf_size, + int expected_ret, int expected_errno) +{ + int ret, fd; + + fd = open(path, O_WRONLY); + + /* A bare open on unauthorized device should fail */ + if (expected_ret < 0) { + ASSERT_EQ(fd, 
expected_ret, "open ret for write"); + ASSERT_EQ(errno, expected_errno, "open errno for write"); + if (fd >= 0) + close(fd); + return; + } + + if (!ASSERT_OK_FD(fd, "open ret for write")) + return; + + ret = write(fd, buf, buf_size); + ASSERT_EQ(ret, expected_ret, "write"); + + close(fd); +} + +void test_cgroup_dev(void) +{ + char buf[TEST_BUFFER_SIZE] = "some random test data"; + struct dev_cgroup *skel; + int cgroup_fd; + + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup switch")) + return; + + skel = dev_cgroup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "load program")) + goto cleanup_cgroup; + + skel->links.bpf_prog1 = + bpf_program__attach_cgroup(skel->progs.bpf_prog1, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.bpf_prog1, "attach_program")) + goto cleanup_progs; + + if (test__start_subtest("allow-mknod")) + test_mknod("/dev/test_dev_cgroup_null", S_IFCHR, 1, 3, 0, 0); + + if (test__start_subtest("allow-read")) + test_read("/dev/urandom", buf, TEST_BUFFER_SIZE, + TEST_BUFFER_SIZE, 0); + + if (test__start_subtest("allow-write")) + test_write("/dev/null", buf, TEST_BUFFER_SIZE, + TEST_BUFFER_SIZE, 0); + + if (test__start_subtest("deny-mknod")) + test_mknod("/dev/test_dev_cgroup_zero", S_IFCHR, 1, 5, -1, + EPERM); + + if (test__start_subtest("deny-read")) + test_read("/dev/random", buf, TEST_BUFFER_SIZE, -1, EPERM); + + if (test__start_subtest("deny-write")) + test_write("/dev/zero", buf, TEST_BUFFER_SIZE, -1, EPERM); + +cleanup_progs: + dev_cgroup__destroy(skel); +cleanup_cgroup: + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c deleted file mode 100644 index 33f544f0005a..000000000000 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Copyright (c) 2017 Facebook - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "cgroup_helpers.h" -#include "testing_helpers.h" - -#define DEV_CGROUP_PROG "./dev_cgroup.bpf.o" - -#define TEST_CGROUP "/test-bpf-based-device-cgroup/" - -int main(int argc, char **argv) -{ - struct bpf_object *obj; - int error = EXIT_FAILURE; - int prog_fd, cgroup_fd; - __u32 prog_cnt; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - if (bpf_prog_test_load(DEV_CGROUP_PROG, BPF_PROG_TYPE_CGROUP_DEVICE, - &obj, &prog_fd)) { - printf("Failed to load DEV_CGROUP program\n"); - goto out; - } - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - if (cgroup_fd < 0) { - printf("Failed to create test cgroup\n"); - goto out; - } - - /* Attach bpf program */ - if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_DEVICE, 0)) { - printf("Failed to attach DEV_CGROUP program"); - goto err; - } - - if (bpf_prog_query(cgroup_fd, BPF_CGROUP_DEVICE, 0, NULL, NULL, - &prog_cnt)) { - printf("Failed to query attached programs"); - goto err; - } - - /* All operations with /dev/null and /dev/urandom are allowed, - * everything else is forbidden. 
- */ - assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5")); - assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - - /* /dev/null is whitelisted */ - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - assert(system("mknod /tmp/test_dev_cgroup_null c 1 3") == 0); - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - - assert(system("dd if=/dev/urandom of=/dev/null count=64") == 0); - - /* src is allowed, target is forbidden */ - assert(system("dd if=/dev/urandom of=/dev/full count=64")); - - /* src is forbidden, target is allowed */ - assert(system("dd if=/dev/random of=/dev/null count=64")); - - error = 0; - printf("test_dev_cgroup:PASS\n"); - -err: - cleanup_cgroup_environment(); - -out: - return error; -} From 84cdbff4a93501b7199f0d2f1150961ee95c8582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 31 Jul 2024 08:37:27 +0200 Subject: [PATCH 0388/1052] selftests/bpf: add wrong type test to cgroup dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current cgroup_dev test mostly tests that device operation is accepted or refused base on passed major/minor (and so, any operation performed during test involves only char device) Add a small subtest ensuring that the device type passed to bpf program allows it to take decisions as well. Reviewed-by: Alan Maguire Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240731-convert_dev_cgroup-v4-3-849425d90de6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/cgroup_dev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c index 8661e145ba84..5ab7547e38c0 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c @@ -114,6 +114,10 @@ void test_cgroup_dev(void) if (test__start_subtest("deny-write")) test_write("/dev/zero", buf, TEST_BUFFER_SIZE, -1, EPERM); + if (test__start_subtest("deny-mknod-wrong-type")) + test_mknod("/dev/test_dev_cgroup_block", S_IFBLK, 1, 3, -1, + EPERM); + cleanup_progs: dev_cgroup__destroy(skel); cleanup_cgroup: From 478689b5990deb626a0b3f1ebf165979914d6be4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 31 Jul 2024 19:05:15 +0200 Subject: [PATCH 0389/1052] ALSA: hda: Conditionally use snooping for AMD HDMI The recent regression report revealed that the use of WC pages for AMD HDMI device together with AMD IOMMU leads to unexpected truncation or noises. The issue seems triggered by the change in the kernel core memory allocation that enables IOMMU driver to use always S/G buffers. Meanwhile, the use of WC pages has been a workaround for the similar issue with standard pages in the past. So, now we need to apply the workaround conditionally, namely, only when IOMMU isn't in place. This patch modifies the workaround code to check the DMA ops at first and apply the snoop-off only when needed. 
Fixes: f5ff79fddf0e ("dma-mapping: remove CONFIG_DMA_REMAP") Link: https://bugzilla.kernel.org/show_bug.cgi?id=219087 Link: https://patch.msgid.link/20240731170521.31714-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_controller.h | 2 +- sound/pci/hda/hda_intel.c | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_controller.h b/sound/pci/hda/hda_controller.h index c2d0109866e6..68c883f202ca 100644 --- a/sound/pci/hda/hda_controller.h +++ b/sound/pci/hda/hda_controller.h @@ -28,7 +28,7 @@ #else #define AZX_DCAPS_I915_COMPONENT 0 /* NOP */ #endif -/* 14 unused */ +#define AZX_DCAPS_AMD_ALLOC_FIX (1 << 14) /* AMD allocation workaround */ #define AZX_DCAPS_CTX_WORKAROUND (1 << 15) /* X-Fi workaround */ #define AZX_DCAPS_POSFIX_LPIB (1 << 16) /* Use LPIB as default */ #define AZX_DCAPS_AMD_WORKAROUND (1 << 17) /* AMD-specific workaround */ diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index b33602e64d17..97d33a48ff17 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -40,6 +40,7 @@ #ifdef CONFIG_X86 /* for snoop control */ +#include #include #include #endif @@ -306,7 +307,7 @@ enum { /* quirks for ATI HDMI with snoop off */ #define AZX_DCAPS_PRESET_ATI_HDMI_NS \ - (AZX_DCAPS_PRESET_ATI_HDMI | AZX_DCAPS_SNOOP_OFF) + (AZX_DCAPS_PRESET_ATI_HDMI | AZX_DCAPS_AMD_ALLOC_FIX) /* quirks for AMD SB */ #define AZX_DCAPS_PRESET_AMD_SB \ @@ -1702,6 +1703,13 @@ static void azx_check_snoop_available(struct azx *chip) if (chip->driver_caps & AZX_DCAPS_SNOOP_OFF) snoop = false; +#ifdef CONFIG_X86 + /* check the presence of DMA ops (i.e. IOMMU), disable snoop conditionally */ + if ((chip->driver_caps & AZX_DCAPS_AMD_ALLOC_FIX) && + !get_dma_ops(chip->card->dev)) + snoop = false; +#endif + chip->snoop = snoop; if (!snoop) { dev_info(chip->card->dev, "Force to non-snoop mode\n"); From 170c966cbe274e664288cfc12ee919d5e706dc50 Mon Sep 17 00:00:00 2001 From: Laura Nao Date: Tue, 30 Jul 2024 12:29:28 +0200 Subject: [PATCH 0390/1052] selftests: ksft: Fix finished() helper exit code on skipped tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Python finished() helper currently exits with KSFT_FAIL when there are only passed and skipped tests. Fix the logic to exit with KSFT_PASS instead, making it consistent with its C and bash counterparts (ksft_finished() and ktap_finished() respectively). Reviewed-by: Nícolas F. R. A. 
Prado Fixes: dacf1d7a78bf ("kselftest: Add test to verify probe of devices from discoverable buses") Signed-off-by: Laura Nao Reviewed-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest/ksft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest/ksft.py b/tools/testing/selftests/kselftest/ksft.py index cd89fb2bc10e..bf215790a89d 100644 --- a/tools/testing/selftests/kselftest/ksft.py +++ b/tools/testing/selftests/kselftest/ksft.py @@ -70,7 +70,7 @@ def test_result(condition, description=""): def finished(): - if ksft_cnt["pass"] == ksft_num_tests: + if ksft_cnt["pass"] + ksft_cnt["skip"] == ksft_num_tests: exit_code = KSFT_PASS else: exit_code = KSFT_FAIL From e0fa4132bfae725a60c50d53bac80ec31fc20d89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Tue, 2 Jul 2024 18:22:14 -0300 Subject: [PATCH 0391/1052] drm/atomic: Allow userspace to use explicit sync with atomic async flips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow userspace to use explicit synchronization with atomic async flips. That means that the flip will wait for some hardware fence, and then will flip as soon as possible (async) in regard of the vblank. Fixes: 0e26cc72c71c ("drm: Refuse to async flip with atomic prop changes") Signed-off-by: André Almeida Reviewed-by: Simon Ser Signed-off-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20240702212215.109696-1-andrealmeid@igalia.com --- drivers/gpu/drm/drm_atomic_uapi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c index 22bbb2d83e30..2e1d9391febe 100644 --- a/drivers/gpu/drm/drm_atomic_uapi.c +++ b/drivers/gpu/drm/drm_atomic_uapi.c @@ -1070,7 +1070,9 @@ int drm_atomic_set_property(struct drm_atomic_state *state, break; } - if (async_flip && prop != config->prop_fb_id) { + if (async_flip && + prop != config->prop_fb_id && + prop != config->prop_in_fence_fd) { ret = drm_atomic_plane_get_property(plane, plane_state, prop, &old_val); ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); From f85de245c6a8e2654e1e9158588bcf78e38cd5a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Tue, 2 Jul 2024 18:22:15 -0300 Subject: [PATCH 0392/1052] drm/atomic: Allow userspace to use damage clips with async flips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow userspace to use damage clips with atomic async flips. Damage clips are useful for partial plane updates, which can be helpful for clients that want to do flips asynchronously. 
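For illustration only, a hedged libdrm sketch of the kind of async atomic flip that this patch and the previous one permit. This helper is not part of the patch; the plane ID, property IDs, framebuffer ID, in-fence fd and damage-clips blob are all assumed to have been looked up or created elsewhere (e.g. via drmModeObjectGetProperties() and drmModeCreatePropertyBlob()).

#include <stdint.h>
#include <errno.h>
#include <xf86drm.h>
#include <xf86drmMode.h>

/* Hypothetical helper: async flip of one plane with an explicit in-fence
 * and a damage-clips blob, which the kernel now accepts. */
int async_flip(int fd, uint32_t plane_id, uint32_t fb_id, int in_fence_fd,
	       uint32_t damage_blob_id, uint32_t prop_fb_id,
	       uint32_t prop_in_fence_fd, uint32_t prop_damage_clips)
{
	drmModeAtomicReq *req = drmModeAtomicAlloc();
	int ret;

	if (!req)
		return -ENOMEM;

	drmModeAtomicAddProperty(req, plane_id, prop_fb_id, fb_id);
	drmModeAtomicAddProperty(req, plane_id, prop_in_fence_fd, in_fence_fd);
	drmModeAtomicAddProperty(req, plane_id, prop_damage_clips, damage_blob_id);

	/* Before these two patches, changing any plane property other than
	 * FB_ID in an async commit was rejected by the kernel. */
	ret = drmModeAtomicCommit(fd, req,
				  DRM_MODE_ATOMIC_NONBLOCK |
				  DRM_MODE_PAGE_FLIP_ASYNC, NULL);
	drmModeAtomicFree(req);
	return ret;
}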
Fixes: 0e26cc72c71c ("drm: Refuse to async flip with atomic prop changes") Signed-off-by: André Almeida Reviewed-by: Simon Ser Signed-off-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20240702212215.109696-2-andrealmeid@igalia.com --- drivers/gpu/drm/drm_atomic_uapi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c index 2e1d9391febe..7609c798d73d 100644 --- a/drivers/gpu/drm/drm_atomic_uapi.c +++ b/drivers/gpu/drm/drm_atomic_uapi.c @@ -1072,7 +1072,8 @@ int drm_atomic_set_property(struct drm_atomic_state *state, if (async_flip && prop != config->prop_fb_id && - prop != config->prop_in_fence_fd) { + prop != config->prop_in_fence_fd && + prop != config->prop_fb_damage_clips) { ret = drm_atomic_plane_get_property(plane, plane_state, prop, &old_val); ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); From bf514327c324bc8af64f359b341cc9b189c096fd Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Tue, 30 Jul 2024 16:15:12 +0200 Subject: [PATCH 0393/1052] x86/setup: Parse the builtin command line before merging Commit in Fixes was added as a catch-all for cases where the cmdline is parsed before being merged with the builtin one. And promptly one issue appeared, see Link below. The microcode loader really needs to parse it that early, but the merging happens later. Reshuffling the early boot nightmare^W code to handle that properly would be a painful exercise for another day so do the chicken thing and parse the builtin cmdline too before it has been merged. Fixes: 0c40b1c7a897 ("x86/setup: Warn when option parsing is done too early") Reported-by: Mike Lothian Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240730152108.GAZqkE5Dfi9AuKllRw@fat_crate.local Link: https://lore.kernel.org/r/20240722152330.GCZp55ck8E_FT4kPnC@fat_crate.local --- arch/x86/include/asm/cmdline.h | 4 ++++ arch/x86/kernel/setup.c | 2 +- arch/x86/lib/cmdline.c | 25 ++++++++++++++++++------- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index 6faaf27e8899..6cbd9ae58b21 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,6 +2,10 @@ #ifndef _ASM_X86_CMDLINE_H #define _ASM_X86_CMDLINE_H +#include + +extern char builtin_cmdline[COMMAND_LINE_SIZE]; + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); int cmdline_find_option(const char *cmdline_ptr, const char *option, char *buffer, int bufsize); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d34cad9b7b1..6129dc2ba784 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -164,7 +164,7 @@ unsigned long saved_video_mode; static char __initdata command_line[COMMAND_LINE_SIZE]; #ifdef CONFIG_CMDLINE_BOOL -static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; bool builtin_cmdline_added __ro_after_init; #endif diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 384da1fdd5c6..c65cd5550454 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -207,18 +207,29 @@ __cmdline_find_option(const char *cmdline, int max_cmdline_size, int cmdline_find_option_bool(const char *cmdline, const char *option) { - if (IS_ENABLED(CONFIG_CMDLINE_BOOL)) - WARN_ON_ONCE(!builtin_cmdline_added); + int ret; - 
return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); + ret = __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); + if (ret > 0) + return ret; + + if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added) + return __cmdline_find_option_bool(builtin_cmdline, COMMAND_LINE_SIZE, option); + + return ret; } int cmdline_find_option(const char *cmdline, const char *option, char *buffer, int bufsize) { - if (IS_ENABLED(CONFIG_CMDLINE_BOOL)) - WARN_ON_ONCE(!builtin_cmdline_added); + int ret; - return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, - buffer, bufsize); + ret = __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize); + if (ret > 0) + return ret; + + if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added) + return __cmdline_find_option(builtin_cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize); + + return ret; } From 5830aa863981d43560748aa93589c0695191d95d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 25 Jul 2024 12:28:20 -0700 Subject: [PATCH 0394/1052] netfilter: iptables: Fix null-ptr-deref in iptable_nat_table_init(). We had a report that iptables-restore sometimes triggered null-ptr-deref at boot time. [0] The problem is that iptable_nat_table_init() is exposed to user space before the kernel fully initialises netns. In the small race window, a user could call iptable_nat_table_init() that accesses net_generic(net, iptable_nat_net_id), which is available only after registering iptable_nat_net_ops. Let's call register_pernet_subsys() before xt_register_template(). [0]: bpfilter: Loaded bpfilter_umh pid 11702 Started bpfilter BUG: kernel NULL pointer dereference, address: 0000000000000013 PF: supervisor write access in kernel mode PF: error_code(0x0002) - not-present page PGD 0 P4D 0 PREEMPT SMP NOPTI CPU: 2 PID: 11879 Comm: iptables-restor Not tainted 6.1.92-99.174.amzn2023.x86_64 #1 Hardware name: Amazon EC2 c6i.4xlarge/, BIOS 1.0 10/16/2017 RIP: 0010:iptable_nat_table_init (net/ipv4/netfilter/iptable_nat.c:87 net/ipv4/netfilter/iptable_nat.c:121) iptable_nat Code: 10 4c 89 f6 48 89 ef e8 0b 19 bb ff 41 89 c4 85 c0 75 38 41 83 c7 01 49 83 c6 28 41 83 ff 04 75 dc 48 8b 44 24 08 48 8b 0c 24 <48> 89 08 4c 89 ef e8 a2 3b a2 cf 48 83 c4 10 44 89 e0 5b 5d 41 5c RSP: 0018:ffffbef902843cd0 EFLAGS: 00010246 RAX: 0000000000000013 RBX: ffff9f4b052caa20 RCX: ffff9f4b20988d80 RDX: 0000000000000000 RSI: 0000000000000064 RDI: ffffffffc04201c0 RBP: ffff9f4b29394000 R08: ffff9f4b07f77258 R09: ffff9f4b07f77240 R10: 0000000000000000 R11: ffff9f4b09635388 R12: 0000000000000000 R13: ffff9f4b1a3c6c00 R14: ffff9f4b20988e20 R15: 0000000000000004 FS: 00007f6284340000(0000) GS:ffff9f51fe280000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000013 CR3: 00000001d10a6005 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? show_trace_log_lvl (arch/x86/kernel/dumpstack.c:259) ? show_trace_log_lvl (arch/x86/kernel/dumpstack.c:259) ? xt_find_table_lock (net/netfilter/x_tables.c:1259) ? __die_body.cold (arch/x86/kernel/dumpstack.c:478 arch/x86/kernel/dumpstack.c:420) ? page_fault_oops (arch/x86/mm/fault.c:727) ? exc_page_fault (./arch/x86/include/asm/irqflags.h:40 ./arch/x86/include/asm/irqflags.h:75 arch/x86/mm/fault.c:1470 arch/x86/mm/fault.c:1518) ? asm_exc_page_fault (./arch/x86/include/asm/idtentry.h:570) ? 
iptable_nat_table_init (net/ipv4/netfilter/iptable_nat.c:87 net/ipv4/netfilter/iptable_nat.c:121) iptable_nat xt_find_table_lock (net/netfilter/x_tables.c:1259) xt_request_find_table_lock (net/netfilter/x_tables.c:1287) get_info (net/ipv4/netfilter/ip_tables.c:965) ? security_capable (security/security.c:809 (discriminator 13)) ? ns_capable (kernel/capability.c:376 kernel/capability.c:397) ? do_ipt_get_ctl (net/ipv4/netfilter/ip_tables.c:1656) ? bpfilter_send_req (net/bpfilter/bpfilter_kern.c:52) bpfilter nf_getsockopt (net/netfilter/nf_sockopt.c:116) ip_getsockopt (net/ipv4/ip_sockglue.c:1827) __sys_getsockopt (net/socket.c:2327) __x64_sys_getsockopt (net/socket.c:2342 net/socket.c:2339 net/socket.c:2339) do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:81) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:121) RIP: 0033:0x7f62844685ee Code: 48 8b 0d 45 28 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 37 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 09 RSP: 002b:00007ffd1f83d638 EFLAGS: 00000246 ORIG_RAX: 0000000000000037 RAX: ffffffffffffffda RBX: 00007ffd1f83d680 RCX: 00007f62844685ee RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 0000000000000004 R08: 00007ffd1f83d670 R09: 0000558798ffa2a0 R10: 00007ffd1f83d680 R11: 0000000000000246 R12: 00007ffd1f83e3b2 R13: 00007f628455baa0 R14: 00007ffd1f83d7b0 R15: 00007f628457a008 Modules linked in: iptable_nat(+) bpfilter rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache veth xt_state xt_connmark xt_nat xt_statistic xt_MASQUERADE xt_mark xt_addrtype ipt_REJECT nf_reject_ipv4 nft_chain_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 xt_comment nft_compat nf_tables nfnetlink overlay nls_ascii nls_cp437 vfat fat ghash_clmulni_intel aesni_intel ena crypto_simd ptp cryptd i8042 pps_core serio button sunrpc sch_fq_codel configfs loop dm_mod fuse dax dmi_sysfs crc32_pclmul crc32c_intel efivarfs CR2: 0000000000000013 Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: Takahiro Kawahara Signed-off-by: Kuniyuki Iwashima Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/iptable_nat.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 4d42d0756fd7..a5db7c67d61b 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -145,25 +145,27 @@ static struct pernet_operations iptable_nat_net_ops = { static int __init iptable_nat_init(void) { - int ret = xt_register_template(&nf_nat_ipv4_table, - iptable_nat_table_init); + int ret; + /* net->gen->ptr[iptable_nat_net_id] must be allocated + * before calling iptable_nat_table_init(). 
+ */ + ret = register_pernet_subsys(&iptable_nat_net_ops); if (ret < 0) return ret; - ret = register_pernet_subsys(&iptable_nat_net_ops); - if (ret < 0) { - xt_unregister_template(&nf_nat_ipv4_table); - return ret; - } + ret = xt_register_template(&nf_nat_ipv4_table, + iptable_nat_table_init); + if (ret < 0) + unregister_pernet_subsys(&iptable_nat_net_ops); return ret; } static void __exit iptable_nat_exit(void) { - unregister_pernet_subsys(&iptable_nat_net_ops); xt_unregister_template(&nf_nat_ipv4_table); + unregister_pernet_subsys(&iptable_nat_net_ops); } module_init(iptable_nat_init); From c22921df777de5606f1047b1345b8d22ef1c0b34 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 25 Jul 2024 12:28:21 -0700 Subject: [PATCH 0395/1052] netfilter: iptables: Fix potential null-ptr-deref in ip6table_nat_table_init(). ip6table_nat_table_init() accesses net->gen->ptr[ip6table_nat_net_ops.id], but the function is exposed to user space before the entry is allocated via register_pernet_subsys(). Let's call register_pernet_subsys() before xt_register_template(). Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6table_nat.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 52cf104e3478..e119d4f090cc 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -147,23 +147,27 @@ static struct pernet_operations ip6table_nat_net_ops = { static int __init ip6table_nat_init(void) { - int ret = xt_register_template(&nf_nat_ipv6_table, - ip6table_nat_table_init); + int ret; + /* net->gen->ptr[ip6table_nat_net_id] must be allocated + * before calling ip6t_nat_register_lookups(). + */ + ret = register_pernet_subsys(&ip6table_nat_net_ops); if (ret < 0) return ret; - ret = register_pernet_subsys(&ip6table_nat_net_ops); + ret = xt_register_template(&nf_nat_ipv6_table, + ip6table_nat_table_init); if (ret) - xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); return ret; } static void __exit ip6table_nat_exit(void) { - unregister_pernet_subsys(&ip6table_nat_net_ops); xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); } module_init(ip6table_nat_init); From 39a3396558fb97e6e7d4c1eb04c2166da31904a9 Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Tue, 30 Jul 2024 23:14:40 -0700 Subject: [PATCH 0396/1052] clk: thead: fix dependency on clk_ignore_unused Add the CLK_IGNORE_UNUSED flag to the vp-axi clock (CLK_VP_AXI) to avoid depending on clk_ignore_unused in the cmdline. Without this fix, the emmc-sdio clock (CLK_EMMC_SDIO) fails to work after vp-axi is disabled. 
Signed-off-by: Drew Fustini Link: https://lore.kernel.org/r/20240731061439.3807172-1-drew@pdp7.com Fixes: ae81b69fd2b1 ("clk: thead: Add support for T-Head TH1520 AP_SUBSYS clocks") Signed-off-by: Stephen Boyd --- drivers/clk/thead/clk-th1520-ap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/thead/clk-th1520-ap.c b/drivers/clk/thead/clk-th1520-ap.c index cbc176b27c09..17e32ae08720 100644 --- a/drivers/clk/thead/clk-th1520-ap.c +++ b/drivers/clk/thead/clk-th1520-ap.c @@ -738,7 +738,7 @@ static struct ccu_div vp_axi_clk = { .hw.init = CLK_HW_INIT_PARENTS_HW("vp-axi", video_pll_clk_parent, &ccu_div_ops, - 0), + CLK_IGNORE_UNUSED), }, }; From 9c685f61722d30a22d55bb8a48f7a48bb2e19bcc Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 15 May 2024 12:55:41 +1000 Subject: [PATCH 0397/1052] nouveau: set placement to original placement on uvmm validate. When a buffer is evicted for memory pressure or TTM evict all, the placement is set to the eviction domain, this means the buffer never gets revalidated on the next exec to the correct domain. I think this should be fine to use the initial domain from the object creation, as least with VM_BIND this won't change after init so this should be the correct answer. Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI") Cc: Danilo Krummrich Cc: # v6.6 Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240515025542.2156774-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nouveau_uvmm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c index 9402fa320a7e..48f105239f42 100644 --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c @@ -1803,6 +1803,7 @@ nouveau_uvmm_bo_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec) { struct nouveau_bo *nvbo = nouveau_gem_object(vm_bo->obj); + nouveau_bo_placement_set(nvbo, nvbo->valid_domains, 0); return nouveau_bo_validate(nvbo, true, false); } From d516b187a9cc2e842030dd005be2735db3e8f395 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 30 Jul 2024 21:51:52 +0200 Subject: [PATCH 0398/1052] r8169: don't increment tx_dropped in case of NETDEV_TX_BUSY The skb isn't consumed in case of NETDEV_TX_BUSY, therefore don't increment the tx_dropped counter. Fixes: 188f4af04618 ("r8169: use NETDEV_TX_{BUSY/OK}") Cc: stable@vger.kernel.org Suggested-by: Jakub Kicinski Signed-off-by: Heiner Kallweit Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/bbba9c48-8bac-4932-9aa1-d2ed63bc9433@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 714d2e804694..3507c2e28110 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4349,7 +4349,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb, if (unlikely(!rtl_tx_slots_avail(tp))) { if (net_ratelimit()) netdev_err(dev, "BUG! 
Tx Ring full when queue awake!\n"); - goto err_stop_0; + netif_stop_queue(dev); + return NETDEV_TX_BUSY; } opts[1] = rtl8169_tx_vlan_tag(skb); @@ -4405,11 +4406,6 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb, dev_kfree_skb_any(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; - -err_stop_0: - netif_stop_queue(dev); - dev->stats.tx_dropped++; - return NETDEV_TX_BUSY; } static unsigned int rtl_last_frag_len(struct sk_buff *skb) From 8f73ef82985890e484efaed816b172fdf35c87aa Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jul 2024 09:14:03 -0700 Subject: [PATCH 0399/1052] net: Add skbuff.h to MAINTAINERS The network maintainers need to be copied if the skbuff.h is touched. This also helps git-send-email to figure out the proper maintainers when touching the file. Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240730161404.2028175-1-leitao@debian.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c0a3d9e93689..1ca8e36e49bd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15934,6 +15934,7 @@ F: include/linux/in.h F: include/linux/indirect_call_wrapper.h F: include/linux/net.h F: include/linux/netdevice.h +F: include/linux/skbuff.h F: include/net/ F: include/uapi/linux/in.h F: include/uapi/linux/net.h From 1b75da22ed1e6171e261bc9265370162553d5393 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 30 Jul 2024 09:16:30 +0300 Subject: [PATCH 0400/1052] net/mlx5: Always drain health in shutdown callback There is no point in recovery during device shutdown. if health work started need to wait for it to avoid races and NULL pointer access. Hence, drain health WQ on shutdown callback. Fixes: 1958fc2f0712 ("net/mlx5: SF, Add auxiliary device driver") Fixes: d2aa060d40fa ("net/mlx5: Cancel health poll before sending panic teardown command") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 527da58c7953..5b7e6f4b5c7e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -2142,7 +2142,6 @@ static int mlx5_try_fast_unload(struct mlx5_core_dev *dev) /* Panic tear down fw command will stop the PCI bus communication * with the HCA, so the health poll is no longer needed. 
*/ - mlx5_drain_health_wq(dev); mlx5_stop_health_poll(dev, false); ret = mlx5_cmd_fast_teardown_hca(dev); @@ -2177,6 +2176,7 @@ static void shutdown(struct pci_dev *pdev) mlx5_core_info(dev, "Shutdown was called\n"); set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); + mlx5_drain_health_wq(dev); err = mlx5_try_fast_unload(dev); if (err) mlx5_unload_one(dev, false); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index b2986175d9af..b706f1486504 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -112,6 +112,7 @@ static void mlx5_sf_dev_shutdown(struct auxiliary_device *adev) struct mlx5_core_dev *mdev = sf_dev->mdev; set_bit(MLX5_BREAK_FW_WAIT, &mdev->intf_state); + mlx5_drain_health_wq(mdev); mlx5_unload_one(mdev, false); } From a4557b0b57c40871ff00da4f623cf79211e052f3 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 30 Jul 2024 09:16:31 +0300 Subject: [PATCH 0401/1052] net/mlx5: Fix error handling in irq_pool_request_irq In case mlx5_irq_alloc fails, the previously allocated index remains in the XArray, which could lead to inconsistencies. Fix it by adding error handling that erases the allocated index from the XArray if mlx5_irq_alloc returns an error. Fixes: c36326d38d93 ("net/mlx5: Round-Robin EQs over IRQs") Signed-off-by: Shay Drory Reviewed-by: Maher Sanalla Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c index f7b01b3f0cba..1477db7f5307 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c @@ -48,6 +48,7 @@ static struct mlx5_irq * irq_pool_request_irq(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_desc) { struct irq_affinity_desc auto_desc = {}; + struct mlx5_irq *irq; u32 irq_index; int err; @@ -64,9 +65,12 @@ irq_pool_request_irq(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_de else cpu_get(pool, cpumask_first(&af_desc->mask)); } - return mlx5_irq_alloc(pool, irq_index, - cpumask_empty(&auto_desc.mask) ? af_desc : &auto_desc, - NULL); + irq = mlx5_irq_alloc(pool, irq_index, + cpumask_empty(&auto_desc.mask) ? af_desc : &auto_desc, + NULL); + if (IS_ERR(irq)) + xa_erase(&pool->irqs, irq_index); + return irq; } /* Looking for the IRQ with the smallest refcount that fits req_mask. From 94a3ad6c081381fa9ee523781789802b4ed00faf Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 30 Jul 2024 09:16:32 +0300 Subject: [PATCH 0402/1052] net/mlx5: DR, Fix 'stack guard page was hit' error in dr_rule This patch reduces the size of hw_ste_arr_optimized array that is allocated on stack from 640 bytes (5 match STEs + 5 action STES) to 448 bytes (2 match STEs + 5 action STES). This fixes the 'stack guard page was hit' issue, while still fitting majority of the usecases (up to 2 match STEs). 
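The byte counts quoted above follow directly from the chain length; a quick stand-alone check of the arithmetic is below (the 64-byte STE size is inferred from 640 / 10 for the sake of the example, it is not quoted from the driver):

  #include <stdio.h>

  #define DR_ACTION_MAX_STES 5
  #define DR_STE_SIZE        64          /* inferred from 640 / (5 + 5) */

  static int chain_bytes(int max_match_stes)
  {
          /* the on-stack array holds match STEs plus action STEs */
          return (max_match_stes + DR_ACTION_MAX_STES) * DR_STE_SIZE;
  }

  int main(void)
  {
          printf("5 match STEs: %d bytes\n", chain_bytes(5));  /* 640 */
          printf("2 match STEs: %d bytes\n", chain_bytes(2));  /* 448 */
          return 0;
  }

Chains needing more than two match STEs fall outside the optimized on-stack case, which is the trade-off accepted here for the less common rules.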
Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c index 042ca0349124..d1db04baa1fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -7,7 +7,7 @@ /* don't try to optimize STE allocation if the stack is too constaraining */ #define DR_RULE_MAX_STES_OPTIMIZED 0 #else -#define DR_RULE_MAX_STES_OPTIMIZED 5 +#define DR_RULE_MAX_STES_OPTIMIZED 2 #endif #define DR_RULE_MAX_STE_CHAIN_OPTIMIZED (DR_RULE_MAX_STES_OPTIMIZED + DR_ACTION_MAX_STES) From 3fda84dc090390573cfbd0b1d70372663315de21 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 30 Jul 2024 09:16:33 +0300 Subject: [PATCH 0403/1052] net/mlx5: Lag, don't use the hardcoded value of the first port The cited commit didn't change the body of the loop as it should. It shouldn't be using MLX5_LAG_P1. Fixes: 7e978e7714d6 ("net/mlx5: Lag, use actual number of lag ports") Signed-off-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index d0871c46b8c5..cf8045b92689 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1538,7 +1538,7 @@ u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, goto unlock; for (i = 0; i < ldev->ports; i++) { - if (ldev->pf[MLX5_LAG_P1].netdev == slave) { + if (ldev->pf[i].netdev == slave) { port = i; break; } From 572f9caa9e7295f8c8822e4122c7ae8f1c412ff9 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 30 Jul 2024 09:16:34 +0300 Subject: [PATCH 0404/1052] net/mlx5: Fix missing lock on sync reset reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On sync reset reload work, when remote host updates devlink on reload actions performed on that host, it misses taking devlink lock before calling devlink_remote_reload_actions_performed() which results in triggering lock assert like the following: WARNING: CPU: 4 PID: 1164 at net/devlink/core.c:261 devl_assert_locked+0x3e/0x50 … CPU: 4 PID: 1164 Comm: kworker/u96:6 Tainted: G S W 6.10.0-rc2+ #116 Hardware name: Supermicro SYS-2028TP-DECTR/X10DRT-PT, BIOS 2.0 12/18/2015 Workqueue: mlx5_fw_reset_events mlx5_sync_reset_reload_work [mlx5_core] RIP: 0010:devl_assert_locked+0x3e/0x50 … Call Trace: ? __warn+0xa4/0x210 ? devl_assert_locked+0x3e/0x50 ? report_bug+0x160/0x280 ? handle_bug+0x3f/0x80 ? exc_invalid_op+0x17/0x40 ? asm_exc_invalid_op+0x1a/0x20 ? devl_assert_locked+0x3e/0x50 devlink_notify+0x88/0x2b0 ? mlx5_attach_device+0x20c/0x230 [mlx5_core] ? __pfx_devlink_notify+0x10/0x10 ? 
process_one_work+0x4b6/0xbb0 process_one_work+0x4b6/0xbb0 […] Fixes: 84a433a40d0e ("net/mlx5: Lock mlx5 devlink reload callbacks") Signed-off-by: Moshe Shemesh Reviewed-by: Maor Gottlieb Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 979c49ae6b5c..b43ca0b762c3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -207,6 +207,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev) static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unloaded) { struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + struct devlink *devlink = priv_to_devlink(dev); /* if this is the driver that initiated the fw reset, devlink completed the reload */ if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) { @@ -218,9 +219,11 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unload mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); else mlx5_load_one(dev, true); - devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0, + devl_lock(devlink); + devlink_remote_reload_actions_performed(devlink, 0, BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE)); + devl_unlock(devlink); } } From 06827e27fdcd197557be72b2229dbd362303794f Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 30 Jul 2024 09:16:35 +0300 Subject: [PATCH 0405/1052] net/mlx5e: Require mlx5 tc classifier action support for IPsec prio capability Require mlx5 classifier action support when creating IPSec chains in offload path. MLX5_IPSEC_CAP_PRIO should only be set if CONFIG_MLX5_CLS_ACT is enabled. If CONFIG_MLX5_CLS_ACT=n and MLX5_IPSEC_CAP_PRIO is set, configuring IPsec offload will fail due to the mlxx5 ipsec chain rules failing to be created due to lack of classifier action support. 
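The shape of the fix is the usual "do not advertise what was compiled out" pattern. A minimal kernel-style fragment is sketched below; the cap bit and the function name are made up, only IS_ENABLED() and the idea are taken from the patch:

  #include <linux/bits.h>
  #include <linux/kconfig.h>
  #include <linux/types.h>

  #define MY_CAP_PRIO     BIT(3)

  static u32 my_device_caps(bool fw_supports_prio)
  {
          u32 caps = 0;

          /* IS_ENABLED() folds to 1 only for CONFIG_...=y or =m, so the
           * capability can never be reported when the classifier-action
           * code that implements it was not built. */
          if (IS_ENABLED(CONFIG_MLX5_CLS_ACT) && fw_supports_prio)
                  caps |= MY_CAP_PRIO;

          return caps;
  }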
Fixes: fa5aa2f89073 ("net/mlx5e: Use chains for IPsec policy priority offload") Signed-off-by: Rahul Rameshbabu Reviewed-by: Leon Romanovsky Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c index 6e00afe4671b..797db853de36 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c @@ -51,9 +51,10 @@ u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev) MLX5_CAP_FLOWTABLE_NIC_RX(mdev, decap)) caps |= MLX5_IPSEC_CAP_PACKET_OFFLOAD; - if ((MLX5_CAP_FLOWTABLE_NIC_TX(mdev, ignore_flow_level) && - MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ignore_flow_level)) || - MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, ignore_flow_level)) + if (IS_ENABLED(CONFIG_MLX5_CLS_ACT) && + ((MLX5_CAP_FLOWTABLE_NIC_TX(mdev, ignore_flow_level) && + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ignore_flow_level)) || + MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, ignore_flow_level))) caps |= MLX5_IPSEC_CAP_PRIO; if (MLX5_CAP_FLOWTABLE_NIC_TX(mdev, From 025f2b85a5e5a46df14ecf162c3c80a957a36d0b Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 30 Jul 2024 09:16:36 +0300 Subject: [PATCH 0406/1052] net/mlx5e: Fix CT entry update leaks of modify header context The cited commit allocates a new modify header to replace the old one when updating CT entry. But if failed to allocate a new one, eg. exceed the max number firmware can support, modify header will be an error pointer that will trigger a panic when deallocating it. And the old modify header point is copied to old attr. When the old attr is freed, the old modify header is lost. Fix it by restoring the old attr to attr when failed to allocate a new modify header context. So when the CT entry is freed, the right modify header context will be freed. And the panic of accessing error pointer is also fixed. Fixes: 94ceffb48eac ("net/mlx5e: Implement CT entry update") Signed-off-by: Chris Mi Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-8-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 8cf8ba2622f2..71a168746ebe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -932,6 +932,7 @@ mlx5_tc_ct_entry_replace_rule(struct mlx5_tc_ct_priv *ct_priv, mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, zone_rule->attr, mh); mlx5_put_label_mapping(ct_priv, attr->ct_attr.ct_labels_id); err_mod_hdr: + *attr = *old_attr; kfree(old_attr); err_attr: kvfree(spec); From 3f8e82a020a5c22f9b791f4ac499b8e18007fbda Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Tue, 30 Jul 2024 09:16:37 +0300 Subject: [PATCH 0407/1052] net/mlx5e: Add a check for the return value from mlx5_port_set_eth_ptys Since the documentation for mlx5_toggle_port_link states that it should only be used after setting the port register, we add a check for the return value from mlx5_port_set_eth_ptys to ensure the register was successfully set before calling it. 
Fixes: 667daedaecd1 ("net/mlx5e: Toggle link only after modifying port parameters") Signed-off-by: Shahar Shitrit Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-9-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 00d5661dc62e..36845872ae94 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1409,7 +1409,12 @@ static int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv, if (!an_changes && link_modes == eproto.admin) goto out; - mlx5_port_set_eth_ptys(mdev, an_disable, link_modes, ext); + err = mlx5_port_set_eth_ptys(mdev, an_disable, link_modes, ext); + if (err) { + netdev_err(priv->netdev, "%s: failed to set ptys reg: %d\n", __func__, err); + goto out; + } + mlx5_toggle_port_link(mdev); out: From c4d6a347ba7babdf9d90a0eb24048c266cae0532 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Tue, 30 Jul 2024 08:31:04 +0200 Subject: [PATCH 0408/1052] net: wan: fsl_qmc_hdlc: Convert carrier_lock spinlock to a mutex The carrier_lock spinlock protects the carrier detection. While it is held, framer_get_status() is called which in turn takes a mutex. This is not correct and can lead to a deadlock. A run with PROVE_LOCKING enabled detected the issue: [ BUG: Invalid wait context ] ... c204ddbc (&framer->mutex){+.+.}-{3:3}, at: framer_get_status+0x40/0x78 other info that might help us debug this: context-{4:4} 2 locks held by ifconfig/146: #0: c0926a38 (rtnl_mutex){+.+.}-{3:3}, at: devinet_ioctl+0x12c/0x664 #1: c2006a40 (&qmc_hdlc->carrier_lock){....}-{2:2}, at: qmc_hdlc_framer_set_carrier+0x30/0x98 Avoid the spinlock usage and convert carrier_lock to a mutex. 
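The underlying rule is that a sleeping lock must not be taken while a spinlock is held: the spinlock keeps the CPU in atomic context, and framer_get_status() ends up in mutex_lock(), which may sleep. A condensed sketch of the before/after locking shape (driver details stripped, the lock names below are illustrative):

  #include <linux/cleanup.h>
  #include <linux/mutex.h>
  #include <linux/spinlock.h>

  static DEFINE_SPINLOCK(carrier_spinlock);
  static DEFINE_MUTEX(carrier_mutex);
  static DEFINE_MUTEX(framer_mutex);      /* stands in for framer->mutex */

  static void carrier_update_broken(void)
  {
          /* atomic context from here on ... */
          guard(spinlock_irqsave)(&carrier_spinlock);
          /* ... so this nested sleeping lock is an invalid wait context */
          guard(mutex)(&framer_mutex);
  }

  static void carrier_update_fixed(void)
  {
          /* both locks may sleep, so nesting them is fine */
          guard(mutex)(&carrier_mutex);
          guard(mutex)(&framer_mutex);
  }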
Fixes: 54762918ca85 ("net: wan: fsl_qmc_hdlc: Add framer support") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240730063104.179553-1-herve.codina@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_qmc_hdlc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c index c5e7ca793c43..64b4bfa6fea7 100644 --- a/drivers/net/wan/fsl_qmc_hdlc.c +++ b/drivers/net/wan/fsl_qmc_hdlc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,7 @@ struct qmc_hdlc { struct qmc_chan *qmc_chan; struct net_device *netdev; struct framer *framer; - spinlock_t carrier_lock; /* Protect carrier detection */ + struct mutex carrier_lock; /* Protect carrier detection */ struct notifier_block nb; bool is_crc32; spinlock_t tx_lock; /* Protect tx descriptors */ @@ -60,7 +61,7 @@ static int qmc_hdlc_framer_set_carrier(struct qmc_hdlc *qmc_hdlc) if (!qmc_hdlc->framer) return 0; - guard(spinlock_irqsave)(&qmc_hdlc->carrier_lock); + guard(mutex)(&qmc_hdlc->carrier_lock); ret = framer_get_status(qmc_hdlc->framer, &framer_status); if (ret) { @@ -706,7 +707,7 @@ static int qmc_hdlc_probe(struct platform_device *pdev) qmc_hdlc->dev = dev; spin_lock_init(&qmc_hdlc->tx_lock); - spin_lock_init(&qmc_hdlc->carrier_lock); + mutex_init(&qmc_hdlc->carrier_lock); qmc_hdlc->qmc_chan = devm_qmc_chan_get_bychild(dev, dev->of_node); if (IS_ERR(qmc_hdlc->qmc_chan)) From e549360069b4a57e111b8222fc072f3c7c1688ab Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Tue, 30 Jul 2024 08:31:33 +0200 Subject: [PATCH 0409/1052] net: wan: fsl_qmc_hdlc: Discard received CRC Received frame from QMC contains the CRC. Upper layers don't need this CRC and tcpdump mentioned trailing junk data due to this CRC presence. As some other HDLC driver, simply discard this CRC. Fixes: d0f2258e79fd ("net: wan: Add support for QMC HDLC") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240730063133.179598-1-herve.codina@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_qmc_hdlc.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c index 64b4bfa6fea7..8fcfbde31a1c 100644 --- a/drivers/net/wan/fsl_qmc_hdlc.c +++ b/drivers/net/wan/fsl_qmc_hdlc.c @@ -250,6 +250,7 @@ static void qmc_hcld_recv_complete(void *context, size_t length, unsigned int fl struct qmc_hdlc_desc *desc = context; struct net_device *netdev; struct qmc_hdlc *qmc_hdlc; + size_t crc_size; int ret; netdev = desc->netdev; @@ -268,15 +269,26 @@ static void qmc_hcld_recv_complete(void *context, size_t length, unsigned int fl if (flags & QMC_RX_FLAG_HDLC_CRC) /* CRC error */ netdev->stats.rx_crc_errors++; kfree_skb(desc->skb); - } else { - netdev->stats.rx_packets++; - netdev->stats.rx_bytes += length; - - skb_put(desc->skb, length); - desc->skb->protocol = hdlc_type_trans(desc->skb, netdev); - netif_rx(desc->skb); + goto re_queue; } + /* Discard the CRC */ + crc_size = qmc_hdlc->is_crc32 ? 
4 : 2; + if (length < crc_size) { + netdev->stats.rx_length_errors++; + kfree_skb(desc->skb); + goto re_queue; + } + length -= crc_size; + + netdev->stats.rx_packets++; + netdev->stats.rx_bytes += length; + + skb_put(desc->skb, length); + desc->skb->protocol = hdlc_type_trans(desc->skb, netdev); + netif_rx(desc->skb); + +re_queue: /* Re-queue a transfer using the same descriptor */ ret = qmc_hdlc_recv_queue(qmc_hdlc, desc, desc->dma_size); if (ret) { From 4efce726e0cbc723178eea5b944e13775f628ecc Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 30 Jul 2024 12:40:16 +0200 Subject: [PATCH 0410/1052] net: MAINTAINERS: Demote Qualcomm IPA to "maintained" To the best of my knowledge, Alex Elder is not being paid to support Qualcomm IPA networking drivers, so drop the status from "supported" to "maintained". Signed-off-by: Krzysztof Kozlowski Acked-by: Alex Elder Link: https://patch.msgid.link/20240730104016.22103-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1ca8e36e49bd..11b325268e2e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18555,7 +18555,7 @@ F: drivers/usb/misc/qcom_eud.c QCOM IPA DRIVER M: Alex Elder L: netdev@vger.kernel.org -S: Supported +S: Maintained F: drivers/net/ipa/ QEMU MACHINE EMULATOR AND VIRTUALIZER SUPPORT From b9e7fc0aeda79031a101610b2fcb12bf031056e9 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Tue, 30 Jul 2024 10:33:02 -0700 Subject: [PATCH 0411/1052] igc: Fix double reset adapter triggered from a single taprio cmd Following the implementation of "igc: Add TransmissionOverrun counter" patch, when a taprio command is triggered by user, igc processes two commands: TAPRIO_CMD_REPLACE followed by TAPRIO_CMD_STATS. However, both commands unconditionally pass through igc_tsn_offload_apply() which evaluates and triggers reset adapter. The double reset causes issues in the calculation of adapter->qbv_count in igc. TAPRIO_CMD_REPLACE command is expected to reset the adapter since it activates qbv. It's unexpected for TAPRIO_CMD_STATS to do the same because it doesn't configure any driver-specific TSN settings. So, the evaluation in igc_tsn_offload_apply() isn't needed for TAPRIO_CMD_STATS. To address this, commands parsing are relocated to igc_tsn_enable_qbv_scheduling(). Commands that don't require an adapter reset will exit after processing, thus avoiding igc_tsn_offload_apply(). 
Fixes: d3750076d464 ("igc: Add TransmissionOverrun counter") Signed-off-by: Faizal Rahim Acked-by: Vinicius Costa Gomes Reviewed-by: Vladimir Oltean Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20240730173304.865479-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_main.c | 33 ++++++++++++----------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index cb5c7b09e8a0..8daf938afc36 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6306,21 +6306,6 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, size_t n; int i; - switch (qopt->cmd) { - case TAPRIO_CMD_REPLACE: - break; - case TAPRIO_CMD_DESTROY: - return igc_tsn_clear_schedule(adapter); - case TAPRIO_CMD_STATS: - igc_taprio_stats(adapter->netdev, &qopt->stats); - return 0; - case TAPRIO_CMD_QUEUE_STATS: - igc_taprio_queue_stats(adapter->netdev, &qopt->queue_stats); - return 0; - default: - return -EOPNOTSUPP; - } - if (qopt->base_time < 0) return -ERANGE; @@ -6429,7 +6414,23 @@ static int igc_tsn_enable_qbv_scheduling(struct igc_adapter *adapter, if (hw->mac.type != igc_i225) return -EOPNOTSUPP; - err = igc_save_qbv_schedule(adapter, qopt); + switch (qopt->cmd) { + case TAPRIO_CMD_REPLACE: + err = igc_save_qbv_schedule(adapter, qopt); + break; + case TAPRIO_CMD_DESTROY: + err = igc_tsn_clear_schedule(adapter); + break; + case TAPRIO_CMD_STATS: + igc_taprio_stats(adapter->netdev, &qopt->stats); + return 0; + case TAPRIO_CMD_QUEUE_STATS: + igc_taprio_queue_stats(adapter->netdev, &qopt->queue_stats); + return 0; + default: + return -EOPNOTSUPP; + } + if (err) return err; From a46c68debf3be3a477a69ccbf0a1d050df841676 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Mon, 29 Jul 2024 17:17:48 -0700 Subject: [PATCH 0412/1052] ipv6: fix ndisc_is_useropt() handling for PIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current logic only works if the PIO is between two other ND user options. This fixes it so that the PIO can also be either before or after other ND user options (for example the first or last option in the RA). side note: there's actually Android tests verifying a portion of the old broken behaviour, so: https://android-review.googlesource.com/c/kernel/tests/+/3196704 fixes those up. 
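The "only works if the PIO is between two other ND user options" part is easiest to see from how the parser records user options: it only keeps a first/last pair, and later processing walks everything in that span. A stand-alone model of that bookkeeping is below (heavily simplified, option payloads ignored):

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  struct opt { const char *type; };

  static bool is_useropt(const struct opt *o)
  {
          /* PIO plus the classic user options, as in ndisc_is_useropt() */
          return !strcmp(o->type, "PIO") || !strcmp(o->type, "RDNSS") ||
                 !strcmp(o->type, "DNSSL");
  }

  /* old behaviour: the first/last pair was only updated from the switch's
   * default: branch, which a PIO never reaches because it has its own case */
  static void span(const struct opt *opts, int n, bool pio_updates_span,
                   int *first, int *last)
  {
          *first = *last = -1;
          for (int i = 0; i < n; i++) {
                  if (!is_useropt(&opts[i]))
                          continue;
                  if (!pio_updates_span && !strcmp(opts[i].type, "PIO"))
                          continue;
                  if (*first < 0)
                          *first = i;
                  *last = i;
          }
  }

  int main(void)
  {
          /* PIO is the last option in the RA, so the old span stops before it */
          struct opt ra[] = { {"RDNSS"}, {"MTU"}, {"DNSSL"}, {"PIO"} };
          int f, l;

          span(ra, 4, false, &f, &l);
          printf("old span: options %d..%d (PIO at index 3 is outside)\n", f, l);
          span(ra, 4, true, &f, &l);
          printf("new span: options %d..%d (PIO included)\n", f, l);
          return 0;
  }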
Cc: Jen Linkova Cc: Lorenzo Colitti Cc: Patrick Rohr Cc: David Ahern Cc: YOSHIFUJI Hideaki / 吉藤英明 Cc: Jakub Kicinski Signed-off-by: Maciej Żenczykowski Fixes: 048c796beb6e ("ipv6: adjust ndisc_is_useropt() to also return true for PIO") Link: https://patch.msgid.link/20240730001748.147636-1-maze@google.com Signed-off-by: Paolo Abeni --- net/ipv6/ndisc.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 70a0b2ad6bd7..b8eec1b6cc2c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -227,6 +227,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { + bool unknown = false; int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; @@ -262,22 +263,23 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, break; #endif default: - if (ndisc_is_useropt(dev, nd_opt)) { - ndopts->nd_useropts_end = nd_opt; - if (!ndopts->nd_useropts) - ndopts->nd_useropts = nd_opt; - } else { - /* - * Unknown options must be silently ignored, - * to accommodate future extension to the - * protocol. - */ - ND_PRINTK(2, notice, - "%s: ignored unsupported option; type=%d, len=%d\n", - __func__, - nd_opt->nd_opt_type, - nd_opt->nd_opt_len); - } + unknown = true; + } + if (ndisc_is_useropt(dev, nd_opt)) { + ndopts->nd_useropts_end = nd_opt; + if (!ndopts->nd_useropts) + ndopts->nd_useropts = nd_opt; + } else if (unknown) { + /* + * Unknown options must be silently ignored, + * to accommodate future extension to the + * protocol. + */ + ND_PRINTK(2, notice, + "%s: ignored unsupported option; type=%d, len=%d\n", + __func__, + nd_opt->nd_opt_type, + nd_opt->nd_opt_len); } next_opt: opt_len -= l; From 0a567c2a10033bf04ed618368d179bce6977984b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 Jul 2024 12:10:14 +0200 Subject: [PATCH 0413/1052] mptcp: fix bad RCVPRUNED mib accounting Since its introduction, the mentioned MIB accounted for the wrong event: wake-up being skipped as not-needed on some edge condition instead of incoming skb being dropped after landing in the (subflow) receive queue. Move the increment in the correct location. Fixes: ce599c516386 ("mptcp: properly account bulk freed memory") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/protocol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a2fc54ed68c0..0d536b183a6c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -350,8 +350,10 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_orphan(skb); /* try to fetch required memory from subflow */ - if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) + if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; + } has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; @@ -844,10 +846,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) sk_rbuf = ssk_rbuf; /* over limit? 
can't append more skbs to msk, Also, no need to wake-up*/ - if (__mptcp_rmem(sk) > sk_rbuf) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); + if (__mptcp_rmem(sk) > sk_rbuf) return; - } /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); From 68cc924729ffcfe90d0383177192030a9aeb2ee4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 Jul 2024 12:10:15 +0200 Subject: [PATCH 0414/1052] mptcp: fix duplicate data handling When a subflow receives and discards duplicate data, the mptcp stack assumes that the consumed offset inside the current skb is zero. With multiple subflows receiving data simultaneously such assertion does not held true. As a result the subflow-level copied_seq will be incorrectly increased and later on the same subflow will observe a bad mapping, leading to subflow reset. Address the issue taking into account the skb consumed offset in mptcp_subflow_discard_data(). Fixes: 04e4cd4f7ca4 ("mptcp: cleanup mptcp_subflow_discard_data()") Cc: stable@vger.kernel.org Link: https://github.com/multipath-tcp/mptcp_net-next/issues/501 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/subflow.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 0e4b5bfbeaa1..a21c712350c3 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1230,14 +1230,22 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - u32 incr; + struct tcp_sock *tp = tcp_sk(ssk); + u32 offset, incr, avail_len; - incr = limit >= skb->len ? skb->len + fin : limit; + offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; + if (WARN_ON_ONCE(offset > skb->len)) + goto out; - pr_debug("discarding=%d len=%d seq=%d", incr, skb->len, - subflow->map_subflow_seq); + avail_len = skb->len - offset; + incr = limit >= avail_len ? 
avail_len + fin : limit; + + pr_debug("discarding=%d len=%d offset=%d seq=%d", incr, skb->len, + offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; + +out: if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) From a6e9c391d45b5865b61e569146304cff72821a5d Mon Sep 17 00:00:00 2001 From: Camila Alvarez Date: Tue, 30 Jul 2024 19:42:43 -0400 Subject: [PATCH 0415/1052] HID: cougar: fix slab-out-of-bounds Read in cougar_report_fixup report_fixup for the Cougar 500k Gaming Keyboard was not verifying that the report descriptor size was correct before accessing it Reported-by: syzbot+24c0361074799d02c452@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=24c0361074799d02c452 Signed-off-by: Camila Alvarez Reviewed-by: Silvan Jegen Signed-off-by: Jiri Kosina --- drivers/hid/hid-cougar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-cougar.c b/drivers/hid/hid-cougar.c index cb8bd8aae15b..0fa785f52707 100644 --- a/drivers/hid/hid-cougar.c +++ b/drivers/hid/hid-cougar.c @@ -106,7 +106,7 @@ static void cougar_fix_g6_mapping(void) static __u8 *cougar_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { - if (rdesc[2] == 0x09 && rdesc[3] == 0x02 && + if (*rsize >= 117 && rdesc[2] == 0x09 && rdesc[3] == 0x02 && (rdesc[115] | rdesc[116] << 8) >= HID_MAX_USAGES) { hid_info(hdev, "usage count exceeds max: fixing up report descriptor\n"); From 31634d7597d8c57894b6c98eeefc9e58cf842993 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 12 Jul 2024 12:40:19 +0800 Subject: [PATCH 0416/1052] ceph: force sending a cap update msg back to MDS for revoke op If a client sends out a cap update dropping caps with the prior 'seq' just before an incoming cap revoke request, then the client may drop the revoke because it believes it's already released the requested capabilities. This causes the MDS to wait indefinitely for the client to respond to the revoke. It's therefore always a good idea to ack the cap revoke request with the bumped up 'seq'. Currently if the cap->issued equals to the newcaps the check_caps() will do nothing, we should force flush the caps. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/61782 Signed-off-by: Xiubo Li Reviewed-by: Venky Shankar Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 35 ++++++++++++++++++++++++----------- fs/ceph/super.h | 7 ++++--- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e98aa8219303..808c9c048276 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2016,6 +2016,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) * CHECK_CAPS_AUTHONLY - we should only check the auth cap * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. + * CHECK_CAPS_FLUSH_FORCE - we should flush any caps immediately, without + * further delay. 
*/ void ceph_check_caps(struct ceph_inode_info *ci, int flags) { @@ -2097,7 +2099,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags) } doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s " - "flushing %s issued %s revoking %s retain %s %s%s%s\n", + "flushing %s issued %s revoking %s retain %s %s%s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), @@ -2105,7 +2107,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags) ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", - (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : ""); + (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "", + (flags & CHECK_CAPS_FLUSH_FORCE) ? " FLUSH_FORCE" : ""); /* * If we no longer need to hold onto old our caps, and we may @@ -2180,6 +2183,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags) queue_writeback = true; } + if (flags & CHECK_CAPS_FLUSH_FORCE) { + doutc(cl, "force to flush caps\n"); + goto ack; + } + if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ @@ -3510,6 +3518,8 @@ static void handle_cap_grant(struct inode *inode, bool queue_invalidate = false; bool deleted_inode = false; bool fill_inline = false; + bool revoke_wait = false; + int flags = 0; /* * If there is at least one crypto block then we'll trust @@ -3705,16 +3715,18 @@ static void handle_cap_grant(struct inode *inode, ceph_cap_string(cap->issued), ceph_cap_string(newcaps), ceph_cap_string(revoking)); if (S_ISREG(inode->i_mode) && - (revoking & used & CEPH_CAP_FILE_BUFFER)) + (revoking & used & CEPH_CAP_FILE_BUFFER)) { writeback = true; /* initiate writeback; will delay ack */ - else if (queue_invalidate && + revoke_wait = true; + } else if (queue_invalidate && revoking == CEPH_CAP_FILE_CACHE && - (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) - ; /* do nothing yet, invalidation will be queued */ - else if (cap == ci->i_auth_cap) + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) { + revoke_wait = true; /* do nothing yet, invalidation will be queued */ + } else if (cap == ci->i_auth_cap) { check_caps = 1; /* check auth cap only */ - else + } else { check_caps = 2; /* check all caps */ + } /* If there is new caps, try to wake up the waiters */ if (~cap->issued & newcaps) wake = true; @@ -3741,8 +3753,9 @@ static void handle_cap_grant(struct inode *inode, BUG_ON(cap->issued & ~cap->implemented); /* don't let check_caps skip sending a response to MDS for revoke msgs */ - if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { + if (!revoke_wait && le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { cap->mds_wanted = 0; + flags |= CHECK_CAPS_FLUSH_FORCE; if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ else @@ -3798,9 +3811,9 @@ static void handle_cap_grant(struct inode *inode, mutex_unlock(&session->s_mutex); if (check_caps == 1) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); + ceph_check_caps(ci, flags | CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) - ceph_check_caps(ci, CHECK_CAPS_NOINVAL); + ceph_check_caps(ci, flags | CHECK_CAPS_NOINVAL); } /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b63b4cd9b5b6..6e817bf1337c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -200,9 +200,10 @@ struct ceph_cap { struct list_head caps_item; }; -#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */ -#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */ 
-#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */ +#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */ +#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */ +#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */ +#define CHECK_CAPS_FLUSH_FORCE 8 /* force flush any caps */ struct ceph_cap_flush { u64 tid; From 7354eb7f1558466e92e926802d36e69e42938ea9 Mon Sep 17 00:00:00 2001 From: Curtis Malainey Date: Wed, 31 Jul 2024 14:21:44 -0700 Subject: [PATCH 0417/1052] ASoC: SOF: Remove libraries from topology lookups Default firmware shipped in open source are not licensed for 3P libraries, therefore topologies should not reference them. If a OS wants to use 3P (that they have licensed) then they should use the appropriate topology override mechanisms. Fixes: 8a7d5d85ed2161 ("ASoC: SOF: mediatek: mt8195: Add devicetree support to select topologies") Signed-off-by: Curtis Malainey Cc: Wojciech Macek Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20240731212153.921327-1-cujomalainey@chromium.org Signed-off-by: Mark Brown --- sound/soc/sof/mediatek/mt8195/mt8195.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/mediatek/mt8195/mt8195.c b/sound/soc/sof/mediatek/mt8195/mt8195.c index 24ae1d4959be..1c6e035fd313 100644 --- a/sound/soc/sof/mediatek/mt8195/mt8195.c +++ b/sound/soc/sof/mediatek/mt8195/mt8195.c @@ -573,7 +573,7 @@ static const struct snd_sof_dsp_ops sof_mt8195_ops = { static struct snd_sof_of_mach sof_mt8195_machs[] = { { .compatible = "google,tomato", - .sof_tplg_filename = "sof-mt8195-mt6359-rt1019-rt5682-dts.tplg" + .sof_tplg_filename = "sof-mt8195-mt6359-rt1019-rt5682.tplg" }, { .compatible = "mediatek,mt8195", .sof_tplg_filename = "sof-mt8195.tplg" From 41e71dbb0e0a0fe214545fe64af031303a08524c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 Jul 2024 18:31:05 +0200 Subject: [PATCH 0418/1052] x86/mm: Fix pti_clone_pgtable() alignment assumption Guenter reported dodgy crashes on an i386-nosmp build using GCC-11 that had the form of endless traps until entry stack exhaust and then #DF from the stack guard. It turned out that pti_clone_pgtable() had alignment assumptions on the start address, notably it hard assumes start is PMD aligned. This is true on x86_64, but very much not true on i386. These assumptions can cause the end condition to malfunction, leading to a 'short' clone. Guess what happens when the user mapping has a short copy of the entry text? Use the correct increment form for addr to avoid alignment assumptions. 
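The failure mode is easiest to see with concrete numbers: when the start address is only page aligned, "addr += PMD_SIZE" can step past "end" before the last PMD has been cloned, while rounding up to the next boundary cannot. A stand-alone demo of the two increment forms follows; PMD_SIZE and the addresses are illustrative values, not taken from a real layout:

  #include <stdio.h>

  #define PMD_SIZE         0x400000UL     /* 4 MiB, illustrative */
  #define round_down(x, y) ((x) & ~((y) - 1))
  #define round_up(x, y)   ((((x) - 1) | ((y) - 1)) + 1)

  static void clone_range(const char *tag, unsigned long start,
                          unsigned long end, int fixed)
  {
          printf("%s\n", tag);
          for (unsigned long addr = start; addr < end;) {
                  /* cloning copies the whole PMD that contains addr */
                  printf("  clone PMD [%#lx, %#lx)\n",
                         round_down(addr, PMD_SIZE),
                         round_down(addr, PMD_SIZE) + PMD_SIZE);
                  addr = fixed ? round_up(addr + 1, PMD_SIZE)
                               : addr + PMD_SIZE;
          }
  }

  int main(void)
  {
          /* i386-like case: start and end are only page aligned */
          unsigned long start = 0x00601000, end = 0x00803000;

          clone_range("old: addr += PMD_SIZE", start, end, 0);
          clone_range("new: addr = round_up(addr + 1, PMD_SIZE)", start, end, 1);
          return 0;
  }

With the old form the walk stops after a single PMD even though "end" lies inside the next one, which is the short clone of the entry text described above.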
Fixes: 16a3fe634f6a ("x86/mm/pti: Clone kernel-image on PTE level for 32 bit") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240731163105.GG33588@noisy.programming.kicks-ass.net --- arch/x86/mm/pti.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 2e69abf4f852..48c503208c79 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -374,14 +374,14 @@ pti_clone_pgtable(unsigned long start, unsigned long end, */ *target_pmd = *pmd; - addr += PMD_SIZE; + addr = round_up(addr + 1, PMD_SIZE); } else if (level == PTI_CLONE_PTE) { /* Walk the page-table down to the pte level */ pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { - addr += PAGE_SIZE; + addr = round_up(addr + 1, PAGE_SIZE); continue; } @@ -401,7 +401,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end, /* Clone the PTE */ *target_pte = *pte; - addr += PAGE_SIZE; + addr = round_up(addr + 1, PAGE_SIZE); } else { BUG(); From 3db03fb4995ef85fc41e86262ead7b4852f4bcf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Aug 2024 12:42:25 +0200 Subject: [PATCH 0419/1052] x86/mm: Fix pti_clone_entry_text() for i386 While x86_64 has PMD aligned text sections, i386 does not have this luxery. Notably ALIGN_ENTRY_TEXT_END is empty and _etext has PAGE alignment. This means that text on i386 can be page granular at the tail end, which in turn means that the PTI text clones should consistently account for this. Make pti_clone_entry_text() consistent with pti_clone_kernel_text(). Fixes: 16a3fe634f6a ("x86/mm/pti: Clone kernel-image on PTE level for 32 bit") Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/mm/pti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 48c503208c79..bfdf5f45b137 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -496,7 +496,7 @@ static void pti_clone_entry_text(void) { pti_clone_pgtable((unsigned long) __entry_text_start, (unsigned long) __entry_text_end, - PTI_CLONE_PMD); + PTI_LEVEL_KERNEL_IMAGE); } /* From b07ce24df7fee54189a7bca583bcda81a5448198 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 29 Jul 2024 23:44:59 +0200 Subject: [PATCH 0420/1052] alpha: fix ioread64be()/iowrite64be() helpers Compile-testing the crypto/caam driver on alpha showed a pre-existing problem on alpha with iowrite64be() missing: ERROR: modpost: "iowrite64be" [drivers/crypto/caam/caam_jr.ko] undefined! The prototypes were added a while ago when we started using asm-generic/io.h, but the implementation was still missing. At some point the ioread64/iowrite64 helpers were added, but the big-endian versions are still missing, and the generic version (using readq/writeq) is would not work here. Change it to wrap ioread64()/iowrite64() instead. 
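As a reminder of what the big-endian accessors have to do on a little-endian machine such as alpha, they are just a 64-bit byte swap wrapped around the native accessor. A stand-alone illustration of the swap itself (plain C, not the kernel's swab64()):

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t swap64(uint64_t v)
  {
          v = ((v & 0x00ff00ff00ff00ffULL) << 8)  | ((v >> 8)  & 0x00ff00ff00ff00ffULL);
          v = ((v & 0x0000ffff0000ffffULL) << 16) | ((v >> 16) & 0x0000ffff0000ffffULL);
          return (v << 32) | (v >> 32);
  }

  int main(void)
  {
          uint64_t v = 0x1122334455667788ULL;

          printf("%016llx -> %016llx\n",
                 (unsigned long long)v, (unsigned long long)swap64(v));
          return 0;
  }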
Fixes: beba3771d9e0 ("crypto: caam: Make CRYPTO_DEV_FSL_CAAM dependent of COMPILE_TEST") Fixes: e19d4ebc536d ("alpha: add full ioread64/iowrite64 implementation") Fixes: 7e772dad9913 ("alpha: Use generic ") Closes: https://lore.kernel.org/all/CAHk-=wgEyzSxTs467NDOVfBSzWvUS6ztcwhiy=M3xog==KBmTw@mail.gmail.com/ Tested-by: Guenter Roeck Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- arch/alpha/include/asm/io.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index 2bb8cbeedf91..b191d87f89c4 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -534,8 +534,10 @@ extern inline void writeq(u64 b, volatile void __iomem *addr) #define ioread16be(p) swab16(ioread16(p)) #define ioread32be(p) swab32(ioread32(p)) +#define ioread64be(p) swab64(ioread64(p)) #define iowrite16be(v,p) iowrite16(swab16(v), (p)) #define iowrite32be(v,p) iowrite32(swab32(v), (p)) +#define iowrite64be(v,p) iowrite64(swab64(v), (p)) #define inb_p inb #define inw_p inw @@ -634,8 +636,6 @@ extern void outsl (unsigned long port, const void *src, unsigned long count); */ #define ioread64 ioread64 #define iowrite64 iowrite64 -#define ioread64be ioread64be -#define iowrite64be iowrite64be #define ioread8_rep ioread8_rep #define ioread16_rep ioread16_rep #define ioread32_rep ioread32_rep From fb197c5d2fd24b9af3d4697d0cf778645846d6d5 Mon Sep 17 00:00:00 2001 From: Daniel Maslowski Date: Fri, 19 Jul 2024 19:04:37 +0200 Subject: [PATCH 0421/1052] riscv/purgatory: align riscv_kernel_entry When alignment handling is delegated to the kernel, everything must be word-aligned in purgatory, since the trap handler is then set to the kexec one. Without the alignment, hitting the exception would ultimately crash. On other occasions, the kernel's handler would take care of exceptions. This has been tested on a JH7110 SoC with oreboot and its SBI delegating unaligned access exceptions and the kernel configured to handle them. Fixes: 736e30af583fb ("RISC-V: Add purgatory") Signed-off-by: Daniel Maslowski Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240719170437.247457-1-cyrevolt@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/purgatory/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/purgatory/entry.S b/arch/riscv/purgatory/entry.S index 5bcf3af903da..0e6ca6d5ae4b 100644 --- a/arch/riscv/purgatory/entry.S +++ b/arch/riscv/purgatory/entry.S @@ -7,6 +7,7 @@ * Author: Li Zhengyu (lizhengyu3@huawei.com) * */ +#include #include .text @@ -34,6 +35,7 @@ SYM_CODE_END(purgatory_start) .data +.align LGREG SYM_DATA(riscv_kernel_entry, .quad 0) .end From 63ba5b0fb4f54db256ec43b3062b2606b383055d Mon Sep 17 00:00:00 2001 From: Eric Lin Date: Fri, 19 Jul 2024 19:50:18 +0800 Subject: [PATCH 0422/1052] perf arch events: Fix duplicate RISC-V SBI firmware event name Currently, the RISC-V firmware JSON file has duplicate event name "FW_SFENCE_VMA_RECEIVED". According to the RISC-V SBI PMU extension[1], the event name should be "FW_SFENCE_VMA_ASID_SENT". Before this patch: $ perf list firmware: fw_access_load [Load access trap event. Unit: cpu] fw_access_store [Store access trap event. Unit: cpu] .... fw_set_timer [Set timer event. Unit: cpu] fw_sfence_vma_asid_received [Received SFENCE.VMA with ASID request from other HART event. Unit: cpu] fw_sfence_vma_received [Sent SFENCE.VMA with ASID request to other HART event. Unit: cpu] After this patch: $ perf list firmware: fw_access_load [Load access trap event. 
Unit: cpu] fw_access_store [Store access trap event. Unit: cpu] ..... fw_set_timer [Set timer event. Unit: cpu] fw_sfence_vma_asid_received [Received SFENCE.VMA with ASID request from other HART event. Unit: cpu] fw_sfence_vma_asid_sent [Sent SFENCE.VMA with ASID request to other HART event. Unit: cpu] fw_sfence_vma_received [Received SFENCE.VMA request from other HART event. Unit: cpu] Link: https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-pmu.adoc#event-firmware-events-type-15 [1] Fixes: 8f0dcb4e7364 ("perf arch events: riscv sbi firmware std event files") Fixes: c4f769d4093d ("perf vendor events riscv: add Sifive U74 JSON file") Fixes: acbf6de674ef ("perf vendor events riscv: Add StarFive Dubhe-80 JSON file") Fixes: 7340c6df49df ("perf vendor events riscv: add T-HEAD C9xx JSON file") Fixes: f5102e31c209 ("riscv: andes: Support specifying symbolic firmware and hardware raw event") Signed-off-by: Eric Lin Reviewed-by: Samuel Holland Reviewed-by: Nikita Shubin Reviewed-by: Inochi Amaoto Reviewed-by: Andrew Jones Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20240719115018.27356-1-eric.lin@sifive.com Signed-off-by: Palmer Dabbelt --- tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json | 2 +- tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json | 2 +- tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json | 2 +- .../perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json | 2 +- .../perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json index 9b4a032186a7..7149caec4f80 100644 --- a/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json +++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json @@ -36,7 +36,7 @@ "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" }, { - "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT" }, { "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" diff --git a/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json b/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json index a9939823b14b..0c9b9a2d2958 100644 --- a/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json +++ b/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json @@ -74,7 +74,7 @@ { "PublicDescription": "Sent SFENCE.VMA with ASID request to other HART event", "ConfigCode": "0x800000000000000c", - "EventName": "FW_SFENCE_VMA_RECEIVED", + "EventName": "FW_SFENCE_VMA_ASID_SENT", "BriefDescription": "Sent SFENCE.VMA with ASID request to other HART event" }, { diff --git a/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json b/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json index 9b4a032186a7..7149caec4f80 100644 --- a/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json +++ b/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json @@ -36,7 +36,7 @@ "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" }, { - "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT" }, { "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" diff --git a/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json index 9b4a032186a7..7149caec4f80 100644 --- a/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json +++ b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json @@ -36,7 +36,7 @@ "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" }, { - 
"ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT" }, { "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" diff --git a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json index 9b4a032186a7..7149caec4f80 100644 --- a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json +++ b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json @@ -36,7 +36,7 @@ "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" }, { - "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT" }, { "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" From 57e5c814e91577a464484cc4b1a56ff86371a713 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Mon, 22 Jul 2024 08:45:20 -0700 Subject: [PATCH 0423/1052] cache: StarFive: Require a 64-bit system This has a bunch of {read,write}q() calls, so it won't work on 32-bit systems. I don't think there's any 32-bit StarFive systems, so for now just require 64-bit. Fixes: cabff60ca77d ("cache: Add StarFive StarLink cache management") Acked-by: Conor Dooley Reviewed-by: Emil Renner Berthing Link: https://lore.kernel.org/r/20240722154519.25375-2-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- drivers/cache/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cache/Kconfig b/drivers/cache/Kconfig index 94abd8f632a7..db51386c663a 100644 --- a/drivers/cache/Kconfig +++ b/drivers/cache/Kconfig @@ -18,6 +18,7 @@ config STARFIVE_STARLINK_CACHE bool "StarFive StarLink Cache controller" depends on RISCV depends on ARCH_STARFIVE + depends on 64BIT select RISCV_DMA_NONCOHERENT select RISCV_NONSTANDARD_CACHE_OPS help From 941a8e9b7a86763ac52d5bf6ccc9986d37fde628 Mon Sep 17 00:00:00 2001 From: Shifrin Dmitry Date: Mon, 29 Jul 2024 15:58:58 +0300 Subject: [PATCH 0424/1052] perf: riscv: Fix selecting counters in legacy mode It is required to check event type before checking event config. Events with the different types can have the same config. This check is missed for legacy mode code For such perf usage: sysctl -w kernel.perf_user_access=2 perf stat -e cycles,L1-dcache-loads -- driver will try to force both events to CYCLE counter. This commit implements event type check before forcing events on the special counters. Signed-off-by: Shifrin Dmitry Reviewed-by: Atish Patra Fixes: cc4c07c89aad ("drivers: perf: Implement perf event mmap support in the SBI backend") Link: https://lore.kernel.org/r/20240729125858.630653-1-dmitry.shifrin@syntacore.com Signed-off-by: Palmer Dabbelt --- drivers/perf/riscv_pmu_sbi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 44d3951d009f..31a17a56eb3b 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -416,7 +416,7 @@ static int pmu_sbi_ctr_get_idx(struct perf_event *event) * but not in the user access mode as we want to use the other counters * that support sampling/filtering. 
*/ - if (hwc->flags & PERF_EVENT_FLAG_LEGACY) { + if ((hwc->flags & PERF_EVENT_FLAG_LEGACY) && (event->attr.type == PERF_TYPE_HARDWARE)) { if (event->attr.config == PERF_COUNT_HW_CPU_CYCLES) { cflags |= SBI_PMU_CFG_FLAG_SKIP_MATCH; cmask = 1; From 0c710050c47d45eb77b28c271cddefc5c785cb40 Mon Sep 17 00:00:00 2001 From: Zhe Qiao Date: Wed, 31 Jul 2024 16:45:47 +0800 Subject: [PATCH 0425/1052] riscv/mm: Add handling for VM_FAULT_SIGSEGV in mm_fault_error() Handle VM_FAULT_SIGSEGV in the page fault path so that we correctly kill the process and we don't BUG() the kernel. Fixes: 07037db5d479 ("RISC-V: Paging and MMU") Signed-off-by: Zhe Qiao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240731084547.85380-1-qiaozhe@iscas.ac.cn Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 5224f3733802..a9f2b4af8f3f 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -61,26 +61,27 @@ static inline void no_context(struct pt_regs *regs, unsigned long addr) static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault) { + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } + if (fault & VM_FAULT_OOM) { /* * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). */ - if (!user_mode(regs)) { - no_context(regs, addr); - return; - } pagefault_out_of_memory(); return; } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) { /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) { - no_context(regs, addr); - return; - } do_trap(regs, SIGBUS, BUS_ADRERR, addr); return; + } else if (fault & VM_FAULT_SIGSEGV) { + do_trap(regs, SIGSEGV, SEGV_MAPERR, addr); + return; } + BUG(); } From 3908ba2e0b2476e2ec13e15967bf6a37e449f2af Mon Sep 17 00:00:00 2001 From: Nick Hu Date: Wed, 17 Jul 2024 11:17:14 +0800 Subject: [PATCH 0426/1052] RISC-V: Enable the IPI before workqueue_online_cpu() Sometimes the hotplug cpu stalls at the arch_cpu_idle() for a while after workqueue_online_cpu(). When cpu stalls at the idle loop, the reschedule IPI is pending. However the enable bit is not enabled yet so the cpu stalls at WFI until watchdog timeout. Therefore enable the IPI before the workqueue_online_cpu() to fix the issue. 
Fixes: 63c5484e7495 ("workqueue: Add multiple affinity scopes and interface to select them") Signed-off-by: Nick Hu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20240717031714.1946036-1-nick.hu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sbi-ipi.c | 2 +- include/linux/cpuhotplug.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/sbi-ipi.c b/arch/riscv/kernel/sbi-ipi.c index 1026e22955cc..0cc5559c08d8 100644 --- a/arch/riscv/kernel/sbi-ipi.c +++ b/arch/riscv/kernel/sbi-ipi.c @@ -71,7 +71,7 @@ void __init sbi_ipi_init(void) * the masking/unmasking of virtual IPIs is done * via generic IPI-Mux */ - cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + cpuhp_setup_state(CPUHP_AP_IRQ_RISCV_SBI_IPI_STARTING, "irqchip/sbi-ipi:starting", sbi_ipi_starting_cpu, NULL); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 51ba681b915a..e30d93b807d5 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -148,6 +148,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_LOONGARCH_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, CPUHP_AP_IRQ_RISCV_IMSIC_STARTING, + CPUHP_AP_IRQ_RISCV_SBI_IPI_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, From 58d245e03c324d083a0ec3b9ab8ebd46ec9848d7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 1 Aug 2024 11:18:01 +0100 Subject: [PATCH 0427/1052] arm64: cputype: Add Cortex-X1C definitions Add cputype definitions for Cortex-X1C. These will be used for errata detection in subsequent patches. These values can be found in the Cortex-X1C TRM: https://developer.arm.com/documentation/101968/0002/ ... in section B2.107 ("MIDR_EL1, Main ID Register, EL1"). Signed-off-by: Mark Rutland Cc: James Morse Cc: Will Deacon Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240801101803.1982459-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 1cb0704c6163..5dc68ace305e 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -86,6 +86,7 @@ #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 #define ARM_CPU_PART_CORTEX_A78C 0xD4B +#define ARM_CPU_PART_CORTEX_X1C 0xD4C #define ARM_CPU_PART_CORTEX_X3 0xD4E #define ARM_CPU_PART_NEOVERSE_V2 0xD4F #define ARM_CPU_PART_CORTEX_A720 0xD81 @@ -165,6 +166,7 @@ #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) +#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) #define MIDR_CORTEX_X3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X3) #define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) #define MIDR_CORTEX_A720 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720) From 9ef54a384526911095db465e77acc1cb5266b32c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 1 Aug 2024 11:18:02 +0100 Subject: [PATCH 0428/1052] arm64: cputype: Add Cortex-A725 definitions Add cputype definitions for Cortex-A725. These will be used for errata detection in subsequent patches. These values can be found in the Cortex-A725 TRM: https://developer.arm.com/documentation/107652/0001/ ... in table A-247 ("MIDR_EL1 bit descriptions"). 
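For reference, a small standalone sketch showing how such a MIDR value is
composed. It mirrors the shape of the kernel's MIDR_CPU_MODEL() macro
(implementer in bits [31:24], architecture 0xf in [19:16], part number in
[15:4]); the program itself is only an illustration, not the kernel header.

#include <stdint.h>
#include <stdio.h>

/* MIDR_EL1 layout: Implementer [31:24], Variant [23:20],
 * Architecture [19:16], PartNum [15:4], Revision [3:0]. */
#define MIDR_IMPLEMENTOR_SHIFT  24
#define MIDR_ARCHITECTURE_SHIFT 16
#define MIDR_PARTNUM_SHIFT      4

#define MIDR_CPU_MODEL(imp, partnum)                        \
        (((uint32_t)(imp) << MIDR_IMPLEMENTOR_SHIFT) |      \
         (0xfu << MIDR_ARCHITECTURE_SHIFT) |                \
         ((uint32_t)(partnum) << MIDR_PARTNUM_SHIFT))

#define ARM_CPU_IMP_ARM          0x41
#define ARM_CPU_PART_CORTEX_A725 0xD87

int main(void)
{
        /* Variant and revision are zero here; the errata code matches
         * all of them via MIDR_ALL_VERSIONS(). */
        printf("Cortex-A725 MIDR model: %#010x\n",
               MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725));
        return 0;  /* prints 0x410fd870 */
}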
Signed-off-by: Mark Rutland Cc: James Morse Cc: Will Deacon Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240801101803.1982459-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 5dc68ace305e..5fd7caea4419 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -93,6 +93,7 @@ #define ARM_CPU_PART_CORTEX_X4 0xD82 #define ARM_CPU_PART_NEOVERSE_V3 0xD84 #define ARM_CPU_PART_CORTEX_X925 0xD85 +#define ARM_CPU_PART_CORTEX_A725 0xD87 #define APM_CPU_PART_XGENE 0x000 #define APM_CPU_VAR_POTENZA 0x00 @@ -173,6 +174,7 @@ #define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4) #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) #define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925) +#define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) From adeec61a4723fd3e39da68db4cc4d924e6d7f641 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 1 Aug 2024 11:18:03 +0100 Subject: [PATCH 0429/1052] arm64: errata: Expand speculative SSBS workaround (again) A number of Arm Ltd CPUs suffer from errata whereby an MSR to the SSBS special-purpose register does not affect subsequent speculative instructions, permitting speculative store bypassing for a window of time. We worked around this for a number of CPUs in commits: * 7187bb7d0b5c7dfa ("arm64: errata: Add workaround for Arm errata 3194386 and 3312417") * 75b3c43eab594bfb ("arm64: errata: Expand speculative SSBS workaround") Since then, similar errata have been published for a number of other Arm Ltd CPUs, for which the same mitigation is sufficient. This is described in their respective Software Developer Errata Notice (SDEN) documents: * Cortex-A76 (MP052) SDEN v31.0, erratum 3324349 https://developer.arm.com/documentation/SDEN-885749/3100/ * Cortex-A77 (MP074) SDEN v19.0, erratum 3324348 https://developer.arm.com/documentation/SDEN-1152370/1900/ * Cortex-A78 (MP102) SDEN v21.0, erratum 3324344 https://developer.arm.com/documentation/SDEN-1401784/2100/ * Cortex-A78C (MP138) SDEN v16.0, erratum 3324346 https://developer.arm.com/documentation/SDEN-1707916/1600/ * Cortex-A78C (MP154) SDEN v10.0, erratum 3324347 https://developer.arm.com/documentation/SDEN-2004089/1000/ * Cortex-A725 (MP190) SDEN v5.0, erratum 3456106 https://developer.arm.com/documentation/SDEN-2832921/0500/ * Cortex-X1 (MP077) SDEN v21.0, erratum 3324344 https://developer.arm.com/documentation/SDEN-1401782/2100/ * Cortex-X1C (MP136) SDEN v16.0, erratum 3324346 https://developer.arm.com/documentation/SDEN-1707914/1600/ * Neoverse-N1 (MP050) SDEN v32.0, erratum 3324349 https://developer.arm.com/documentation/SDEN-885747/3200/ * Neoverse-V1 (MP076) SDEN v19.0, erratum 3324341 https://developer.arm.com/documentation/SDEN-1401781/1900/ Note that due to the manner in which Arm develops IP and tracks errata, some CPUs share a common erratum number and some CPUs have multiple erratum numbers for the same HW issue. On parts without SB, it is necessary to use ISB for the workaround. 
The spec_bar() macro used in the mitigation will expand to a "DSB SY; ISB" sequence in this case, which is sufficient on all affected parts. Enable the existing mitigation by adding the relevant MIDRs to erratum_spec_ssbs_list. The list is sorted alphanumerically (involving moving Neoverse-V3 after Neoverse-V2) so that this is easy to audit and potentially extend again in future. The Kconfig text is also updated to clarify the set of affected parts and the mitigation. Signed-off-by: Mark Rutland Cc: James Morse Cc: Will Deacon Reviewed-by: Anshuman Khandual Acked-by: Will Deacon Link: https://lore.kernel.org/r/20240801101803.1982459-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/silicon-errata.rst | 18 +++++++++++++++++ arch/arm64/Kconfig | 22 +++++++++++++++------ arch/arm64/kernel/cpu_errata.c | 11 ++++++++++- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index bb83c5d8c675..50327c05be8d 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -122,10 +122,18 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1490853 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A76 | #3324349 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A77 | #1491015 | N/A | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A77 | #1508412 | ARM64_ERRATUM_1508412 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A77 | #3324348 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A78 | #3324344 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A78C | #3324346,3324347| ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 | @@ -138,8 +146,14 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A720 | #3456091 | ARM64_ERRATUM_3194386 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A725 | #3456106 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-X1 | #1502854 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-X1 | #3324344 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-X1C | #3324346 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-X2 | #2119858 | ARM64_ERRATUM_2119858 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-X2 | #2224489 | ARM64_ERRATUM_2224489 | @@ -160,6 +174,8 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1542419 | ARM64_ERRATUM_1542419 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Neoverse-N1 | #3324349 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N2 | #2139208 | ARM64_ERRATUM_2139208 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N2 | #2067961 | ARM64_ERRATUM_2067961 | @@ -170,6 +186,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-V1 | #1619801 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Neoverse-V1 | #3324341 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-V2 | #3324336 | ARM64_ERRATUM_3194386 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-V3 | #3312417 | ARM64_ERRATUM_3194386 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b3fc891f1544..a2f8ff354ca6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1069,18 +1069,28 @@ config ARM64_ERRATUM_3117295 If unsure, say Y. config ARM64_ERRATUM_3194386 - bool "Cortex-{A720,X4,X925}/Neoverse-V3: workaround for MSR SSBS not self-synchronizing" + bool "Cortex-*/Neoverse-*: workaround for MSR SSBS not self-synchronizing" default y help This option adds the workaround for the following errata: + * ARM Cortex-A76 erratum 3324349 + * ARM Cortex-A77 erratum 3324348 + * ARM Cortex-A78 erratum 3324344 + * ARM Cortex-A78C erratum 3324346 + * ARM Cortex-A78C erratum 3324347 * ARM Cortex-A710 erratam 3324338 * ARM Cortex-A720 erratum 3456091 + * ARM Cortex-A725 erratum 3456106 + * ARM Cortex-X1 erratum 3324344 + * ARM Cortex-X1C erratum 3324346 * ARM Cortex-X2 erratum 3324338 * ARM Cortex-X3 erratum 3324335 * ARM Cortex-X4 erratum 3194386 * ARM Cortex-X925 erratum 3324334 + * ARM Neoverse-N1 erratum 3324349 * ARM Neoverse N2 erratum 3324339 + * ARM Neoverse-V1 erratum 3324341 * ARM Neoverse V2 erratum 3324336 * ARM Neoverse-V3 erratum 3312417 @@ -1088,11 +1098,11 @@ config ARM64_ERRATUM_3194386 subsequent speculative instructions, which may permit unexepected speculative store bypassing. - Work around this problem by placing a speculation barrier after - kernel changes to SSBS. The presence of the SSBS special-purpose - register is hidden from hwcaps and EL0 reads of ID_AA64PFR1_EL1, such - that userspace will use the PR_SPEC_STORE_BYPASS prctl to change - SSBS. + Work around this problem by placing a Speculation Barrier (SB) or + Instruction Synchronization Barrier (ISB) after kernel changes to + SSBS. The presence of the SSBS special-purpose register is hidden + from hwcaps and EL0 reads of ID_AA64PFR1_EL1, such that userspace + will use the PR_SPEC_STORE_BYPASS prctl to change SSBS. If unsure, say Y. 
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 617424b73f8c..f6b6b4507357 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -434,15 +434,24 @@ static const struct midr_range erratum_spec_unpriv_load_list[] = { #ifdef CONFIG_ARM64_ERRATUM_3194386 static const struct midr_range erratum_spec_ssbs_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), MIDR_ALL_VERSIONS(MIDR_CORTEX_X4), MIDR_ALL_VERSIONS(MIDR_CORTEX_X925), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), {} }; #endif From f126745da81783fb1d082e67bf14c6795e489a88 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 29 Jul 2024 14:22:49 +0000 Subject: [PATCH 0430/1052] rust: SHADOW_CALL_STACK is incompatible with Rust When using the shadow call stack sanitizer, all code must be compiled with the -ffixed-x18 flag, but this flag is not currently being passed to Rust. This results in crashes that are extremely difficult to debug. To ensure that nobody else has to go through the same debugging session that I had to, prevent configurations that enable both SHADOW_CALL_STACK and RUST. It is rather common for people to backport 724a75ac9542 ("arm64: rust: Enable Rust support for AArch64"), so I recommend applying this fix all the way back to 6.1. Cc: stable@vger.kernel.org # 6.1 and later Fixes: 724a75ac9542 ("arm64: rust: Enable Rust support for AArch64") Signed-off-by: Alice Ryhl Acked-by: Miguel Ojeda Link: https://lore.kernel.org/r/20240729-shadow-call-stack-v4-1-2a664b082ea4@google.com Signed-off-by: Catalin Marinas --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index a465ea9525bd..37260d17267e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1902,6 +1902,7 @@ config RUST depends on !MODVERSIONS depends on !GCC_PLUGINS depends on !RANDSTRUCT + depends on !SHADOW_CALL_STACK depends on !DEBUG_INFO_BTF || PAHOLE_HAS_LANG_EXCLUDE help Enables Rust support in the kernel. From 33eb1e5db351e2c0e652d878b66b8a6d4d013135 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 8 Mar 2024 13:40:30 +1030 Subject: [PATCH 0431/1052] btrfs: factor out stripe length calculation into a helper Currently there are two locations which need to calculate the real length of a stripe (which can be at the end of a chunk, and the chunk size may not always be 64K aligned). Factor them into a helper as we're going to have a third user soon. 
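To illustrate the calculation that the helper centralizes, a standalone
sketch with hypothetical chunk geometry (the 64 KiB stripe length matches
btrfs; the start and length values below are made up):

#include <stdint.h>
#include <stdio.h>

#define BTRFS_STRIPE_LEN (64u * 1024)   /* 64 KiB, as in btrfs */

static uint32_t stripe_length(uint64_t bg_start, uint64_t bg_length,
                              uint64_t logical)
{
        uint64_t left = bg_start + bg_length - logical;

        return left < BTRFS_STRIPE_LEN ? (uint32_t)left : BTRFS_STRIPE_LEN;
}

int main(void)
{
        /* Hypothetical chunk: starts at 1 GiB, length not 64K aligned. */
        uint64_t bg_start  = 1ull << 30;
        uint64_t bg_length = 3 * (uint64_t)BTRFS_STRIPE_LEN + 4096;

        /* A stripe in the middle of the chunk is full sized... */
        printf("%u\n", stripe_length(bg_start, bg_length,
                                     bg_start + BTRFS_STRIPE_LEN));
        /* ...but the last stripe only covers what is left of the chunk. */
        printf("%u\n", stripe_length(bg_start, bg_length,
                                     bg_start + 3 * BTRFS_STRIPE_LEN));
        return 0;  /* prints 65536 then 4096 */
}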
Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 14a8d7100018..439545710d76 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1648,14 +1648,20 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) } } +static u32 stripe_length(const struct scrub_stripe *stripe) +{ + ASSERT(stripe->bg); + + return min(BTRFS_STRIPE_LEN, + stripe->bg->start + stripe->bg->length - stripe->logical); +} + static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; - unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + - stripe->bg->length - stripe->logical) >> - fs_info->sectorsize_bits; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1729,9 +1735,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; - unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + - stripe->bg->length - stripe->logical) >> - fs_info->sectorsize_bits; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); From 63447b7dd40c6a9ae8d3bb70c11f4c46731823e3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 8 Mar 2024 13:40:31 +1030 Subject: [PATCH 0432/1052] btrfs: scrub: update last_physical after scrubbing one stripe Currently sctx->stat.last_physical only got updated in the following cases: - When the last stripe of a non-RAID56 chunk is scrubbed This implies a pitfall, if the last stripe is at the chunk boundary, and we finished the scrub of the whole chunk, we won't update last_physical at all until the next chunk. - When a P/Q stripe of a RAID56 chunk is scrubbed This leads the following two problems: - sctx->stat.last_physical is not updated for a almost full chunk This is especially bad, affecting scrub resume, as the resume would start from last_physical, causing unnecessary re-scrub. - "btrfs scrub status" will not report any progress for a long time Fix the problem by properly updating @last_physical after each stripe is scrubbed. And since we're here, for the sake of consistency, use spin lock to protect the update of @last_physical, just like all the remaining call sites touching sctx->stat. 
Reported-by: Michel Palleau Link: https://lore.kernel.org/linux-btrfs/CAMFk-+igFTv2E8svg=cQ6o3e6CrR5QwgQ3Ok9EyRaEvvthpqCQ@mail.gmail.com/ Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 439545710d76..0de9162ff481 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1875,6 +1875,9 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) stripe = &sctx->stripes[i]; wait_scrub_stripe_io(stripe); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = stripe->physical + stripe_length(stripe); + spin_unlock(&sctx->stat_lock); scrub_reset_stripe(stripe); } out: @@ -2143,7 +2146,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ + spin_lock(&sctx->stat_lock); sctx->stat.last_physical = physical + logical_length; + spin_unlock(&sctx->stat_lock); ret = 0; break; } @@ -2340,6 +2345,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, stripe_logical += chunk_logical; ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, map, stripe_logical); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN, + physical_end); + spin_unlock(&sctx->stat_lock); if (ret) goto out; goto next; From 872617a0896fc7510b0b8f25d323670424461cfc Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 18 Jul 2024 14:46:23 -0700 Subject: [PATCH 0433/1052] btrfs: implement launder_folio for clearing dirty page reserve In the buffered write path, dirty pages can be said to "own" the qgroup reservation until they create an ordered_extent. It is possible for there to be outstanding dirty pages when a transaction is aborted, in which case there is no cancellation path for freeing this reservation and it is leaked. We do already walk the list of outstanding delalloc inodes in btrfs_destroy_delalloc_inodes() and call invalidate_inode_pages2() on them. This does *not* call btrfs_invalidate_folio(), as one might guess, but rather calls launder_folio() and release_folio(). Since this is a reservation associated with dirty pages only, rather than something associated with the private bit (ordered_extent is cancelled separately already in the cleanup transaction path), implementing this release should be done via launder_folio. 
Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1d4e0a65494a..1659ed3a0ba9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7202,6 +7202,12 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } +static int btrfs_launder_folio(struct folio *folio) +{ + return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), + PAGE_SIZE, NULL); +} + static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { if (try_release_extent_mapping(&folio->page, gfp_flags)) { @@ -10137,6 +10143,7 @@ static const struct address_space_operations btrfs_aops = { .writepages = btrfs_writepages, .readahead = btrfs_readahead, .invalidate_folio = btrfs_invalidate_folio, + .launder_folio = btrfs_launder_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, From 30479f31d44d47ed00ae0c7453d9b253537005b2 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 19 Jul 2024 16:49:08 -0700 Subject: [PATCH 0434/1052] btrfs: fix qgroup reserve leaks in cow_file_range In the buffered write path, the dirty page owns the qgroup reserve until it creates an ordered_extent. Therefore, any errors that occur before the ordered_extent is created must free that reservation, or else the space is leaked. The fstest generic/475 exercises various IO error paths, and is able to trigger errors in cow_file_range where we fail to get to allocating the ordered extent. Note that because we *do* clear delalloc, we are likely to remove the inode from the delalloc list, so the inodes/pages to not have invalidate/launder called on them in the commit abort path. This results in failures at the unmount stage of the test that look like: BTRFS: error (device dm-8 state EA) in cleanup_transaction:2018: errno=-5 IO failure BTRFS: error (device dm-8 state EA) in btrfs_replace_file_extents:2416: errno=-5 IO failure BTRFS warning (device dm-8 state EA): qgroup 0/5 has unreleased space, type 0 rsv 28672 ------------[ cut here ]------------ WARNING: CPU: 3 PID: 22588 at fs/btrfs/disk-io.c:4333 close_ctree+0x222/0x4d0 [btrfs] Modules linked in: btrfs blake2b_generic libcrc32c xor zstd_compress raid6_pq CPU: 3 PID: 22588 Comm: umount Kdump: loaded Tainted: G W 6.10.0-rc7-gab56fde445b8 #21 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:close_ctree+0x222/0x4d0 [btrfs] RSP: 0018:ffffb4465283be00 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffffa1a1818e1000 RCX: 0000000000000001 RDX: 0000000000000000 RSI: ffffb4465283bbe0 RDI: ffffa1a19374fcb8 RBP: ffffa1a1818e13c0 R08: 0000000100028b16 R09: 0000000000000000 R10: 0000000000000003 R11: 0000000000000003 R12: ffffa1a18ad7972c R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f9168312b80(0000) GS:ffffa1a4afcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f91683c9140 CR3: 000000010acaa000 CR4: 00000000000006f0 Call Trace: ? close_ctree+0x222/0x4d0 [btrfs] ? __warn.cold+0x8e/0xea ? close_ctree+0x222/0x4d0 [btrfs] ? report_bug+0xff/0x140 ? handle_bug+0x3b/0x70 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? 
close_ctree+0x222/0x4d0 [btrfs] generic_shutdown_super+0x70/0x160 kill_anon_super+0x11/0x40 btrfs_kill_super+0x11/0x20 [btrfs] deactivate_locked_super+0x2e/0xa0 cleanup_mnt+0xb5/0x150 task_work_run+0x57/0x80 syscall_exit_to_user_mode+0x121/0x130 do_syscall_64+0xab/0x1a0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f916847a887 ---[ end trace 0000000000000000 ]--- BTRFS error (device dm-8 state EA): qgroup reserved space leaked Cases 2 and 3 in the out_reserve path both pertain to this type of leak and must free the reserved qgroup data. Because it is already an error path, I opted not to handle the possible errors in btrfs_free_qgroup_data. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1659ed3a0ba9..ceee5422089b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1585,6 +1585,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, locked_page, &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); start += cur_alloc_size; } @@ -1598,6 +1599,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, clear_bits |= EXTENT_CLEAR_DATA_RESV; extent_clear_unlock_delalloc(inode, start, end, locked_page, &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } return ret; } @@ -2259,6 +2261,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); return ret; From 1e7bec1f7d6533f08bc1c4ee94930c02361db86c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 29 Jul 2024 11:05:48 -0400 Subject: [PATCH 0435/1052] btrfs: emit a warning about space cache v1 being deprecated We've been wanting to get rid of this for a while, add a message to indicate that this feature is going away and when so we can finally have a date when we're going to remove it. The output looks like this BTRFS warning (device nvme0n1): space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2 Reviewed-by: Qu Wenruo Reviewed-by: Neal Gompa Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 08d33cb372fb..83478deada3b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -683,8 +683,11 @@ bool btrfs_check_options(const struct btrfs_fs_info *info, ret = false; if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { - if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_info(info, "disk space caching is enabled"); + btrfs_warn(info, +"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); + } if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) btrfs_info(info, "using free-space-tree"); } From 00f89ae4e759a7eef07e4188e1534af7dd2c7e9c Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Thu, 25 Jul 2024 14:07:30 +0200 Subject: [PATCH 0436/1052] PCI: Fix devres regression in pci_intx() pci_intx() becomes managed if pcim_enable_device() has been called in advance. 
Commit 25216afc9db5 ("PCI: Add managed pcim_intx()") changed this behavior so that pci_intx() always leads to creation of a separate device resource for itself, whereas earlier, a shared resource was used for all PCI devres operations. Unfortunately, pci_intx() seems to be used in some drivers' remove() paths; in the managed case this causes a device resource to be created on driver detach, which causes .probe() to fail if the driver is reloaded: pci 0000:00:1f.2: Resources present before probing Fix the regression by only redirecting pci_intx() to its managed twin pcim_intx() if the pci_command changes. Link: https://lore.kernel.org/r/20240725120729.59788-2-pstanner@redhat.com Fixes: 25216afc9db5 ("PCI: Add managed pcim_intx()") Reported-by: Damien Le Moal Closes: https://lore.kernel.org/all/b8f4ba97-84fc-4b7e-ba1a-99de2d9f0118@kernel.org/ Signed-off-by: Philipp Stanner [bhelgaas: add error message to commit log] Signed-off-by: Bjorn Helgaas Tested-by: Damien Le Moal --- drivers/pci/pci.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e3a49f66982d..ffaaca0978cb 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4477,12 +4477,6 @@ void pci_intx(struct pci_dev *pdev, int enable) { u16 pci_command, new; - /* Preserve the "hybrid" behavior for backwards compatibility */ - if (pci_is_managed(pdev)) { - WARN_ON_ONCE(pcim_intx(pdev, enable) != 0); - return; - } - pci_read_config_word(pdev, PCI_COMMAND, &pci_command); if (enable) @@ -4490,8 +4484,15 @@ void pci_intx(struct pci_dev *pdev, int enable) else new = pci_command | PCI_COMMAND_INTX_DISABLE; - if (new != pci_command) + if (new != pci_command) { + /* Preserve the "hybrid" behavior for backwards compatibility */ + if (pci_is_managed(pdev)) { + WARN_ON_ONCE(pcim_intx(pdev, enable) != 0); + return; + } + pci_write_config_word(pdev, PCI_COMMAND, new); + } } EXPORT_SYMBOL_GPL(pci_intx); From 5560a612c20d3daacbf5da7913deefa5c31742f4 Mon Sep 17 00:00:00 2001 From: Blazej Kucman Date: Mon, 22 Jul 2024 16:14:40 +0200 Subject: [PATCH 0437/1052] PCI: pciehp: Retain Power Indicator bits for userspace indicators The sysfs "attention" file normally controls the Slot Control Attention Indicator with 0 (off), 1 (on), 2 (blink) settings. 576243b3f9ea ("PCI: pciehp: Allow exclusive userspace control of indicators") added pciehp_set_raw_indicator_status() to allow userspace to directly control all four bits in both the Attention Indicator and the Power Indicator fields via the "attention" file. This is used on Intel VMD bridges so utilities like "ledmon" can use sysfs "attention" to control up to 16 indicators for NVMe device RAID status. abaaac4845a0 ("PCI: hotplug: Use FIELD_GET/PREP()") broke this by masking the sysfs data with PCI_EXP_SLTCTL_AIC, which discards the upper two bits intended for the Power Indicator Control field (PCI_EXP_SLTCTL_PIC). For NVMe devices behind an Intel VMD, ledmon settings that use the PCI_EXP_SLTCTL_PIC bits, i.e., ATTENTION_REBUILD (0x5), ATTENTION_LOCATE (0x7), ATTENTION_FAILURE (0xD), ATTENTION_OFF (0xF), no longer worked correctly. Mask with PCI_EXP_SLTCTL_AIC | PCI_EXP_SLTCTL_PIC to retain both the Attention Indicator and the Power Indicator bits. 
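To illustrate the difference, a standalone sketch using a simplified
stand-in for the kernel's FIELD_PREP(); the AIC/PIC masks are the PCIe Slot
Control field definitions, and 0xD is the ATTENTION_FAILURE example
mentioned above:

#include <stdint.h>
#include <stdio.h>

#define PCI_EXP_SLTCTL_AIC 0x00c0  /* Attention Indicator Control, bits 7:6 */
#define PCI_EXP_SLTCTL_PIC 0x0300  /* Power Indicator Control,    bits 9:8 */

/* Simplified stand-in for the kernel's FIELD_PREP(): shift the value to
 * the field position and drop whatever does not fit in the mask. */
static uint16_t field_prep(uint16_t mask, uint16_t val)
{
        return (uint16_t)((val << __builtin_ctz(mask)) & mask);
}

int main(void)
{
        uint16_t status = 0xD;  /* ledmon's ATTENTION_FAILURE, see above */

        /* AIC alone keeps only the low two bits of the status... */
        printf("AIC only: %#06x\n", field_prep(PCI_EXP_SLTCTL_AIC, status));
        /* ...AIC | PIC keeps all four indicator bits. */
        printf("AIC|PIC : %#06x\n",
               field_prep(PCI_EXP_SLTCTL_AIC | PCI_EXP_SLTCTL_PIC, status));
        return 0;  /* prints 0x0040 then 0x0340 */
}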
Fixes: abaaac4845a0 ("PCI: hotplug: Use FIELD_GET/PREP()") Link: https://lore.kernel.org/r/20240722141440.7210-1-blazej.kucman@intel.com Signed-off-by: Blazej Kucman [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v6.7+ --- drivers/pci/hotplug/pciehp_hpc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 061f01f60db4..736ad8baa2a5 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -485,7 +485,9 @@ int pciehp_set_raw_indicator_status(struct hotplug_slot *hotplug_slot, struct pci_dev *pdev = ctrl_dev(ctrl); pci_config_pm_runtime_get(pdev); - pcie_write_cmd_nowait(ctrl, FIELD_PREP(PCI_EXP_SLTCTL_AIC, status), + + /* Attention and Power Indicator Control bits are supported */ + pcie_write_cmd_nowait(ctrl, FIELD_PREP(PCI_EXP_SLTCTL_AIC | PCI_EXP_SLTCTL_PIC, status), PCI_EXP_SLTCTL_AIC | PCI_EXP_SLTCTL_PIC); pci_config_pm_runtime_put(pdev); return 0; From aca0ec970d7644954caf01a40c5538d4400c01bd Mon Sep 17 00:00:00 2001 From: Ackerley Tng Date: Thu, 1 Aug 2024 17:39:55 +0000 Subject: [PATCH 0438/1052] KVM: x86/mmu: fix determination of max NPT mapping level for private pages The `if (req_max_level)` test was meant ignore req_max_level if PG_LEVEL_NONE was returned. Hence, this function should return max_level instead of the ignored req_max_level. This is only a latent issue for now, since guest_memfd does not support large pages. Signed-off-by: Ackerley Tng Message-ID: <20240801173955.1975034-1-ackerleytng@google.com> Fixes: f32fb32820b1 ("KVM: x86: Add hook for determining max NPT mapping level") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 43a02891babb..928cf84778b0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4335,7 +4335,7 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, if (req_max_level) max_level = min(max_level, req_max_level); - return req_max_level; + return max_level; } static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, From 3b6564427aea83b7a35a15ca278291d50a1edcfc Mon Sep 17 00:00:00 2001 From: Stuart Menefy Date: Sat, 22 Jun 2024 12:42:16 +0100 Subject: [PATCH 0439/1052] riscv: Fix linear mapping checks for non-contiguous memory regions The RISC-V kernel already has checks to ensure that memory which would lie outside of the linear mapping is not used. However those checks use memory_limit, which is used to implement the mem= kernel command line option (to limit the total amount of memory, not its address range). When memory is made up of two or more non-contiguous memory banks this check is incorrect. Two changes are made here: - add a call in setup_bootmem() to memblock_cap_memory_range() which will cause any memory which falls outside the linear mapping to be removed from the memory regions. - remove the check in create_linear_mapping_page_table() which was intended to remove memory which is outside the liner mapping based on memory_limit, as it is no longer needed. Note a check for mapping more memory than memory_limit (to implement mem=) is unnecessary because of the existing call to memblock_enforce_memory_limit(). 
This issue was seen when booting on a SV39 platform with two memory banks: 0x00,80000000 1GiB 0x20,00000000 32GiB This memory range is 158GiB from top to bottom, but the linear mapping is limited to 128GiB, so the lower block of RAM will be mapped at PAGE_OFFSET, and the upper block straddles the top of the linear mapping. This causes the following Oops: [ 0.000000] Linux version 6.10.0-rc2-gd3b8dd5b51dd-dirty (stuart.menefy@codasip.com) (riscv64-codasip-linux-gcc (GCC) 13.2.0, GNU ld (GNU Binutils) 2.41.0.20231213) #20 SMP Sat Jun 22 11:34:22 BST 2024 [ 0.000000] memblock_add: [0x0000000080000000-0x00000000bfffffff] early_init_dt_add_memory_arch+0x4a/0x52 [ 0.000000] memblock_add: [0x0000002000000000-0x00000027ffffffff] early_init_dt_add_memory_arch+0x4a/0x52 ... [ 0.000000] memblock_alloc_try_nid: 23724 bytes align=0x8 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_init_dt_alloc_memory_arch+0x1e/0x48 [ 0.000000] memblock_reserve: [0x00000027ffff5350-0x00000027ffffaffb] memblock_alloc_range_nid+0xb8/0x132 [ 0.000000] Unable to handle kernel paging request at virtual address fffffffe7fff5350 [ 0.000000] Oops [#1] [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 6.10.0-rc2-gd3b8dd5b51dd-dirty #20 [ 0.000000] Hardware name: codasip,a70x (DT) [ 0.000000] epc : __memset+0x8c/0x104 [ 0.000000] ra : memblock_alloc_try_nid+0x74/0x84 [ 0.000000] epc : ffffffff805e88c8 ra : ffffffff806148f6 sp : ffffffff80e03d50 [ 0.000000] gp : ffffffff80ec4158 tp : ffffffff80e0bec0 t0 : fffffffe7fff52f8 [ 0.000000] t1 : 00000027ffffb000 t2 : 5f6b636f6c626d65 s0 : ffffffff80e03d90 [ 0.000000] s1 : 0000000000005cac a0 : fffffffe7fff5350 a1 : 0000000000000000 [ 0.000000] a2 : 0000000000005cac a3 : fffffffe7fffaff8 a4 : 000000000000002c [ 0.000000] a5 : ffffffff805e88c8 a6 : 0000000000005cac a7 : 0000000000000030 [ 0.000000] s2 : fffffffe7fff5350 s3 : ffffffffffffffff s4 : 0000000000000000 [ 0.000000] s5 : ffffffff8062347e s6 : 0000000000000000 s7 : 0000000000000001 [ 0.000000] s8 : 0000000000002000 s9 : 00000000800226d0 s10: 0000000000000000 [ 0.000000] s11: 0000000000000000 t3 : ffffffff8080a928 t4 : ffffffff8080a928 [ 0.000000] t5 : ffffffff8080a928 t6 : ffffffff8080a940 [ 0.000000] status: 0000000200000100 badaddr: fffffffe7fff5350 cause: 000000000000000f [ 0.000000] [] __memset+0x8c/0x104 [ 0.000000] [] early_init_dt_alloc_memory_arch+0x1e/0x48 [ 0.000000] [] __unflatten_device_tree+0x52/0x114 [ 0.000000] [] unflatten_device_tree+0x9e/0xb8 [ 0.000000] [] setup_arch+0xd4/0x5bc [ 0.000000] [] start_kernel+0x76/0x81a [ 0.000000] Code: b823 02b2 bc23 02b2 b023 04b2 b423 04b2 b823 04b2 (bc23) 04b2 [ 0.000000] ---[ end trace 0000000000000000 ]--- [ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task! [ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]--- The problem is that memblock (unaware that some physical memory cannot be used) has allocated memory from the top of memory but which is outside the linear mapping region. 
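The arithmetic behind the layout above, as a standalone sketch; the 128 GiB
window stands in for KERN_VIRT_SIZE and is taken from the description above,
not computed from the kernel headers:

#include <stdint.h>
#include <stdio.h>

#define GiB (1ull << 30)

int main(void)
{
        /* The SV39 layout from the report above: two banks of RAM. */
        uint64_t phys_ram_base = 0x80000000ull;               /* 1 GiB bank  */
        uint64_t bank1_end     = 0x2000000000ull + 32 * GiB;  /* 32 GiB bank */

        /* 128 GiB linear-mapping window, standing in for KERN_VIRT_SIZE. */
        uint64_t max_mapped_addr = phys_ram_base + 128 * GiB;

        printf("RAM span        : %llu GiB\n",
               (unsigned long long)((bank1_end - phys_ram_base) / GiB));
        printf("linear map end  : %#llx\n",
               (unsigned long long)max_mapped_addr);
        printf("capped-off tail : %llu GiB\n",
               (unsigned long long)((bank1_end - max_mapped_addr) / GiB));
        return 0;  /* 158 GiB span; 30 GiB of bank 1 is unmappable */
}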
Signed-off-by: Stuart Menefy Fixes: c99127c45248 ("riscv: Make sure the linear mapping does not use the kernel mapping") Reviewed-by: David McKay Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240622114217.2158495-1-stuart.menefy@codasip.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index bfa2dea95354..8b698d9609e7 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -234,8 +234,6 @@ static void __init setup_bootmem(void) */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); - phys_ram_end = memblock_end_of_DRAM(); - /* * Make sure we align the start of the memory on a PMD boundary so that * at worst, we map the linear mapping with PMD mappings. @@ -250,6 +248,16 @@ static void __init setup_bootmem(void) if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base; + /* + * The size of the linear page mapping may restrict the amount of + * usable RAM. + */ + if (IS_ENABLED(CONFIG_64BIT)) { + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE; + memblock_cap_memory_range(phys_ram_base, + max_mapped_addr - phys_ram_base); + } + /* * Reserve physical address space that would be mapped to virtual * addresses greater than (void *)(-PAGE_SIZE) because: @@ -266,6 +274,7 @@ static void __init setup_bootmem(void) memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr); } + phys_ram_end = memblock_end_of_DRAM(); min_low_pfn = PFN_UP(phys_ram_base); max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); @@ -1284,8 +1293,6 @@ static void __init create_linear_mapping_page_table(void) if (start <= __pa(PAGE_OFFSET) && __pa(PAGE_OFFSET) < end) start = __pa(PAGE_OFFSET); - if (end >= __pa(PAGE_OFFSET) + memory_limit) - end = __pa(PAGE_OFFSET) + memory_limit; create_linear_mapping_range(start, end, 0, NULL); } From dd35a0933269c636635b6af89dc6fa1782791e56 Mon Sep 17 00:00:00 2001 From: David Gow Date: Wed, 31 Jul 2024 15:30:29 +0800 Subject: [PATCH 0440/1052] x86/uaccess: Zero the 8-byte get_range case on failure on 32-bit While zeroing the upper 32 bits of an 8-byte getuser on 32-bit x86 was fixed by commit 8c860ed825cb ("x86/uaccess: Fix missed zeroing of ia32 u64 get_user() range checking") it was broken again in commit 8a2462df1547 ("x86/uaccess: Improve the 8-byte getuser() case"). This is because the register which holds the upper 32 bits (%ecx) is being cleared _after_ the check_range, so if the range check fails, %ecx is never cleared. This can be reproduced with: ./tools/testing/kunit/kunit.py run --arch i386 usercopy Instead, clear %ecx _before_ check_range in the 8-byte case. This reintroduces a bit of the ugliness we were trying to avoid by adding another #ifndef CONFIG_X86_64, but at least keeps check_range from needing a separate bad_get_user_8 jump. 
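A much-simplified C model of the 32-bit failure path (the real code is
assembly, and the helper names and TASK_SIZE_MAX value below are
illustrative): if the high half is cleared only after the range check, a
failed check returns with stale data in it.

#include <stdint.h>
#include <stdio.h>

#define EFAULT 14
/* Illustrative user/kernel split for a 32-bit kernel (an assumption). */
#define TASK_SIZE_MAX 0xC0000000u

static const uint64_t user_value = 0x1122334455667788ull;

/* Model of the regressed flow: the high half (%ecx) is cleared only after
 * the range check, so the failure path never clears it. */
static int get_user_8_buggy(uint32_t uaddr, uint32_t *lo, uint32_t *hi)
{
        if (uaddr > TASK_SIZE_MAX - sizeof(uint64_t)) {
                *lo = 0;        /* bad_get_user only zeroes %edx */
                return -EFAULT; /* *hi still holds stale data */
        }
        *hi = 0;
        *lo = (uint32_t)user_value;
        *hi = (uint32_t)(user_value >> 32);
        return 0;
}

/* Model of the fix: clear the high half before the range check. */
static int get_user_8_fixed(uint32_t uaddr, uint32_t *lo, uint32_t *hi)
{
        *hi = 0;
        if (uaddr > TASK_SIZE_MAX - sizeof(uint64_t)) {
                *lo = 0;
                return -EFAULT;
        }
        *lo = (uint32_t)user_value;
        *hi = (uint32_t)(user_value >> 32);
        return 0;
}

int main(void)
{
        uint32_t lo, hi;
        int ret;

        lo = hi = 0xdeadbeef;   /* pretend stale register contents */
        ret = get_user_8_buggy(0xfffffff0u, &lo, &hi);
        printf("buggy: ret=%d hi=%#x (stale)\n", ret, hi);

        lo = hi = 0xdeadbeef;
        ret = get_user_8_fixed(0xfffffff0u, &lo, &hi);
        printf("fixed: ret=%d hi=%#x (zeroed)\n", ret, hi);
        return 0;
}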
Fixes: 8a2462df1547 ("x86/uaccess: Improve the 8-byte getuser() case") Signed-off-by: David Gow Signed-off-by: Thomas Gleixner Acked-by: Linus Torvalds Link: https://lore.kernel.org/all/20240731073031.4045579-1-davidgow@google.com --- arch/x86/lib/getuser.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index a314622aa093..d066aecf8aeb 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -88,12 +88,14 @@ SYM_FUNC_END(__get_user_4) EXPORT_SYMBOL(__get_user_4) SYM_FUNC_START(__get_user_8) +#ifndef CONFIG_X86_64 + xor %ecx,%ecx +#endif check_range size=8 ASM_STAC #ifdef CONFIG_X86_64 UACCESS movq (%_ASM_AX),%rdx #else - xor %ecx,%ecx UACCESS movl (%_ASM_AX),%edx UACCESS movl 4(%_ASM_AX),%ecx #endif From 8aa37bde1a7b645816cda8b80df4753ecf172bf1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 Aug 2024 15:22:22 -0400 Subject: [PATCH 0441/1052] protect the fetch of ->fd[fd] in do_dup2() from mispredictions both callers have verified that fd is not greater than ->max_fds; however, misprediction might end up with tofree = fdt->fd[fd]; being speculatively executed. That's wrong for the same reasons why it's wrong in close_fd()/file_close_fd_locked(); the same solution applies - array_index_nospec(fd, fdt->max_fds) could differ from fd only in case of speculative execution on mispredicted path. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/file.c b/fs/file.c index a3b72aa64f11..a11e59b5d602 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1248,6 +1248,7 @@ __releases(&files->file_lock) * tables and this condition does not arise without those. */ fdt = files_fdtable(files); + fd = array_index_nospec(fd, fdt->max_fds); tofree = fdt->fd[fd]; if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; From d67c5649c1541dc93f202eeffc6f49220a4ed71d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 31 Jul 2024 13:05:53 +0200 Subject: [PATCH 0442/1052] mptcp: fully established after ADD_ADDR echo on MPJ Before this patch, receiving an ADD_ADDR echo on the just connected MP_JOIN subflow -- initiator side, after the MP_JOIN 3WHS -- was resulting in an MP_RESET. That's because only ACKs with a DSS or ADD_ADDRs without the echo bit were allowed. Not allowing the ADD_ADDR echo after an MP_CAPABLE 3WHS makes sense, as we are not supposed to send an ADD_ADDR before because it requires to be in full established mode first. For the MP_JOIN 3WHS, that's different: the ADD_ADDR can be sent on a previous subflow, and the ADD_ADDR echo can be received on the recently created one. The other peer will already be in fully established, so it is allowed to send that. We can then relax the conditions here to accept the ADD_ADDR echo for MPJ subflows. 
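A simplified model of the predicate change, using plain booleans rather than
the real option parsing; old_rule()/new_rule() are illustrative names only:

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the check in check_fully_established(): can this
 * packet complete the establishment of the subflow? */
static bool old_rule(bool dss_ack, bool add_addr, bool echo, bool mp_join)
{
        (void)mp_join;
        return dss_ack || (add_addr && !echo);
}

static bool new_rule(bool dss_ack, bool add_addr, bool echo, bool mp_join)
{
        return dss_ack || (add_addr && (!echo || mp_join));
}

int main(void)
{
        /* ADD_ADDR echo arriving as the 3rd ACK of an MP_JOIN subflow. */
        printf("old rule: %d -> subflow was reset\n",
               old_rule(false, true, true, true));
        printf("new rule: %d -> accepted\n",
               new_rule(false, true, true, true));
        return 0;
}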
Fixes: 67b12f792d5e ("mptcp: full fully established support after ADD_ADDR") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240731-upstream-net-20240731-mptcp-endp-subflow-signal-v1-1-c8a9b036493b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8a68382a4fe9..ac2f1a54cc43 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -958,7 +958,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, if (subflow->remote_key_valid && (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || - ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) { + ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && + (!mp_opt->echo || subflow->mp_join)))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ From 8af1f11865f259c882cce71d32f85ee9004e2660 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 31 Jul 2024 13:05:54 +0200 Subject: [PATCH 0443/1052] mptcp: pm: deny endp with signal + subflow + port As mentioned in the 'Fixes' commit, the port flag is only supported by the 'signal' flag, and not by the 'subflow' one. Then if both the 'signal' and 'subflow' flags are set, the problem is the same: the feature cannot work with the 'subflow' flag. Technically, if both the 'signal' and 'subflow' flags are set, it will be possible to create the listening socket, but not to establish a subflow using this source port. So better to explicitly deny it, not to create some confusions because the expected behaviour is not possible. Fixes: 09f12c3ab7a5 ("mptcp: allow to use port and non-signal in set_flags") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240731-upstream-net-20240731-mptcp-endp-subflow-signal-v1-2-c8a9b036493b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 37954a0b087d..c921d07e5940 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1328,8 +1328,8 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; - if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - GENL_SET_ERR_MSG(info, "flags must have signal when using port"); + if (addr.addr.port && !address_use_port(&addr)) { + GENL_SET_ERR_MSG(info, "flags must have signal and not subflow when using port"); return -EINVAL; } From c95eb32ced823a00be62202b43966b07b2f20b7f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 31 Jul 2024 13:05:55 +0200 Subject: [PATCH 0444/1052] mptcp: pm: reduce indentation blocks That will simplify the following commits. No functional changes intended. 
Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240731-upstream-net-20240731-mptcp-endp-subflow-signal-v1-3-c8a9b036493b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index c921d07e5940..780f4cca165c 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -567,16 +567,19 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) return; - if (local) { - if (mptcp_pm_alloc_anno_list(msk, &local->addr)) { - __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); - msk->pm.add_addr_signaled++; - mptcp_pm_announce_addr(msk, &local->addr, false); - mptcp_pm_nl_addr_send_ack(msk); - } - } + if (!local) + goto subflow; + + if (!mptcp_pm_alloc_anno_list(msk, &local->addr)) + goto subflow; + + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled++; + mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_nl_addr_send_ack(msk); } +subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && msk->pm.subflows < subflows_max) { From cd7c957f936f8cb80d03e5152f4013aae65bd986 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 31 Jul 2024 13:05:56 +0200 Subject: [PATCH 0445/1052] mptcp: pm: don't try to create sf if alloc failed It sounds better to avoid wasting cycles and / or put extreme memory pressure on the system by trying to create new subflows if it was not possible to add a new item in the announce list. While at it, a warning is now printed if the entry was already in the list as it should not happen with the in-kernel path-manager. With this PM, mptcp_pm_alloc_anno_list() should only fail in case of memory pressure. Fixes: b6c08380860b ("mptcp: remove addr and subflow in PM netlink") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240731-upstream-net-20240731-mptcp-endp-subflow-signal-v1-4-c8a9b036493b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 780f4cca165c..2be7af377cda 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -348,7 +348,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (add_entry) { - if (mptcp_pm_is_kernel(msk)) + if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) return false; sk_reset_timer(sk, &add_entry->add_timer, @@ -555,8 +555,6 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { - local = select_signal_address(pernet, msk); - /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -567,11 +565,15 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) return; + local = select_signal_address(pernet, msk); if (!local) goto subflow; + /* If the alloc fails, we are on memory pressure, not worth + * continuing, and trying to create subflows. 
+ */ if (!mptcp_pm_alloc_anno_list(msk, &local->addr)) - goto subflow; + return; __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; From 85df533a787bf07bf4367ce2a02b822ff1fba1a3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 31 Jul 2024 13:05:57 +0200 Subject: [PATCH 0446/1052] mptcp: pm: do not ignore 'subflow' if 'signal' flag is also set Up to the 'Fixes' commit, having an endpoint with both the 'signal' and 'subflow' flags, resulted in the creation of a subflow and an address announcement using the address linked to this endpoint. After this commit, only the address announcement was done, ignoring the 'subflow' flag. That's because the same bitmap is used for the two flags. It is OK to keep this single bitmap, the already selected local endpoint simply have to be re-used, but not via select_local_address() not to look at the just modified bitmap. Note that it is unusual to set the two flags together: creating a new subflow using a new local address will implicitly advertise it to the other peer. So in theory, no need to advertise it explicitly as well. Maybe there are use-cases -- the subflow might not reach the other peer that way, we can ask the other peer to try initiating the new subflow without delay -- or very likely the user is confused, and put both flags "just to be sure at least the right one is set". Still, if it is allowed, the kernel should do what has been asked: using this endpoint to announce the address and to create a new subflow from it. An alternative is to forbid the use of the two flags together, but that's probably too late, there are maybe use-cases, and it was working before. This patch will avoid people complaining subflows are not created using the endpoint they added with the 'subflow' and 'signal' flag. Note that with the current patch, the subflow might not be created in some corner cases, e.g. if the 'subflows' limit was reached when sending the ADD_ADDR, but changed later on. It is probably not worth splitting id_avail_bitmap per target ('signal', 'subflow'), which will add another large field to the msk "just" to track (again) endpoints. Anyway, currently when the limits are changed, the kernel doesn't check if new subflows can be created or removed, because we would need to keep track of the received ADD_ADDR, and more. It sounds OK to assume that the limits should be properly configured before establishing new connections. 
Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240731-upstream-net-20240731-mptcp-endp-subflow-signal-v1-5-c8a9b036493b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2be7af377cda..4cae2aa7be5c 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -512,8 +512,8 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { + struct mptcp_pm_addr_entry *local, *signal_and_subflow = NULL; struct sock *sk = (struct sock *)msk; - struct mptcp_pm_addr_entry *local; unsigned int add_addr_signal_max; unsigned int local_addr_max; struct pm_nl_pernet *pernet; @@ -579,6 +579,9 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &local->addr, false); mptcp_pm_nl_addr_send_ack(msk); + + if (local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + signal_and_subflow = local; } subflow: @@ -589,9 +592,14 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) bool fullmesh; int i, nr; - local = select_local_address(pernet, msk); - if (!local) - break; + if (signal_and_subflow) { + local = signal_and_subflow; + signal_and_subflow = NULL; + } else { + local = select_local_address(pernet, msk); + if (!local) + break; + } fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); From bec1f3b119ebc613d08dfbcdbaef01a79aa7de92 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 31 Jul 2024 13:05:58 +0200 Subject: [PATCH 0447/1052] selftests: mptcp: join: ability to invert ADD_ADDR check In the following commit, the client will initiate the ADD_ADDR, instead of the server. We need to way to verify the ADD_ADDR have been correctly sent. Note: the default expected counters for when the port number is given are never changed by the caller, no need to accept them as parameter then. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. 
Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240731-upstream-net-20240731-mptcp-endp-subflow-signal-v1-6-c8a9b036493b@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 4df48f1f14ab..52a25ac43d10 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1415,18 +1415,28 @@ chk_add_nr() local add_nr=$1 local echo_nr=$2 local port_nr=${3:-0} - local syn_nr=${4:-$port_nr} - local syn_ack_nr=${5:-$port_nr} - local ack_nr=${6:-$port_nr} - local mis_syn_nr=${7:-0} - local mis_ack_nr=${8:-0} + local ns_invert=${4:-""} + local syn_nr=$port_nr + local syn_ack_nr=$port_nr + local ack_nr=$port_nr + local mis_syn_nr=0 + local mis_ack_nr=0 + local ns_tx=$ns1 + local ns_rx=$ns2 + local extra_msg="" local count local timeout - timeout=$(ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout) + if [[ $ns_invert = "invert" ]]; then + ns_tx=$ns2 + ns_rx=$ns1 + extra_msg="invert" + fi + + timeout=$(ip netns exec ${ns_tx} sysctl -n net.mptcp.add_addr_timeout) print_check "add" - count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtAddAddr") + count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtAddAddr") if [ -z "$count" ]; then print_skip # if the test configured a short timeout tolerate greater then expected @@ -1438,7 +1448,7 @@ chk_add_nr() fi print_check "echo" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtEchoAdd") + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtEchoAdd") if [ -z "$count" ]; then print_skip elif [ "$count" != "$echo_nr" ]; then @@ -1449,7 +1459,7 @@ chk_add_nr() if [ $port_nr -gt 0 ]; then print_check "pt" - count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtPortAdd") + count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtPortAdd") if [ -z "$count" ]; then print_skip elif [ "$count" != "$port_nr" ]; then @@ -1459,7 +1469,7 @@ chk_add_nr() fi print_check "syn" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinPortSynRx") + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortSynRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$syn_nr" ]; then @@ -1470,7 +1480,7 @@ chk_add_nr() fi print_check "synack" - count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinPortSynAckRx") + count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPJoinPortSynAckRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$syn_ack_nr" ]; then @@ -1481,7 +1491,7 @@ chk_add_nr() fi print_check "ack" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinPortAckRx") + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortAckRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$ack_nr" ]; then @@ -1492,7 +1502,7 @@ chk_add_nr() fi print_check "syn" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMismatchPortSynRx") + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortSynRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$mis_syn_nr" ]; then @@ -1503,7 +1513,7 @@ chk_add_nr() fi print_check "ack" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMismatchPortAckRx") + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortAckRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$mis_ack_nr" ]; then @@ -1513,6 +1523,8 @@ chk_add_nr() print_ok fi fi + + 
print_info "$extra_msg" } chk_add_tx_nr() From 4d2868b5d191c74262f7407972d68d1bf3245d6a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 31 Jul 2024 13:05:59 +0200 Subject: [PATCH 0448/1052] selftests: mptcp: join: test both signal & subflow It should be quite uncommon to set both the subflow and the signal flags: the initiator of the connection is typically the one creating new subflows, not the other peer, then no need to announce additional local addresses, and use it to create subflows. But some people might be confused about the flags, and set both "just to be sure at least the right one is set". To verify the previous fix, and avoid future regressions, this specific case is now validated: the client announces a new address, and initiates a new subflow from the same address. While working on this, another bug has been noticed, where the client reset the new subflow because an ADD_ADDR echo got received as the 3rd ACK: this new test also explicitly checks that no RST have been sent by the client and server. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240731-upstream-net-20240731-mptcp-endp-subflow-signal-v1-7-c8a9b036493b@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 52a25ac43d10..9ea6d698e9d3 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1989,6 +1989,21 @@ signal_address_tests() chk_add_nr 1 1 fi + # uncommon: subflow and signal flags on the same endpoint + # or because the user wrongly picked both, but still expects the client + # to create additional subflows + if reset "subflow and signal together"; then + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 0 2 + pm_nl_add_endpoint $ns2 10.0.3.2 flags signal,subflow + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 1 1 1 + chk_add_nr 1 1 0 invert # only initiated by ns2 + chk_add_nr 0 0 0 # none initiated by ns1 + chk_rst_nr 0 0 invert # no RST sent by the client + chk_rst_nr 0 0 # no RST sent by the server + fi + # accept and use add_addr with additional subflows if reset "multiple subflows and signal"; then pm_nl_set_limits $ns1 0 3 From ab9fd06cb8f0db0854291833fc40c789e43a361f Mon Sep 17 00:00:00 2001 From: Vamshi Gajjela Date: Wed, 24 Jul 2024 19:21:26 +0530 Subject: [PATCH 0449/1052] scsi: ufs: core: Fix hba->last_dme_cmd_tstamp timestamp updating logic The ufshcd_add_delay_before_dme_cmd() always introduces a delay of MIN_DELAY_BEFORE_DME_CMDS_US between DME commands even when it's not required. The delay is added when the UFS host controller supplies the quirk UFSHCD_QUIRK_DELAY_BEFORE_DME_CMDS. Fix the logic to update hba->last_dme_cmd_tstamp to ensure subsequent DME commands have the correct delay in the range of 0 to MIN_DELAY_BEFORE_DME_CMDS_US. Update the timestamp at the end of the function to ensure it captures the latest time after any necessary delay has been applied. 
Signed-off-by: Vamshi Gajjela Link: https://lore.kernel.org/r/20240724135126.1786126-1-vamshigajjela@google.com Fixes: cad2e03d8607 ("ufs: add support to allow non standard behaviours (quirks)") Cc: stable@vger.kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 5e3c67e96956..0b3d0c8e0dda 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -4100,11 +4100,16 @@ static inline void ufshcd_add_delay_before_dme_cmd(struct ufs_hba *hba) min_sleep_time_us = MIN_DELAY_BEFORE_DME_CMDS_US - delta; else - return; /* no more delay required */ + min_sleep_time_us = 0; /* no more delay required */ } - /* allow sleep for extra 50us if needed */ - usleep_range(min_sleep_time_us, min_sleep_time_us + 50); + if (min_sleep_time_us > 0) { + /* allow sleep for extra 50us if needed */ + usleep_range(min_sleep_time_us, min_sleep_time_us + 50); + } + + /* update the last_dme_cmd_tstamp */ + hba->last_dme_cmd_tstamp = ktime_get(); } /** From ffed586b8c4f1fdb772ee350e229863f145defb5 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 1 Aug 2024 14:42:34 +0900 Subject: [PATCH 0450/1052] scsi: sd: Move sd_read_cpr() out of the q->limits_lock region Commit 804e498e0496 ("sd: convert to the atomic queue limits API") introduced pairs of function calls to queue_limits_start_update() and queue_limits_commit_update(). These two functions lock and unlock q->limits_lock. In sd_revalidate_disk(), sd_read_cpr() is called after queue_limits_start_update() call and before queue_limits_commit_update() call. sd_read_cpr() locks q->sysfs_dir_lock and &q->sysfs_lock. Then new lock dependencies were created between q->limits_lock, q->sysfs_dir_lock and q->sysfs_lock, as follows: sd_revalidate_disk queue_limits_start_update mutex_lock(&q->limits_lock) sd_read_cpr disk_set_independent_access_ranges mutex_lock(&q->sysfs_dir_lock) mutex_lock(&q->sysfs_lock) mutex_unlock(&q->sysfs_lock) mutex_unlock(&q->sysfs_dir_lock) queue_limits_commit_update mutex_unlock(&q->limits_lock) However, the three locks already had reversed dependencies in other places. Then the new dependencies triggered the lockdep WARN "possible circular locking dependency detected" [1]. This WARN was observed by running the blktests test case srp/002. To avoid the WARN, move the sd_read_cpr() call in sd_revalidate_disk() after the queue_limits_commit_update() call. In other words, move the sd_read_cpr() call out of the q->limits_lock region. [1] https://lore.kernel.org/linux-scsi/vlmv53ni3ltwxplig5qnw4xsl2h6ccxijfbqzekx76vxoim5a5@dekv7q3es3tx/ Fixes: 804e498e0496 ("sd: convert to the atomic queue limits API") Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20240801054234.540532-1-shinichiro.kawasaki@wdc.com Tested-by: Luca Coelho Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sd.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 8bb3a3611851..718eb91ba9a5 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3753,7 +3753,6 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_block_limits_ext(sdkp); sd_read_block_characteristics(sdkp, &lim); sd_zbc_read_zones(sdkp, &lim, buffer); - sd_read_cpr(sdkp); } sd_print_capacity(sdkp, old_capacity); @@ -3808,6 +3807,14 @@ static int sd_revalidate_disk(struct gendisk *disk) if (err) return err; + /* + * Query concurrent positioning ranges after + * queue_limits_commit_update() unlocked q->limits_lock to avoid + * deadlock with q->sysfs_dir_lock and q->sysfs_lock. + */ + if (sdkp->media_present && scsi_device_supports_vpd(sdp)) + sd_read_cpr(sdkp); + /* * For a zoned drive, revalidating the zones can be done only once * the gendisk capacity is set. So if this fails, set back the gendisk From 27ce65f65258cf2f2855162cbeef59659a81fac4 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 2 Aug 2024 14:38:28 +1000 Subject: [PATCH 0451/1052] Revert "nouveau: rip out busy fence waits" This reverts commit d45bb9c5f7a6f7b6e47939856b28cb1da0cdc119. Just got a report that this causes some suspend/resume issues, so back it out and I'll investigate it later. Reported-by: Mike Galbraith Signed-off-by: Dave Airlie --- drivers/gpu/drm/nouveau/nouveau_bo.c | 2 +- drivers/gpu/drm/nouveau/nouveau_chan.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- drivers/gpu/drm/nouveau/nouveau_fence.c | 30 ++++++++++++++++++++++++- drivers/gpu/drm/nouveau/nouveau_fence.h | 2 +- drivers/gpu/drm/nouveau/nouveau_gem.c | 2 +- 6 files changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 0712d0b15170..70fb003a6666 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -898,7 +898,7 @@ nouveau_bo_move_m2mf(struct ttm_buffer_object *bo, int evict, * Without this the operation can timeout and we'll fallback to a * software copy, which might take several minutes to finish. 
*/ - nouveau_fence_wait(fence, false); + nouveau_fence_wait(fence, false, false); ret = ttm_bo_move_accel_cleanup(bo, &fence->base, evict, false, new_reg); nouveau_fence_unref(&fence); diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c index 66fca95c10c7..7c97b2886807 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.c +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c @@ -72,7 +72,7 @@ nouveau_channel_idle(struct nouveau_channel *chan) ret = nouveau_fence_new(&fence, chan); if (!ret) { - ret = nouveau_fence_wait(fence, false); + ret = nouveau_fence_wait(fence, false, false); nouveau_fence_unref(&fence); } diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 6719353e2e13..6fb65b01d778 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -128,7 +128,7 @@ static void nouveau_dmem_page_free(struct page *page) static void nouveau_dmem_fence_done(struct nouveau_fence **fence) { if (fence) { - nouveau_fence_wait(*fence, false); + nouveau_fence_wait(*fence, true, false); nouveau_fence_unref(fence); } else { /* diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c index ba469767a20f..93f08f9479d8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c @@ -311,11 +311,39 @@ nouveau_fence_wait_legacy(struct dma_fence *f, bool intr, long wait) return timeout - t; } +static int +nouveau_fence_wait_busy(struct nouveau_fence *fence, bool intr) +{ + int ret = 0; + + while (!nouveau_fence_done(fence)) { + if (time_after_eq(jiffies, fence->timeout)) { + ret = -EBUSY; + break; + } + + __set_current_state(intr ? + TASK_INTERRUPTIBLE : + TASK_UNINTERRUPTIBLE); + + if (intr && signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + } + + __set_current_state(TASK_RUNNING); + return ret; +} + int -nouveau_fence_wait(struct nouveau_fence *fence, bool intr) +nouveau_fence_wait(struct nouveau_fence *fence, bool lazy, bool intr) { long ret; + if (!lazy) + return nouveau_fence_wait_busy(fence, intr); + ret = dma_fence_wait_timeout(&fence->base, intr, 15 * HZ); if (ret < 0) return ret; diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h index 1b63197b744a..8bc065acfe35 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.h +++ b/drivers/gpu/drm/nouveau/nouveau_fence.h @@ -23,7 +23,7 @@ void nouveau_fence_unref(struct nouveau_fence **); int nouveau_fence_emit(struct nouveau_fence *); bool nouveau_fence_done(struct nouveau_fence *); -int nouveau_fence_wait(struct nouveau_fence *, bool intr); +int nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr); int nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr); struct nouveau_fence_chan { diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 2e535caa7d6e..5a887d67dc0e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -928,7 +928,7 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void *data, } if (sync) { - if (!(ret = nouveau_fence_wait(fence, false))) { + if (!(ret = nouveau_fence_wait(fence, false, false))) { if ((ret = dma_fence_get_status(&fence->base)) == 1) ret = 0; } From ab3de2c7ec91db6a3cf5fc07765852c81ca7d6ef Mon Sep 17 00:00:00 2001 From: Aapo Vienamo Date: Thu, 20 Jun 2024 13:43:03 +0300 Subject: [PATCH 0452/1052] thunderbolt: Fix memory leaks in 
{port|retimer}_sb_regs_write() Add missing free_page() call for the memory allocated by validate_and_copy_from_user(). Fixes: 6d241fa00159 ("thunderbolt: Add sideband register access to debugfs") Signed-off-by: Aapo Vienamo Reviewed-by: Przemek Kitszel Signed-off-by: Mika Westerberg --- drivers/thunderbolt/debugfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/thunderbolt/debugfs.c b/drivers/thunderbolt/debugfs.c index 11185cc1db92..9ed4bb2e8d05 100644 --- a/drivers/thunderbolt/debugfs.c +++ b/drivers/thunderbolt/debugfs.c @@ -323,16 +323,17 @@ static ssize_t port_sb_regs_write(struct file *file, const char __user *user_buf if (mutex_lock_interruptible(&tb->lock)) { ret = -ERESTARTSYS; - goto out_rpm_put; + goto out; } ret = sb_regs_write(port, port_sb_regs, ARRAY_SIZE(port_sb_regs), USB4_SB_TARGET_ROUTER, 0, buf, count, ppos); mutex_unlock(&tb->lock); -out_rpm_put: +out: pm_runtime_mark_last_busy(&sw->dev); pm_runtime_put_autosuspend(&sw->dev); + free_page((unsigned long)buf); return ret < 0 ? ret : count; } @@ -355,16 +356,17 @@ static ssize_t retimer_sb_regs_write(struct file *file, if (mutex_lock_interruptible(&tb->lock)) { ret = -ERESTARTSYS; - goto out_rpm_put; + goto out; } ret = sb_regs_write(rt->port, retimer_sb_regs, ARRAY_SIZE(retimer_sb_regs), USB4_SB_TARGET_RETIMER, rt->index, buf, count, ppos); mutex_unlock(&tb->lock); -out_rpm_put: +out: pm_runtime_mark_last_busy(&rt->dev); pm_runtime_put_autosuspend(&rt->dev); + free_page((unsigned long)buf); return ret < 0 ? ret : count; } From 113fd6372a5bb3689aba8ef5b8a265ed1529a78f Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Fri, 2 Aug 2024 12:47:36 +0800 Subject: [PATCH 0453/1052] drm/client: fix null pointer dereference in drm_client_modeset_probe In drm_client_modeset_probe(), the return value of drm_mode_duplicate() is assigned to modeset->mode, which will lead to a possible NULL pointer dereference on failure of drm_mode_duplicate(). Add a check to avoid npd. Cc: stable@vger.kernel.org Fixes: cf13909aee05 ("drm/fb-helper: Move out modeset config code") Signed-off-by: Ma Ke Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20240802044736.1570345-1-make24@iscas.ac.cn --- drivers/gpu/drm/drm_client_modeset.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c index 31af5cf37a09..cee5eafbfb81 100644 --- a/drivers/gpu/drm/drm_client_modeset.c +++ b/drivers/gpu/drm/drm_client_modeset.c @@ -880,6 +880,11 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, kfree(modeset->mode); modeset->mode = drm_mode_duplicate(dev, mode); + if (!modeset->mode) { + ret = -ENOMEM; + break; + } + drm_connector_get(connector); modeset->connectors[modeset->num_connectors++] = connector; modeset->x = offset->x; From 62b45bab010d1b0cea6166f818f1cd0666a6d8d8 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 15 Jul 2024 18:35:51 +1000 Subject: [PATCH 0454/1052] drm/test: fix the gem shmem test to map the sg table. The test here creates an sg table, but never maps it, when we get to drm_gem_shmem_free, the helper tries to unmap and this causes warnings on some platforms and debug kernels. This also sets a 64-bit dma mask, as I see an swiotlb warning if I stick with the default 32-bit one. Fixes: 93032ae634d4 ("drm/test: add a test suite for GEM objects backed by shmem") Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie Reviewed-by: Michael J. 
Ruhl Acked-by: Daniel Vetter Reviewed-by: Marco Pagani Link: https://patchwork.freedesktop.org/patch/msgid/20240715083551.777807-1-airlied@gmail.com Signed-off-by: Maxime Ripard --- drivers/gpu/drm/tests/drm_gem_shmem_test.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_gem_shmem_test.c b/drivers/gpu/drm/tests/drm_gem_shmem_test.c index c3758faa1b83..d8d0e4d1682f 100644 --- a/drivers/gpu/drm/tests/drm_gem_shmem_test.c +++ b/drivers/gpu/drm/tests/drm_gem_shmem_test.c @@ -102,6 +102,17 @@ static void drm_gem_shmem_test_obj_create_private(struct kunit *test) sg_init_one(sgt->sgl, buf, TEST_SIZE); + /* + * Set the DMA mask to 64-bits and map the sgtables + * otherwise drm_gem_shmem_free will cause a warning + * on debug kernels. + */ + ret = dma_set_mask(drm_dev->dev, DMA_BIT_MASK(64)); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = dma_map_sgtable(drm_dev->dev, sgt, DMA_BIDIRECTIONAL, 0); + KUNIT_ASSERT_EQ(test, ret, 0); + /* Init a mock DMA-BUF */ buf_mock.size = TEST_SIZE; attach_mock.dmabuf = &buf_mock; From d1aa95e86f178dc597e80228cd9bd81fc3510f34 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Thu, 25 Jul 2024 10:31:25 +1200 Subject: [PATCH 0455/1052] hid-asus: add ROG Ally X prod ID to quirk list The new ASUS ROG Ally X functions almost exactly the same as the previous model, so we can use the same quirks. Signed-off-by: Luke D. Jones Signed-off-by: Jiri Kosina --- drivers/hid/hid-asus.c | 3 +++ drivers/hid/hid-ids.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index 37e6d25593c2..a282388b7aa5 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -1248,6 +1248,9 @@ static const struct hid_device_id asus_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY), QUIRK_USE_KBD_BACKLIGHT | QUIRK_ROG_NKEY_KEYBOARD }, + { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, + USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X), + QUIRK_USE_KBD_BACKLIGHT | QUIRK_ROG_NKEY_KEYBOARD }, { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD), QUIRK_ROG_CLAYMORE_II_KEYBOARD }, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 72d56ee7ce1b..6e3223389080 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -210,6 +210,7 @@ #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD3 0x1a30 #define USB_DEVICE_ID_ASUSTEK_ROG_Z13_LIGHTBAR 0x18c6 #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY 0x1abe +#define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X 0x1b4c #define USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD 0x196b #define USB_DEVICE_ID_ASUSTEK_FX503VD_KEYBOARD 0x1869 From 97155021ae17b86985121b33cf8098bcde00d497 Mon Sep 17 00:00:00 2001 From: Olivier Sobrie Date: Tue, 23 Jul 2024 10:44:35 +0200 Subject: [PATCH 0456/1052] HID: amd_sfh: free driver_data after destroying hid device HID driver callbacks aren't called anymore once hid_destroy_device() has been called. Hence, hid driver_data should be freed only after the hid_destroy_device() function returned as driver_data is used in several callbacks. 
I observed a crash with kernel 6.10.0 on my T14s Gen 3, after enabling KASAN to debug memory allocation, I got this output: [ 13.050438] ================================================================== [ 13.054060] BUG: KASAN: slab-use-after-free in amd_sfh_get_report+0x3ec/0x530 [amd_sfh] [ 13.054809] psmouse serio1: trackpoint: Synaptics TrackPoint firmware: 0x02, buttons: 3/3 [ 13.056432] Read of size 8 at addr ffff88813152f408 by task (udev-worker)/479 [ 13.060970] CPU: 5 PID: 479 Comm: (udev-worker) Not tainted 6.10.0-arch1-2 #1 893bb55d7f0073f25c46adbb49eb3785fefd74b0 [ 13.063978] Hardware name: LENOVO 21CQCTO1WW/21CQCTO1WW, BIOS R22ET70W (1.40 ) 03/21/2024 [ 13.067860] Call Trace: [ 13.069383] input: TPPS/2 Synaptics TrackPoint as /devices/platform/i8042/serio1/input/input8 [ 13.071486] [ 13.071492] dump_stack_lvl+0x5d/0x80 [ 13.074870] snd_hda_intel 0000:33:00.6: enabling device (0000 -> 0002) [ 13.078296] ? amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.082199] print_report+0x174/0x505 [ 13.085776] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 13.089367] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.093255] ? amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.097464] kasan_report+0xc8/0x150 [ 13.101461] ? amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.105802] amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.110303] amdtp_hid_request+0xb8/0x110 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.114879] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.119450] sensor_hub_get_feature+0x1d3/0x540 [hid_sensor_hub 3f13be3016ff415bea03008d45d99da837ee3082] [ 13.124097] hid_sensor_parse_common_attributes+0x4d0/0xad0 [hid_sensor_iio_common c3a5cbe93969c28b122609768bbe23efe52eb8f5] [ 13.127404] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.131925] ? __pfx_hid_sensor_parse_common_attributes+0x10/0x10 [hid_sensor_iio_common c3a5cbe93969c28b122609768bbe23efe52eb8f5] [ 13.136455] ? _raw_spin_lock_irqsave+0x96/0xf0 [ 13.140197] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 13.143602] ? devm_iio_device_alloc+0x34/0x50 [industrialio 3d261d5e5765625d2b052be40e526d62b1d2123b] [ 13.147234] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.150446] ? __devm_add_action+0x167/0x1d0 [ 13.155061] hid_gyro_3d_probe+0x120/0x7f0 [hid_sensor_gyro_3d 63da36a143b775846ab2dbb86c343b401b5e3172] [ 13.158581] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.161814] platform_probe+0xa2/0x150 [ 13.165029] really_probe+0x1e3/0x8a0 [ 13.168243] __driver_probe_device+0x18c/0x370 [ 13.171500] driver_probe_device+0x4a/0x120 [ 13.175000] __driver_attach+0x190/0x4a0 [ 13.178521] ? __pfx___driver_attach+0x10/0x10 [ 13.181771] bus_for_each_dev+0x106/0x180 [ 13.185033] ? __pfx__raw_spin_lock+0x10/0x10 [ 13.188229] ? __pfx_bus_for_each_dev+0x10/0x10 [ 13.191446] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.194382] bus_add_driver+0x29e/0x4d0 [ 13.197328] driver_register+0x1a5/0x360 [ 13.200283] ? __pfx_hid_gyro_3d_platform_driver_init+0x10/0x10 [hid_sensor_gyro_3d 63da36a143b775846ab2dbb86c343b401b5e3172] [ 13.203362] do_one_initcall+0xa7/0x380 [ 13.206432] ? __pfx_do_one_initcall+0x10/0x10 [ 13.210175] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.213211] ? kasan_unpoison+0x44/0x70 [ 13.216688] do_init_module+0x238/0x750 [ 13.219696] load_module+0x5011/0x6af0 [ 13.223096] ? kasan_save_stack+0x30/0x50 [ 13.226743] ? kasan_save_track+0x14/0x30 [ 13.230080] ? kasan_save_free_info+0x3b/0x60 [ 13.233323] ? 
poison_slab_object+0x109/0x180 [ 13.236778] ? __pfx_load_module+0x10/0x10 [ 13.239703] ? poison_slab_object+0x109/0x180 [ 13.243070] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.245924] ? init_module_from_file+0x13d/0x150 [ 13.248745] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.251503] ? init_module_from_file+0xdf/0x150 [ 13.254198] init_module_from_file+0xdf/0x150 [ 13.256826] ? __pfx_init_module_from_file+0x10/0x10 [ 13.259428] ? kasan_save_track+0x14/0x30 [ 13.261959] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.264471] ? kasan_save_free_info+0x3b/0x60 [ 13.267026] ? poison_slab_object+0x109/0x180 [ 13.269494] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.271949] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.274324] ? _raw_spin_lock+0x85/0xe0 [ 13.276671] ? __pfx__raw_spin_lock+0x10/0x10 [ 13.278963] ? __rseq_handle_notify_resume+0x1a6/0xad0 [ 13.281193] idempotent_init_module+0x23b/0x650 [ 13.283420] ? __pfx_idempotent_init_module+0x10/0x10 [ 13.285619] ? __pfx___seccomp_filter+0x10/0x10 [ 13.287714] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.289828] ? __fget_light+0x57/0x420 [ 13.291870] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.293880] ? security_capable+0x74/0xb0 [ 13.295820] __x64_sys_finit_module+0xbe/0x130 [ 13.297874] do_syscall_64+0x82/0x190 [ 13.299898] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.301905] ? irqtime_account_irq+0x3d/0x1f0 [ 13.303877] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.305753] ? __irq_exit_rcu+0x4e/0x130 [ 13.307577] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.309489] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 13.311371] RIP: 0033:0x7a21f96ade9d [ 13.313234] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 63 de 0c 00 f7 d8 64 89 01 48 [ 13.317051] RSP: 002b:00007ffeae934e78 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 13.319024] RAX: ffffffffffffffda RBX: 00005987276bfcf0 RCX: 00007a21f96ade9d [ 13.321100] RDX: 0000000000000004 RSI: 00007a21f8eda376 RDI: 000000000000001c [ 13.323314] RBP: 00007a21f8eda376 R08: 0000000000000001 R09: 00007ffeae934ec0 [ 13.325505] R10: 0000000000000050 R11: 0000000000000246 R12: 0000000000020000 [ 13.327637] R13: 00005987276c1250 R14: 0000000000000000 R15: 00005987276c4530 [ 13.329737] [ 13.333945] Allocated by task 139: [ 13.336111] kasan_save_stack+0x30/0x50 [ 13.336121] kasan_save_track+0x14/0x30 [ 13.336125] __kasan_kmalloc+0xaa/0xb0 [ 13.336129] amdtp_hid_probe+0xb1/0x440 [amd_sfh] [ 13.336138] amd_sfh_hid_client_init+0xb8a/0x10f0 [amd_sfh] [ 13.336144] sfh_init_work+0x47/0x120 [amd_sfh] [ 13.336150] process_one_work+0x673/0xeb0 [ 13.336155] worker_thread+0x795/0x1250 [ 13.336160] kthread+0x290/0x350 [ 13.336164] ret_from_fork+0x34/0x70 [ 13.336169] ret_from_fork_asm+0x1a/0x30 [ 13.338175] Freed by task 139: [ 13.340064] kasan_save_stack+0x30/0x50 [ 13.340072] kasan_save_track+0x14/0x30 [ 13.340076] kasan_save_free_info+0x3b/0x60 [ 13.340081] poison_slab_object+0x109/0x180 [ 13.340085] __kasan_slab_free+0x32/0x50 [ 13.340089] kfree+0xe5/0x310 [ 13.340094] amdtp_hid_remove+0xb2/0x160 [amd_sfh] [ 13.340102] amd_sfh_hid_client_deinit+0x324/0x640 [amd_sfh] [ 13.340107] amd_sfh_hid_client_init+0x94a/0x10f0 [amd_sfh] [ 13.340113] sfh_init_work+0x47/0x120 [amd_sfh] [ 13.340118] process_one_work+0x673/0xeb0 [ 13.340123] worker_thread+0x795/0x1250 [ 13.340127] kthread+0x290/0x350 [ 13.340132] ret_from_fork+0x34/0x70 [ 13.340136] ret_from_fork_asm+0x1a/0x30 [ 13.342482] The buggy address belongs to the object at 
ffff88813152f400 which belongs to the cache kmalloc-64 of size 64 [ 13.347357] The buggy address is located 8 bytes inside of freed 64-byte region [ffff88813152f400, ffff88813152f440) [ 13.347367] The buggy address belongs to the physical page: [ 13.355409] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x13152f [ 13.355416] anon flags: 0x2ffff8000000000(node=0|zone=2|lastcpupid=0x1ffff) [ 13.355423] page_type: 0xffffefff(slab) [ 13.355429] raw: 02ffff8000000000 ffff8881000428c0 ffffea0004c43a00 0000000000000005 [ 13.355435] raw: 0000000000000000 0000000000200020 00000001ffffefff 0000000000000000 [ 13.355439] page dumped because: kasan: bad access detected [ 13.357295] Memory state around the buggy address: [ 13.357299] ffff88813152f300: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 13.357303] ffff88813152f380: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 13.357306] >ffff88813152f400: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 13.357309] ^ [ 13.357311] ffff88813152f480: 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc [ 13.357315] ffff88813152f500: 00 00 00 00 00 00 00 06 fc fc fc fc fc fc fc fc [ 13.357318] ================================================================== [ 13.357405] Disabling lock debugging due to kernel taint [ 13.383534] Oops: general protection fault, probably for non-canonical address 0xe0a1bc4140000013: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 13.383544] KASAN: maybe wild-memory-access in range [0x050e020a00000098-0x050e020a0000009f] [ 13.383551] CPU: 3 PID: 479 Comm: (udev-worker) Tainted: G B 6.10.0-arch1-2 #1 893bb55d7f0073f25c46adbb49eb3785fefd74b0 [ 13.383561] Hardware name: LENOVO 21CQCTO1WW/21CQCTO1WW, BIOS R22ET70W (1.40 ) 03/21/2024 [ 13.383565] RIP: 0010:amd_sfh_get_report+0x81/0x530 [amd_sfh] [ 13.383580] Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 78 03 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b 63 08 49 8d 7c 24 10 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 1a 03 00 00 45 8b 74 24 10 45 [ 13.383585] RSP: 0018:ffff8881261f7388 EFLAGS: 00010212 [ 13.383592] RAX: dffffc0000000000 RBX: ffff88813152f400 RCX: 0000000000000002 [ 13.383597] RDX: 00a1c04140000013 RSI: 0000000000000008 RDI: 050e020a0000009b [ 13.383600] RBP: ffff88814d010000 R08: 0000000000000002 R09: fffffbfff3ddb8c0 [ 13.383604] R10: ffffffff9eedc607 R11: ffff88810ce98000 R12: 050e020a0000008b [ 13.383607] R13: ffff88814d010000 R14: dffffc0000000000 R15: 0000000000000004 [ 13.383611] FS: 00007a21f94d0880(0000) GS:ffff8887e7d80000(0000) knlGS:0000000000000000 [ 13.383615] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 13.383618] CR2: 00007e0014c438f0 CR3: 000000012614c000 CR4: 0000000000f50ef0 [ 13.383622] PKRU: 55555554 [ 13.383625] Call Trace: [ 13.383629] [ 13.383632] ? __die_body.cold+0x19/0x27 [ 13.383644] ? die_addr+0x46/0x70 [ 13.383652] ? exc_general_protection+0x150/0x240 [ 13.383664] ? asm_exc_general_protection+0x26/0x30 [ 13.383674] ? amd_sfh_get_report+0x81/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.383686] ? amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.383697] amdtp_hid_request+0xb8/0x110 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.383706] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383713] sensor_hub_get_feature+0x1d3/0x540 [hid_sensor_hub 3f13be3016ff415bea03008d45d99da837ee3082] [ 13.383727] hid_sensor_parse_common_attributes+0x4d0/0xad0 [hid_sensor_iio_common c3a5cbe93969c28b122609768bbe23efe52eb8f5] [ 13.383739] ? 
srso_alias_return_thunk+0x5/0xfbef5 [ 13.383745] ? __pfx_hid_sensor_parse_common_attributes+0x10/0x10 [hid_sensor_iio_common c3a5cbe93969c28b122609768bbe23efe52eb8f5] [ 13.383753] ? _raw_spin_lock_irqsave+0x96/0xf0 [ 13.383762] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 13.383768] ? devm_iio_device_alloc+0x34/0x50 [industrialio 3d261d5e5765625d2b052be40e526d62b1d2123b] [ 13.383790] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383795] ? __devm_add_action+0x167/0x1d0 [ 13.383806] hid_gyro_3d_probe+0x120/0x7f0 [hid_sensor_gyro_3d 63da36a143b775846ab2dbb86c343b401b5e3172] [ 13.383818] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383826] platform_probe+0xa2/0x150 [ 13.383832] really_probe+0x1e3/0x8a0 [ 13.383838] __driver_probe_device+0x18c/0x370 [ 13.383844] driver_probe_device+0x4a/0x120 [ 13.383851] __driver_attach+0x190/0x4a0 [ 13.383857] ? __pfx___driver_attach+0x10/0x10 [ 13.383863] bus_for_each_dev+0x106/0x180 [ 13.383868] ? __pfx__raw_spin_lock+0x10/0x10 [ 13.383874] ? __pfx_bus_for_each_dev+0x10/0x10 [ 13.383880] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383887] bus_add_driver+0x29e/0x4d0 [ 13.383895] driver_register+0x1a5/0x360 [ 13.383902] ? __pfx_hid_gyro_3d_platform_driver_init+0x10/0x10 [hid_sensor_gyro_3d 63da36a143b775846ab2dbb86c343b401b5e3172] [ 13.383910] do_one_initcall+0xa7/0x380 [ 13.383919] ? __pfx_do_one_initcall+0x10/0x10 [ 13.383927] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383933] ? kasan_unpoison+0x44/0x70 [ 13.383943] do_init_module+0x238/0x750 [ 13.383955] load_module+0x5011/0x6af0 [ 13.383962] ? kasan_save_stack+0x30/0x50 [ 13.383968] ? kasan_save_track+0x14/0x30 [ 13.383973] ? kasan_save_free_info+0x3b/0x60 [ 13.383980] ? poison_slab_object+0x109/0x180 [ 13.383993] ? __pfx_load_module+0x10/0x10 [ 13.384007] ? poison_slab_object+0x109/0x180 [ 13.384012] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384018] ? init_module_from_file+0x13d/0x150 [ 13.384025] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384032] ? init_module_from_file+0xdf/0x150 [ 13.384037] init_module_from_file+0xdf/0x150 [ 13.384044] ? __pfx_init_module_from_file+0x10/0x10 [ 13.384050] ? kasan_save_track+0x14/0x30 [ 13.384055] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384060] ? kasan_save_free_info+0x3b/0x60 [ 13.384066] ? poison_slab_object+0x109/0x180 [ 13.384071] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384080] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384085] ? _raw_spin_lock+0x85/0xe0 [ 13.384091] ? __pfx__raw_spin_lock+0x10/0x10 [ 13.384096] ? __rseq_handle_notify_resume+0x1a6/0xad0 [ 13.384106] idempotent_init_module+0x23b/0x650 [ 13.384114] ? __pfx_idempotent_init_module+0x10/0x10 [ 13.384120] ? __pfx___seccomp_filter+0x10/0x10 [ 13.384129] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384135] ? __fget_light+0x57/0x420 [ 13.384142] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384147] ? security_capable+0x74/0xb0 [ 13.384157] __x64_sys_finit_module+0xbe/0x130 [ 13.384164] do_syscall_64+0x82/0x190 [ 13.384174] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384179] ? irqtime_account_irq+0x3d/0x1f0 [ 13.384188] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384193] ? __irq_exit_rcu+0x4e/0x130 [ 13.384201] ? 
srso_alias_return_thunk+0x5/0xfbef5 [ 13.384206] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 13.384212] RIP: 0033:0x7a21f96ade9d [ 13.384263] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 63 de 0c 00 f7 d8 64 89 01 48 [ 13.384267] RSP: 002b:00007ffeae934e78 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 13.384273] RAX: ffffffffffffffda RBX: 00005987276bfcf0 RCX: 00007a21f96ade9d [ 13.384277] RDX: 0000000000000004 RSI: 00007a21f8eda376 RDI: 000000000000001c [ 13.384280] RBP: 00007a21f8eda376 R08: 0000000000000001 R09: 00007ffeae934ec0 [ 13.384284] R10: 0000000000000050 R11: 0000000000000246 R12: 0000000000020000 [ 13.384288] R13: 00005987276c1250 R14: 0000000000000000 R15: 00005987276c4530 [ 13.384297] [ 13.384299] Modules linked in: soundwire_amd(+) hid_sensor_gyro_3d(+) hid_sensor_magn_3d hid_sensor_accel_3d soundwire_generic_allocation amdxcp hid_sensor_trigger drm_exec industrialio_triggered_buffer soundwire_bus gpu_sched kvm_amd kfifo_buf qmi_helpers joydev drm_buddy hid_sensor_iio_common mousedev snd_soc_core industrialio i2c_algo_bit mac80211 snd_compress drm_suballoc_helper kvm snd_hda_intel drm_ttm_helper ac97_bus snd_pcm_dmaengine snd_intel_dspcfg ttm thinkpad_acpi(+) snd_intel_sdw_acpi hid_sensor_hub snd_rpl_pci_acp6x drm_display_helper snd_hda_codec hid_multitouch libarc4 snd_acp_pci platform_profile think_lmi(+) hid_generic firmware_attributes_class wmi_bmof cec snd_acp_legacy_common sparse_keymap rapl snd_hda_core psmouse cfg80211 pcspkr snd_pci_acp6x snd_hwdep video snd_pcm snd_pci_acp5x snd_timer snd_rn_pci_acp3x ucsi_acpi snd_acp_config snd sp5100_tco rfkill snd_soc_acpi typec_ucsi thunderbolt amd_sfh k10temp mhi soundcore i2c_piix4 snd_pci_acp3x typec i2c_hid_acpi roles i2c_hid wmi acpi_tad amd_pmc [ 13.384454] mac_hid i2c_dev crypto_user loop nfnetlink zram ip_tables x_tables dm_crypt cbc encrypted_keys trusted asn1_encoder tee dm_mod crct10dif_pclmul crc32_pclmul polyval_clmulni polyval_generic gf128mul ghash_clmulni_intel serio_raw sha512_ssse3 atkbd sha256_ssse3 libps2 sha1_ssse3 vivaldi_fmap nvme aesni_intel crypto_simd nvme_core cryptd ccp xhci_pci i8042 nvme_auth xhci_pci_renesas serio vfat fat btrfs blake2b_generic libcrc32c crc32c_generic crc32c_intel xor raid6_pq [ 13.384552] ---[ end trace 0000000000000000 ]--- KASAN reports a use-after-free of hid->driver_data in function amd_sfh_get_report(). The backtrace indicates that the function is called by amdtp_hid_request(), which is one of the callbacks of the hid device. The current patch makes sure that driver_data is freed only once hid_destroy_device() has returned. Note that I observed the crash both on v6.9.9 and v6.10.0. The code seems to be as it was from the early days of the driver.
Signed-off-by: Olivier Sobrie Acked-by: Basavaraj Natikar Signed-off-by: Jiri Kosina --- drivers/hid/amd-sfh-hid/amd_sfh_hid.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_hid.c b/drivers/hid/amd-sfh-hid/amd_sfh_hid.c index 705b52337068..81f3024b7b1b 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_hid.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_hid.c @@ -171,11 +171,13 @@ int amdtp_hid_probe(u32 cur_hid_dev, struct amdtp_cl_data *cli_data) void amdtp_hid_remove(struct amdtp_cl_data *cli_data) { int i; + struct amdtp_hid_data *hid_data; for (i = 0; i < cli_data->num_hid_devices; ++i) { if (cli_data->hid_sensor_hubs[i]) { - kfree(cli_data->hid_sensor_hubs[i]->driver_data); + hid_data = cli_data->hid_sensor_hubs[i]->driver_data; hid_destroy_device(cli_data->hid_sensor_hubs[i]); + kfree(hid_data); cli_data->hid_sensor_hubs[i] = NULL; } } From c8000deb68365b461b324d68c7ea89d730f0bb85 Mon Sep 17 00:00:00 2001 From: Dmitry Savin Date: Tue, 16 Jul 2024 23:27:57 +0100 Subject: [PATCH 0457/1052] HID: multitouch: Add support for GT7868Q GT7868Q has incorrect data in the report and needs a fixup. The change enables haptic touchpad on Lenovo ThinkBook 13x Gen 4 and has been tested on the device. Signed-off-by: Dmitry Savin Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 2 ++ drivers/hid/hid-multitouch.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 6e3223389080..781c5aa29859 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -521,6 +521,8 @@ #define USB_DEVICE_ID_GENERAL_TOUCH_WIN8_PIT_E100 0xe100 #define I2C_VENDOR_ID_GOODIX 0x27c6 +#define I2C_DEVICE_ID_GOODIX_01E8 0x01e8 +#define I2C_DEVICE_ID_GOODIX_01E9 0x01e9 #define I2C_DEVICE_ID_GOODIX_01F0 0x01f0 #define USB_VENDOR_ID_GOODTOUCH 0x1aad diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 56fc78841f24..99812c0f830b 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1441,6 +1441,30 @@ static int mt_event(struct hid_device *hid, struct hid_field *field, return 0; } +static __u8 *mt_report_fixup(struct hid_device *hdev, __u8 *rdesc, + unsigned int *size) +{ + if (hdev->vendor == I2C_VENDOR_ID_GOODIX && + (hdev->product == I2C_DEVICE_ID_GOODIX_01E8 || + hdev->product == I2C_DEVICE_ID_GOODIX_01E9)) { + if (rdesc[607] == 0x15) { + rdesc[607] = 0x25; + dev_info( + &hdev->dev, + "GT7868Q report descriptor fixup is applied.\n"); + } else { + dev_info( + &hdev->dev, + "The byte is not expected for fixing the report descriptor. \ +It's possible that the touchpad firmware is not suitable for applying the fix. 
\ +got: %x\n", + rdesc[607]); + } + } + + return rdesc; +} + static void mt_report(struct hid_device *hid, struct hid_report *report) { struct mt_device *td = hid_get_drvdata(hid); @@ -2035,6 +2059,14 @@ static const struct hid_device_id mt_devices[] = { MT_BT_DEVICE(USB_VENDOR_ID_FRUCTEL, USB_DEVICE_ID_GAMETEL_MT_MODE) }, + /* Goodix GT7868Q devices */ + { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT_NSMU, + HID_DEVICE(BUS_I2C, HID_GROUP_ANY, I2C_VENDOR_ID_GOODIX, + I2C_DEVICE_ID_GOODIX_01E8) }, + { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT_NSMU, + HID_DEVICE(BUS_I2C, HID_GROUP_ANY, I2C_VENDOR_ID_GOODIX, + I2C_DEVICE_ID_GOODIX_01E9) }, + /* GoodTouch panels */ { .driver_data = MT_CLS_NSMU, MT_USB_DEVICE(USB_VENDOR_ID_GOODTOUCH, @@ -2270,6 +2302,7 @@ static struct hid_driver mt_driver = { .feature_mapping = mt_feature_mapping, .usage_table = mt_grabbed_usages, .event = mt_event, + .report_fixup = mt_report_fixup, .report = mt_report, .suspend = pm_ptr(mt_suspend), .reset_resume = pm_ptr(mt_reset_resume), From 1b8f9c1fb464968a5b18d3acc1da8c00bad24fad Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Tue, 30 Jul 2024 08:51:55 -0700 Subject: [PATCH 0458/1052] HID: wacom: Defer calculation of resolution until resolution_code is known The Wacom driver maps the HID_DG_TWIST usage to ABS_Z (rather than ABS_RZ) for historic reasons. When the code to support twist was introduced in commit 50066a042da5 ("HID: wacom: generic: Add support for height, tilt, and twist usages"), we were careful to write it in such a way that it had HID calculate the resolution of the twist axis assuming ABS_RZ instead (so that we would get correct angular behavior). This was broken with the introduction of commit 08a46b4190d3 ("HID: wacom: Set a default resolution for older tablets"), which moved the resolution calculation to occur *before* the adjustment from ABS_Z to ABS_RZ occurred. This commit moves the calculation of resolution after the point that we are finished setting things up for its proper use. Signed-off-by: Jason Gerecke Fixes: 08a46b4190d3 ("HID: wacom: Set a default resolution for older tablets") Cc: stable@vger.kernel.org Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 1f4564982b95..2541fa2e0fa3 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1878,12 +1878,14 @@ static void wacom_map_usage(struct input_dev *input, struct hid_usage *usage, int fmax = field->logical_maximum; unsigned int equivalent_usage = wacom_equivalent_usage(usage->hid); int resolution_code = code; - int resolution = hidinput_calc_abs_res(field, resolution_code); + int resolution; if (equivalent_usage == HID_DG_TWIST) { resolution_code = ABS_RZ; } + resolution = hidinput_calc_abs_res(field, resolution_code); + if (equivalent_usage == HID_GD_X) { fmin += features->offset_left; fmax -= features->offset_right; From a09074228977c24c677c10282f506fa11f88eb93 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 1 Aug 2024 18:39:28 +0200 Subject: [PATCH 0459/1052] thermal: core: Update thermal zone registration documentation The thermal sysfs API document is outdated. One of the problems with it is that it still documents thermal_zone_device_register(), which does not exist any more, and it does not reflect the current thermal zone operations definition. 
Replace the thermal_zone_device_register() description in it with a thermal_zone_device_register_with_trips() description, including an update of the thermal zone operations list. Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/2767845.mvXUDI8C0e@rjwysocki.net --- .../driver-api/thermal/sysfs-api.rst | 65 +++++++++---------- 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/Documentation/driver-api/thermal/sysfs-api.rst b/Documentation/driver-api/thermal/sysfs-api.rst index 6c1175c6afba..978198f8a18b 100644 --- a/Documentation/driver-api/thermal/sysfs-api.rst +++ b/Documentation/driver-api/thermal/sysfs-api.rst @@ -4,8 +4,6 @@ Generic Thermal Sysfs driver How To Written by Sujith Thomas , Zhang Rui -Updated: 2 January 2008 - Copyright (c) 2008 Intel Corporation @@ -38,23 +36,23 @@ temperature) and throttle appropriate devices. :: - struct thermal_zone_device - *thermal_zone_device_register(char *type, - int trips, int mask, void *devdata, - struct thermal_zone_device_ops *ops, - const struct thermal_zone_params *tzp, - int passive_delay, int polling_delay)) + struct thermal_zone_device * + thermal_zone_device_register_with_trips(const char *type, + const struct thermal_trip *trips, + int num_trips, void *devdata, + const struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + unsigned int passive_delay, + unsigned int polling_delay) - This interface function adds a new thermal zone device (sensor) to + This interface function adds a new thermal zone device (sensor) to the /sys/class/thermal folder as `thermal_zone[0-*]`. It tries to bind all the - thermal cooling devices registered at the same time. + thermal cooling devices registered to it at the same time. type: the thermal zone type. trips: - the total number of trip points this thermal zone supports. - mask: - Bit string: If 'n'th bit is set, then trip point 'n' is writable. + the table of trip points for this thermal zone. devdata: device private data ops: @@ -67,32 +65,29 @@ temperature) and throttle appropriate devices. .get_temp: get the current temperature of the thermal zone. .set_trips: - set the trip points window. Whenever the current temperature - is updated, the trip points immediately below and above the - current temperature are found. - .get_mode: - get the current mode (enabled/disabled) of the thermal zone. - - - "enabled" means the kernel thermal management is - enabled. - - "disabled" will prevent kernel thermal driver action - upon trip points so that user applications can take - charge of thermal management. - .set_mode: - set the mode (enabled/disabled) of the thermal zone. - .get_trip_type: - get the type of certain trip point. - .get_trip_temp: - get the temperature above which the certain trip point - will be fired. + set the trip points window. Whenever the current temperature + is updated, the trip points immediately below and above the + current temperature are found. + .change_mode: + change the mode (enabled/disabled) of the thermal zone. + .set_trip_temp: + set the temperature of a given trip point. + .get_crit_temp: + get the critical temperature for this thermal zone. .set_emul_temp: - set the emulation temperature which helps in debugging - different threshold temperature points. + set the emulation temperature which helps in debugging + different threshold temperature points. + .get_trend: + get the trend of most recent zone temperature changes. + .hot: + hot trip point crossing handler. 
+ .critical: + critical trip point crossing handler. tzp: thermal zone platform parameters. passive_delay: - number of milliseconds to wait between polls when - performing passive cooling. + number of milliseconds to wait between polls when performing passive + cooling. polling_delay: number of milliseconds to wait between polls when checking whether trip points have been crossed (0 for interrupt driven systems). From 64a66f4a3c89b4602ee1e6cd23b28729fc4562b3 Mon Sep 17 00:00:00 2001 From: Pedro Henrique Kopper Date: Thu, 1 Aug 2024 13:41:50 -0300 Subject: [PATCH 0460/1052] cpufreq: intel_pstate: Update Balance performance EPP for Emerald Rapids On Intel Emerald Rapids machines, we ship the Energy Performance Preference (EPP) default for balance_performance as 128. However, during an internal investigation together with Intel, we have determined that 32 is a more suitable value. This leads to significant improvements in both performance and energy: POV-Ray: 32% faster | 12% less energy OpenSSL: 12% faster | energy within 1% Build Linux Kernel: 29% faster | 18% less energy Therefore, we should move the default EPP for balance_performance to 32. This is in line with what has already been done for Sapphire Rapids. Signed-off-by: Pedro Henrique Kopper Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/Zqu6zjVMoiXwROBI@capivara Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 392a8000b238..c0278d023cfc 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3405,6 +3405,7 @@ static const struct x86_cpu_id intel_epp_default[] = { */ X86_MATCH_VFM(INTEL_ALDERLAKE_L, HWP_SET_DEF_BALANCE_PERF_EPP(102)), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)), X86_MATCH_VFM(INTEL_METEORLAKE_L, HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE, 179, 64, 16)), X86_MATCH_VFM(INTEL_ARROWLAKE, HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE, From becfa08bfefa2cbb22c84d9e583e81387f2f3bf2 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 2 Aug 2024 11:57:31 +0100 Subject: [PATCH 0461/1052] ASoC: cs42l43: Remove redundant semi-colon at end of function Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20240802105734.2309788-2-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/cs42l43.c b/sound/soc/codecs/cs42l43.c index 92674314227c..80825777048a 100644 --- a/sound/soc/codecs/cs42l43.c +++ b/sound/soc/codecs/cs42l43.c @@ -608,7 +608,7 @@ static int cs42l43_sdw_hw_params(struct snd_pcm_substream *substream, return ret; return cs42l43_set_sample_rate(substream, params, dai); -}; +} static const struct snd_soc_dai_ops cs42l43_sdw_ops = { .startup = cs42l43_startup, From c8a132e2e032b00828d51141ab34f9aeb24f44ae Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 2 Aug 2024 11:57:32 +0100 Subject: [PATCH 0462/1052] ASoC: soc-component: Add new snd_soc_component_get_kcontrol() helpers Add new helper functions snd_soc_component_get_kcontrol() and snd_soc_component_get_kcontrol_locked() that returns a kcontrol by name, but will factor in the components name_prefix, to handle situations where multiple components are present with the same controls. 
Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20240802105734.2309788-3-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 5 +++++ sound/soc/soc-component.c | 42 ++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index ceca69b46a82..bf2e381cd124 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -462,6 +462,11 @@ int snd_soc_component_force_enable_pin_unlocked( const char *pin); /* component controls */ +struct snd_kcontrol *snd_soc_component_get_kcontrol(struct snd_soc_component *component, + const char * const ctl); +struct snd_kcontrol * +snd_soc_component_get_kcontrol_locked(struct snd_soc_component *component, + const char * const ctl); int snd_soc_component_notify_control(struct snd_soc_component *component, const char * const ctl); diff --git a/sound/soc/soc-component.c b/sound/soc/soc-component.c index 4d7c2e3c929a..42f481321919 100644 --- a/sound/soc/soc-component.c +++ b/sound/soc/soc-component.c @@ -236,19 +236,45 @@ int snd_soc_component_force_enable_pin_unlocked( } EXPORT_SYMBOL_GPL(snd_soc_component_force_enable_pin_unlocked); +static void soc_get_kcontrol_name(struct snd_soc_component *component, + char *buf, int size, const char * const ctl) +{ + /* When updating, change also snd_soc_dapm_widget_name_cmp() */ + if (component->name_prefix) + snprintf(buf, size, "%s %s", component->name_prefix, ctl); + else + snprintf(buf, size, "%s", ctl); +} + +struct snd_kcontrol *snd_soc_component_get_kcontrol(struct snd_soc_component *component, + const char * const ctl) +{ + char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; + + soc_get_kcontrol_name(component, name, ARRAY_SIZE(name), ctl); + + return snd_soc_card_get_kcontrol(component->card, name); +} +EXPORT_SYMBOL_GPL(snd_soc_component_get_kcontrol); + +struct snd_kcontrol * +snd_soc_component_get_kcontrol_locked(struct snd_soc_component *component, + const char * const ctl) +{ + char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; + + soc_get_kcontrol_name(component, name, ARRAY_SIZE(name), ctl); + + return snd_soc_card_get_kcontrol_locked(component->card, name); +} +EXPORT_SYMBOL_GPL(snd_soc_component_get_kcontrol_locked); + int snd_soc_component_notify_control(struct snd_soc_component *component, const char * const ctl) { - char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; struct snd_kcontrol *kctl; - /* When updating, change also snd_soc_dapm_widget_name_cmp() */ - if (component->name_prefix) - snprintf(name, ARRAY_SIZE(name), "%s %s", component->name_prefix, ctl); - else - snprintf(name, ARRAY_SIZE(name), "%s", ctl); - - kctl = snd_soc_card_get_kcontrol(component->card, name); + kctl = snd_soc_component_get_kcontrol(component, ctl); if (!kctl) return soc_component_ret(component, -EINVAL); From 4791c422981350d0de4ad02a14a08b99c766d06f Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 2 Aug 2024 11:57:33 +0100 Subject: [PATCH 0463/1052] ASoC: cs35l45: Use new snd_soc_component_get_kcontrol_locked() helper No longer any need to hard code the addition of the name prefix, use the new helper function. 
Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20240802105734.2309788-4-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l45.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/sound/soc/codecs/cs35l45.c b/sound/soc/codecs/cs35l45.c index 2392c6effed8..1e9d73bee3b4 100644 --- a/sound/soc/codecs/cs35l45.c +++ b/sound/soc/codecs/cs35l45.c @@ -176,17 +176,10 @@ static int cs35l45_activate_ctl(struct snd_soc_component *component, struct snd_kcontrol *kcontrol; struct snd_kcontrol_volatile *vd; unsigned int index_offset; - char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; - if (component->name_prefix) - snprintf(name, SNDRV_CTL_ELEM_ID_NAME_MAXLEN, "%s %s", - component->name_prefix, ctl_name); - else - snprintf(name, SNDRV_CTL_ELEM_ID_NAME_MAXLEN, "%s", ctl_name); - - kcontrol = snd_soc_card_get_kcontrol_locked(component->card, name); + kcontrol = snd_soc_component_get_kcontrol_locked(component, ctl_name); if (!kcontrol) { - dev_err(component->dev, "Can't find kcontrol %s\n", name); + dev_err(component->dev, "Can't find kcontrol %s\n", ctl_name); return -EINVAL; } From 93afd028fb5f06a46a32375fd1f0473451eb1c5a Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 2 Aug 2024 11:57:34 +0100 Subject: [PATCH 0464/1052] ASoC: cs42l43: Cache shutter IRQ control pointers The microphone/speaker privacy shutter ALSA control handlers need to call pm_runtime_resume, since the hardware needs to be powered up to check the hardware state of the shutter. The IRQ handler for the shutters also needs to notify the ALSA control to inform user-space the shutters updated. However this leads to a mutex inversion, between the sdw_dev_lock and the controls_rwsem. To avoid this mutex inversion cache the kctl pointers before the IRQ handler, which avoids the need to lookup the control and take the controls_rwsem. 
Suggested-by: Jaroslav Kysela Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20240802105734.2309788-5-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43.c | 73 +++++++++++++++++++++++++++++--------- sound/soc/codecs/cs42l43.h | 2 ++ 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/sound/soc/codecs/cs42l43.c b/sound/soc/codecs/cs42l43.c index 80825777048a..5183b4586424 100644 --- a/sound/soc/codecs/cs42l43.c +++ b/sound/soc/codecs/cs42l43.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -252,24 +253,20 @@ CS42L43_IRQ_COMPLETE(load_detect) static irqreturn_t cs42l43_mic_shutter(int irq, void *data) { struct cs42l43_codec *priv = data; - static const char * const controls[] = { - "Decimator 1 Switch", - "Decimator 2 Switch", - "Decimator 3 Switch", - "Decimator 4 Switch", - }; - int i, ret; + struct snd_soc_component *component = priv->component; + int i; dev_dbg(priv->dev, "Microphone shutter changed\n"); - if (!priv->component) + if (!component) return IRQ_NONE; - for (i = 0; i < ARRAY_SIZE(controls); i++) { - ret = snd_soc_component_notify_control(priv->component, - controls[i]); - if (ret) + for (i = 1; i < ARRAY_SIZE(priv->kctl); i++) { + if (!priv->kctl[i]) return IRQ_NONE; + + snd_ctl_notify(component->card->snd_card, + SNDRV_CTL_EVENT_MASK_VALUE, &priv->kctl[i]->id); } return IRQ_HANDLED; @@ -278,18 +275,19 @@ static irqreturn_t cs42l43_mic_shutter(int irq, void *data) static irqreturn_t cs42l43_spk_shutter(int irq, void *data) { struct cs42l43_codec *priv = data; - int ret; + struct snd_soc_component *component = priv->component; dev_dbg(priv->dev, "Speaker shutter changed\n"); - if (!priv->component) + if (!component) return IRQ_NONE; - ret = snd_soc_component_notify_control(priv->component, - "Speaker Digital Switch"); - if (ret) + if (!priv->kctl[0]) return IRQ_NONE; + snd_ctl_notify(component->card->snd_card, + SNDRV_CTL_EVENT_MASK_VALUE, &priv->kctl[0]->id); + return IRQ_HANDLED; } @@ -590,7 +588,46 @@ static int cs42l43_asp_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mas return 0; } +static int cs42l43_dai_probe(struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct cs42l43_codec *priv = snd_soc_component_get_drvdata(component); + static const char * const controls[] = { + "Speaker Digital Switch", + "Decimator 1 Switch", + "Decimator 2 Switch", + "Decimator 3 Switch", + "Decimator 4 Switch", + }; + int i; + + static_assert(ARRAY_SIZE(controls) == ARRAY_SIZE(priv->kctl)); + + for (i = 0; i < ARRAY_SIZE(controls); i++) { + if (priv->kctl[i]) + continue; + + priv->kctl[i] = snd_soc_component_get_kcontrol(component, controls[i]); + } + + return 0; +} + +static int cs42l43_dai_remove(struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct cs42l43_codec *priv = snd_soc_component_get_drvdata(component); + int i; + + for (i = 0; i < ARRAY_SIZE(priv->kctl); i++) + priv->kctl[i] = NULL; + + return 0; +} + static const struct snd_soc_dai_ops cs42l43_asp_ops = { + .probe = cs42l43_dai_probe, + .remove = cs42l43_dai_remove, .startup = cs42l43_startup, .hw_params = cs42l43_asp_hw_params, .set_fmt = cs42l43_asp_set_fmt, @@ -611,6 +648,8 @@ static int cs42l43_sdw_hw_params(struct snd_pcm_substream *substream, } static const struct snd_soc_dai_ops cs42l43_sdw_ops = { + .probe = cs42l43_dai_probe, + .remove = cs42l43_dai_remove, .startup = cs42l43_startup, .set_stream = cs42l43_sdw_set_stream, .hw_params = 
cs42l43_sdw_hw_params, diff --git a/sound/soc/codecs/cs42l43.h b/sound/soc/codecs/cs42l43.h index 9924c13e1eb5..9c144e129535 100644 --- a/sound/soc/codecs/cs42l43.h +++ b/sound/soc/codecs/cs42l43.h @@ -100,6 +100,8 @@ struct cs42l43_codec { struct delayed_work hp_ilimit_clear_work; bool hp_ilimited; int hp_ilimit_count; + + struct snd_kcontrol *kctl[5]; }; #if IS_REACHABLE(CONFIG_SND_SOC_CS42L43_SDW) From 54233a4254036efca91b9bffbd398ecf39e90555 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 30 Jul 2024 17:30:40 +0200 Subject: [PATCH 0465/1052] uretprobe: change syscall number, again Despite multiple attempts to get the syscall number assignment right for the newly added uretprobe syscall, we ended up with a bit of a mess: - The number is defined as 467 based on the assumption that the xattrat family of syscalls would use 463 through 466, but those did not make it into 6.11. - The include/uapi/asm-generic/unistd.h file still lists the number 463, but the new scripts/syscall.tbl that was supposed to have the same data lists 467 instead as the number for arc, arm64, csky, hexagon, loongarch, nios2, openrisc and riscv. None of these architectures actually provide a uretprobe syscall. - All the other architectures (powerpc, arm, mips, ...) don't list this syscall at all. There are two ways to make it consistent again: either list it with the same syscall number on all architectures, or only list it on x86 but not in scripts/syscall.tbl and asm-generic/unistd.h. Based on the most recent discussion, it seems like we won't need it anywhere else, so just remove the inconsistent assignment and instead move the x86 number to the next available one in the architecture specific range, which is 335. Fixes: 5c28424e9a34 ("syscalls: Fix to add sys_uretprobe to syscall.tbl") Fixes: 190fec72df4a ("uprobe: Wire up uretprobe system call") Fixes: 63ded110979b ("uprobe: Change uretprobe syscall scope and number") Acked-by: Masami Hiramatsu (Google) Reviewed-by: Jiri Olsa Signed-off-by: Arnd Bergmann --- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- include/uapi/asm-generic/unistd.h | 5 +---- scripts/syscall.tbl | 1 - tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 2 +- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 83073fa3c989..7093ee21c0d1 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -344,6 +344,7 @@ 332 common statx sys_statx 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq +335 common uretprobe sys_uretprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal @@ -385,7 +386,6 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal -467 common uretprobe sys_uretprobe # # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 985a262d0f9e..5bf6148cac2b 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -841,11 +841,8 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) #define __NR_mseal 462 __SYSCALL(__NR_mseal, sys_mseal) -#define __NR_uretprobe 463 -__SYSCALL(__NR_uretprobe, sys_uretprobe) - #undef __NR_syscalls -#define __NR_syscalls 464 +#define __NR_syscalls 463 /* * 32 bit systems traditionally used 
different diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl index 591d85e8ca7e..797e20ea99a2 100644 --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl @@ -402,4 +402,3 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal -467 common uretprobe sys_uretprobe diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index bd8c75b620c2..5f78edca6540 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -216,7 +216,7 @@ static void test_uretprobe_regs_change(void) } #ifndef __NR_uretprobe -#define __NR_uretprobe 467 +#define __NR_uretprobe 335 #endif __naked unsigned long uretprobe_syscall_call_1(void) From 343416f0c11c42bed07f6db03ca599f4f1771b17 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 1 Aug 2024 14:27:23 +0200 Subject: [PATCH 0466/1052] syscalls: fix syscall macros for newfstat/newfstatat The __NR_newfstat and __NR_newfstatat macros accidentally got renamed in the conversion to the syscall.tbl format, dropping the 'new' portion of the name. In an unrelated change, the two syscalls are no longer architecture specific but are once more defined on all 64-bit architectures, so the 'newstat' ABI keyword can be dropped from the table as a simplification. Fixes: 4fe53bf2ba0a ("syscalls: add generic scripts/syscall.tbl") Closes: https://lore.kernel.org/lkml/838053e0-b186-4e9f-9668-9a3384a71f23@app.fastmail.com/T/#t Reported-by: Florian Weimer Signed-off-by: Arnd Bergmann --- arch/arm64/kernel/Makefile.syscalls | 2 +- arch/loongarch/kernel/Makefile.syscalls | 3 ++- arch/riscv/kernel/Makefile.syscalls | 2 +- scripts/syscall.tbl | 4 ++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/Makefile.syscalls b/arch/arm64/kernel/Makefile.syscalls index 3cfafd003b2d..0542a718871a 100644 --- a/arch/arm64/kernel/Makefile.syscalls +++ b/arch/arm64/kernel/Makefile.syscalls @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 syscall_abis_32 += -syscall_abis_64 += renameat newstat rlimit memfd_secret +syscall_abis_64 += renameat rlimit memfd_secret syscalltbl = arch/arm64/tools/syscall_%.tbl diff --git a/arch/loongarch/kernel/Makefile.syscalls b/arch/loongarch/kernel/Makefile.syscalls index 523bb411a3bc..ab7d9baa2915 100644 --- a/arch/loongarch/kernel/Makefile.syscalls +++ b/arch/loongarch/kernel/Makefile.syscalls @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -syscall_abis_64 += newstat +# No special ABIs on loongarch so far +syscall_abis_64 += diff --git a/arch/riscv/kernel/Makefile.syscalls b/arch/riscv/kernel/Makefile.syscalls index 52087a023b3d..9668fd1faf60 100644 --- a/arch/riscv/kernel/Makefile.syscalls +++ b/arch/riscv/kernel/Makefile.syscalls @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 syscall_abis_32 += riscv memfd_secret -syscall_abis_64 += riscv newstat rlimit memfd_secret +syscall_abis_64 += riscv rlimit memfd_secret diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl index 797e20ea99a2..4586a18dfe9b 100644 --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl @@ -98,9 +98,9 @@ 77 common tee sys_tee 78 common readlinkat sys_readlinkat 79 stat64 fstatat64 sys_fstatat64 -79 newstat fstatat sys_newfstatat +79 64 newfstatat sys_newfstatat 80 stat64 fstat64 sys_fstat64 -80 newstat fstat sys_newfstat +80 64 newfstat sys_newfstat 81 common sync sys_sync 82 common fsync sys_fsync 83 common fdatasync sys_fdatasync From
fca5b78511e98bdff2cdd55c172b23200a7b3404 Mon Sep 17 00:00:00 2001 From: Barak Biber Date: Thu, 1 Aug 2024 09:26:04 -0300 Subject: [PATCH 0467/1052] iommu: Restore lost return in iommu_report_device_fault() When iommu_report_device_fault() gets called with a partial fault, it is supposed to collect the fault into the group and then return. Instead, the return was accidentally deleted, which results in trying to process the fault and an eventual crash. Deleting the return was a typo; put it back. Fixes: 3dfa64aecbaf ("iommu: Make iommu_report_device_fault() return void") Signed-off-by: Barak Biber Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/0-v1-e7153d9c8cee+1c6-iommu_fault_fix_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgfault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index cd679c13752e..81e9cc6e3164 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -170,6 +170,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); /* A request that is not the last does not need to be ack'd */ + return; } /* From cfb00a35786414e7c0e6226b277d9f09657eae74 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 31 Jul 2024 14:36:01 +0100 Subject: [PATCH 0468/1052] arm64: jump_label: Ensure patched jump_labels are visible to all CPUs Although the Arm architecture permits concurrent modification and execution of NOP and branch instructions, it still requires some synchronisation to ensure that other CPUs consistently execute the newly written instruction: > When the modified instructions are observable, each PE that is > executing the modified instructions must execute an ISB or perform a > context synchronizing event to ensure execution of the modified > instructions Prior to commit f6cc0c501649 ("arm64: Avoid calling stop_machine() when patching jump labels"), the arm64 jump_label patching machinery performed synchronisation using stop_machine() after each modification, however this was problematic when flipping static keys from atomic contexts (namely, the arm_arch_timer CPU hotplug startup notifier) and so we switched to the _nosync() patching routines to avoid "scheduling while atomic" BUG()s during boot. In hindsight, the analysis of the issue in f6cc0c501649 isn't quite right: it cites the use of IPIs in the default patching routines as the cause of the lockup, whereas stop_machine() does not rely on IPIs and the I-cache invalidation is performed using __flush_icache_range(), which elides the call to kick_all_cpus_sync(). In fact, the blocking wait for other CPUs is what triggers the BUG() and the problem remains even after f6cc0c501649, for example because we could block on the jump_label_mutex. Eventually, the arm_arch_timer driver was fixed to avoid the static key entirely in commit a862fc2254bd ("clocksource/arm_arch_timer: Remove use of workaround static key"). This all leaves the jump_label patching code in a funny situation on arm64 as we do not synchronise with other CPUs to reduce the likelihood of a bug which no longer exists. Consequently, toggling a static key on one CPU cannot be assumed to take effect on other CPUs, leading to potential issues, for example with missing preempt notifiers.
Rather than revert f6cc0c501649 and go back to stop_machine() for each patch site, implement arch_jump_label_transform_apply() and kick all the other CPUs with an IPI at the end of patching. Cc: Alexander Potapenko Cc: Mark Rutland Cc: Marc Zyngier Fixes: f6cc0c501649 ("arm64: Avoid calling stop_machine() when patching jump labels") Signed-off-by: Will Deacon Reviewed-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240731133601.3073-1-will@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/jump_label.h | 1 + arch/arm64/kernel/jump_label.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index 4e753908b801..a0a5bbae7229 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ -13,6 +13,7 @@ #include #include +#define HAVE_JUMP_LABEL_BATCH #define JUMP_LABEL_NOP_SIZE AARCH64_INSN_SIZE #define JUMP_TABLE_ENTRY(key, label) \ diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index faf88ec9c48e..f63ea915d6ad 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -7,11 +7,12 @@ */ #include #include +#include #include #include -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) { void *addr = (void *)jump_entry_code(entry); u32 insn; @@ -25,4 +26,10 @@ void arch_jump_label_transform(struct jump_entry *entry, } aarch64_insn_patch_text_nosync(addr, insn); + return true; +} + +void arch_jump_label_transform_apply(void) +{ + kick_all_cpus_sync(); } From 4b96024ef2296b1d323af327cae5e52809b61420 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 31 Jul 2024 10:23:39 -0300 Subject: [PATCH 0469/1052] smb: client: handle lack of FSCTL_GET_REPARSE_POINT support As per MS-FSA 2.1.5.10.14, support for FSCTL_GET_REPARSE_POINT is optional and if the server doesn't support it, STATUS_INVALID_DEVICE_REQUEST must be returned for the operation. If we find files with reparse points and we can't read them due to lack of client or server support, just ignore it and then treat them as regular files or junctions. 
Fixes: 5f71ebc41294 ("smb: client: parse reparse point flag in create response") Reported-by: Sebastian Steinbeisser Tested-by: Sebastian Steinbeisser Acked-by: Tom Talpey Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/inode.c | 17 +++++++++++++++-- fs/smb/client/reparse.c | 4 ++++ fs/smb/client/reparse.h | 19 +++++++++++++++++-- fs/smb/client/smb2inode.c | 2 ++ 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 4a8aa1de9522..dd0afa23734c 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1042,13 +1042,26 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, } rc = -EOPNOTSUPP; - switch ((data->reparse.tag = tag)) { - case 0: /* SMB1 symlink */ + data->reparse.tag = tag; + if (!data->reparse.tag) { if (server->ops->query_symlink) { rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, &data->symlink_target); } + if (rc == -EOPNOTSUPP) + data->reparse.tag = IO_REPARSE_TAG_INTERNAL; + } + + switch (data->reparse.tag) { + case 0: /* SMB1 symlink */ + break; + case IO_REPARSE_TAG_INTERNAL: + rc = 0; + if (le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY) { + cifs_create_junction_fattr(fattr, sb); + goto out; + } break; case IO_REPARSE_TAG_MOUNT_POINT: cifs_create_junction_fattr(fattr, sb); diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index a0ffbda90733..689d8a506d45 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -505,6 +505,10 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, } switch (tag) { + case IO_REPARSE_TAG_INTERNAL: + if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY)) + return false; + fallthrough; case IO_REPARSE_TAG_DFS: case IO_REPARSE_TAG_DFSR: case IO_REPARSE_TAG_MOUNT_POINT: diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 6b55d1df9e2f..2c0644bc4e65 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -12,6 +12,12 @@ #include "fs_context.h" #include "cifsglob.h" +/* + * Used only by cifs.ko to ignore reparse points from files when client or + * server doesn't support FSCTL_GET_REPARSE_POINT. + */ +#define IO_REPARSE_TAG_INTERNAL ((__u32)~0U) + static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf) { u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer); @@ -78,10 +84,19 @@ static inline u32 reparse_mode_wsl_tag(mode_t mode) static inline bool reparse_inode_match(struct inode *inode, struct cifs_fattr *fattr) { + struct cifsInodeInfo *cinode = CIFS_I(inode); struct timespec64 ctime = inode_get_ctime(inode); - return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) && - CIFS_I(inode)->reparse_tag == fattr->cf_cifstag && + /* + * Do not match reparse tags when client or server doesn't support + * FSCTL_GET_REPARSE_POINT. @fattr->cf_cifstag should contain correct + * reparse tag from query dir response but the client won't be able to + * read the reparse point data anyway. This spares us a revalidation. 
+ */ + if (cinode->reparse_tag != IO_REPARSE_TAG_INTERNAL && + cinode->reparse_tag != fattr->cf_cifstag) + return false; + return (cinode->cifsAttrs & ATTR_REPARSE) && timespec64_equal(&ctime, &fattr->cf_ctime); } diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 5c02a12251c8..062b86a4936f 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -930,6 +930,8 @@ int smb2_query_path_info(const unsigned int xid, switch (rc) { case 0: + rc = parse_create_response(data, cifs_sb, &out_iov[0]); + break; case -EOPNOTSUPP: /* * BB TODO: When support for special files added to Samba From cd936507986e38535e7c98940c186e9a97b87184 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 31 Jul 2024 11:30:00 +0100 Subject: [PATCH 0470/1052] cifs: Remove cifs_aio_ctx Remove struct cifs_aio_ctx and its associated alloc/release functions as it is no longer used, the functions being taken over by netfslib. Signed-off-by: David Howells cc: Steve French cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 24 ----------------- fs/smb/client/cifsproto.h | 2 -- fs/smb/client/misc.c | 54 --------------------------------------- 3 files changed, 80 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 8e86fec7dcd2..f6d1f075987f 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1471,29 +1471,6 @@ struct cifs_io_parms { struct TCP_Server_Info *server; }; -struct cifs_aio_ctx { - struct kref refcount; - struct list_head list; - struct mutex aio_mutex; - struct completion done; - struct iov_iter iter; - struct kiocb *iocb; - struct cifsFileInfo *cfile; - struct bio_vec *bv; - loff_t pos; - unsigned int nr_pinned_pages; - ssize_t rc; - unsigned int len; - unsigned int total_len; - unsigned int bv_need_unpin; /* If ->bv[] needs unpinning */ - bool should_dirty; - /* - * Indicates if this aio_ctx is for direct_io, - * If yes, iter is a copy of the user passed iov_iter - */ - bool direct_io; -}; - struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; @@ -2010,7 +1987,6 @@ require use of the stronger protocol */ * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search * ->oplock_break_cancelled - * cifs_aio_ctx->aio_mutex cifs_aio_ctx cifs_aio_ctx_alloc ****************************************************************************/ #ifdef DECLARE_GLOBALS_HERE diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c15bb5ee7eb7..497bf3c447bc 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -619,8 +619,6 @@ int __cifs_calc_signature(struct smb_rqst *rqst, struct shash_desc *shash); enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, enum securityEnum); -struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); -void cifs_aio_ctx_release(struct kref *refcount); int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 07c468ddb88a..b28ff62f1f15 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -995,60 +995,6 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, return rc; } -struct cifs_aio_ctx * -cifs_aio_ctx_alloc(void) -{ - struct cifs_aio_ctx *ctx; - - /* - * Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io - * to false so that we know when we have to unreference pages within - * cifs_aio_ctx_release() - */ - ctx 
= kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL); - if (!ctx) - return NULL; - - INIT_LIST_HEAD(&ctx->list); - mutex_init(&ctx->aio_mutex); - init_completion(&ctx->done); - kref_init(&ctx->refcount); - return ctx; -} - -void -cifs_aio_ctx_release(struct kref *refcount) -{ - struct cifs_aio_ctx *ctx = container_of(refcount, - struct cifs_aio_ctx, refcount); - - cifsFileInfo_put(ctx->cfile); - - /* - * ctx->bv is only set if setup_aio_ctx_iter() was call successfuly - * which means that iov_iter_extract_pages() was a success and thus - * that we may have references or pins on pages that we need to - * release. - */ - if (ctx->bv) { - if (ctx->should_dirty || ctx->bv_need_unpin) { - unsigned int i; - - for (i = 0; i < ctx->nr_pinned_pages; i++) { - struct page *page = ctx->bv[i].bv_page; - - if (ctx->should_dirty) - set_page_dirty(page); - if (ctx->bv_need_unpin) - unpin_user_page(page); - } - } - kvfree(ctx->bv); - } - - kfree(ctx); -} - /** * cifs_alloc_hash - allocate hash and hash context together * @name: The name of the crypto hash algo From 69ca1f57555f74142a4c241703c307cb31b54667 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 30 Jul 2024 00:26:21 -0500 Subject: [PATCH 0471/1052] smb3: add dynamic tracepoints for shutdown ioctl For debugging an umount failure in xfstests generic/043 generic/044 in some configurations, we needed more information on the shutdown ioctl which was suspected of being related to the cause, so tracepoints are added in this patch e.g. "trace-cmd record -e smb3_shutdown_enter -e smb3_shutdown_done -e smb3_shutdown_err" Sample output: godown-47084 [011] ..... 3313.756965: smb3_shutdown_enter: flags=0x1 tid=0x733b3e75 godown-47084 [011] ..... 3313.756968: smb3_shutdown_done: flags=0x1 tid=0x733b3e75 Tested-by: Anthony Nandaa (Microsoft) Signed-off-by: Steve French --- fs/smb/client/ioctl.c | 32 +++++++++++++++++++++------ fs/smb/client/trace.h | 51 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 855ac5a62edf..44dbaf9929a4 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -170,7 +170,10 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, static int cifs_shutdown(struct super_block *sb, unsigned long arg) { struct cifs_sb_info *sbi = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *tcon; __u32 flags; + int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -178,14 +181,21 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) if (get_user(flags, (__u32 __user *)arg)) return -EFAULT; - if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH) - return -EINVAL; + tlink = cifs_sb_tlink(sbi); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + trace_smb3_shutdown_enter(flags, tcon->tid); + if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH) { + rc = -EINVAL; + goto shutdown_out_err; + } if (cifs_forced_shutdown(sbi)) - return 0; + goto shutdown_good; cifs_dbg(VFS, "shut down requested (%d)", flags); -/* trace_cifs_shutdown(sb, flags);*/ /* * see: @@ -201,7 +211,8 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) */ case CIFS_GOING_FLAGS_DEFAULT: cifs_dbg(FYI, "shutdown with default flag not supported\n"); - return -EINVAL; + rc = -EINVAL; + goto shutdown_out_err; /* * FLAGS_LOGFLUSH is easy since it asks to write out metadata (not * data) but metadata writes are not cached on the client, so can treat @@ -210,11 +221,18 @@ static int cifs_shutdown(struct super_block 
*sb, unsigned long arg) case CIFS_GOING_FLAGS_LOGFLUSH: case CIFS_GOING_FLAGS_NOLOGFLUSH: sbi->mnt_cifs_flags |= CIFS_MOUNT_SHUTDOWN; - return 0; + goto shutdown_good; default: - return -EINVAL; + rc = -EINVAL; + goto shutdown_out_err; } + +shutdown_good: + trace_smb3_shutdown_done(flags, tcon->tid); return 0; +shutdown_out_err: + trace_smb3_shutdown_err(rc, flags, tcon->tid); + return rc; } static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug_info __user *in) diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 6b3bdfb97211..0f0c10c7ada7 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -1388,7 +1388,7 @@ DECLARE_EVENT_CLASS(smb3_ioctl_class, __entry->command = command; ), TP_printk("xid=%u fid=0x%llx ioctl cmd=0x%x", - __entry->xid, __entry->fid, __entry->command) + __entry->xid, __entry->fid, __entry->command) ) #define DEFINE_SMB3_IOCTL_EVENT(name) \ @@ -1400,9 +1400,58 @@ DEFINE_EVENT(smb3_ioctl_class, smb3_##name, \ DEFINE_SMB3_IOCTL_EVENT(ioctl); +DECLARE_EVENT_CLASS(smb3_shutdown_class, + TP_PROTO(__u32 flags, + __u32 tid), + TP_ARGS(flags, tid), + TP_STRUCT__entry( + __field(__u32, flags) + __field(__u32, tid) + ), + TP_fast_assign( + __entry->flags = flags; + __entry->tid = tid; + ), + TP_printk("flags=0x%x tid=0x%x", + __entry->flags, __entry->tid) +) +#define DEFINE_SMB3_SHUTDOWN_EVENT(name) \ +DEFINE_EVENT(smb3_shutdown_class, smb3_##name, \ + TP_PROTO(__u32 flags, \ + __u32 tid), \ + TP_ARGS(flags, tid)) +DEFINE_SMB3_SHUTDOWN_EVENT(shutdown_enter); +DEFINE_SMB3_SHUTDOWN_EVENT(shutdown_done); +DECLARE_EVENT_CLASS(smb3_shutdown_err_class, + TP_PROTO(int rc, + __u32 flags, + __u32 tid), + TP_ARGS(rc, flags, tid), + TP_STRUCT__entry( + __field(int, rc) + __field(__u32, flags) + __field(__u32, tid) + ), + TP_fast_assign( + __entry->rc = rc; + __entry->flags = flags; + __entry->tid = tid; + ), + TP_printk("rc=%d flags=0x%x tid=0x%x", + __entry->rc, __entry->flags, __entry->tid) +) + +#define DEFINE_SMB3_SHUTDOWN_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_shutdown_err_class, smb3_##name, \ + TP_PROTO(int rc, \ + __u32 flags, \ + __u32 tid), \ + TP_ARGS(rc, flags, tid)) + +DEFINE_SMB3_SHUTDOWN_ERR_EVENT(shutdown_err); DECLARE_EVENT_CLASS(smb3_credit_class, TP_PROTO(__u64 currmid, From ddecea00f87f0c46e9c8339a7c89fb2ff891521a Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 1 Aug 2024 18:12:39 -0300 Subject: [PATCH 0472/1052] smb: client: fix FSCTL_GET_REPARSE_POINT against NetApp NetApp server requires the file to be open with FILE_READ_EA access in order to support FSCTL_GET_REPARSE_POINT, otherwise it will return STATUS_INVALID_DEVICE_REQUEST. It doesn't make any sense because there's no requirement for FILE_READ_EA bit to be set nor STATUS_INVALID_DEVICE_REQUEST being used for something other than "unsupported reparse points" in MS-FSA. To fix it and improve compatibility, set FILE_READ_EA & SYNCHRONIZE bits to match what Windows client currently does. 
Tested-by: Sebastian Steinbeisser Acked-by: Tom Talpey Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 062b86a4936f..9f5bc41433c1 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -950,7 +950,8 @@ int smb2_query_path_info(const unsigned int xid, cmds[num_cmds++] = SMB2_OP_GET_REPARSE; oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, - FILE_READ_ATTRIBUTES | FILE_READ_EA, + FILE_READ_ATTRIBUTES | + FILE_READ_EA | SYNCHRONIZE, FILE_OPEN, create_options | OPEN_REPARSE_POINT, ACL_NO_MODE); cifs_get_readable_path(tcon, full_path, &cfile); @@ -1258,7 +1259,8 @@ int smb2_query_reparse_point(const unsigned int xid, cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); cifs_get_readable_path(tcon, full_path, &cfile); - oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES, + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + FILE_READ_ATTRIBUTES | FILE_READ_EA | SYNCHRONIZE, FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, &in_iov, From a91bfa67601c07ff9d31731fd2d624b47b0039f2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 26 Jul 2024 18:44:16 -0500 Subject: [PATCH 0473/1052] cifs: update internal version number To 2.50 Signed-off-by: Steve French --- fs/smb/client/cifsfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 62d5fee3e5eb..ca2bd204bcc5 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -147,6 +147,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 49 -#define CIFS_VERSION "2.49" +#define SMB3_PRODUCT_BUILD 50 +#define CIFS_VERSION "2.50" #endif /* _CIFSFS_H */ From f2655ac2c06a15558e51ed6529de280e1553c86e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Aug 2024 08:46:15 -0700 Subject: [PATCH 0474/1052] clocksource: Fix brown-bag boolean thinko in cs_watchdog_read() The current "nretries > 1 || nretries >= max_retries" check in cs_watchdog_read() will always evaluate to true, and thus pr_warn(), if nretries is greater than 1. The intent is instead to never warn on the first try, but otherwise warn if the successful retry was the last retry. Therefore, change that "||" to "&&". Fixes: db3a34e17433 ("clocksource: Retry clock read if long delays detected") Reported-by: Borislav Petkov Signed-off-by: Paul E. 
McKenney Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240802154618.4149953-2-paulmck@kernel.org --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d25ba49e313c..d0538a75f4c6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -246,7 +246,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); if (wd_delay <= WATCHDOG_MAX_SKEW) { - if (nretries > 1 || nretries >= max_retries) { + if (nretries > 1 && nretries >= max_retries) { pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); } From 90ec3a8a7fd0d43026fcca979713e077d4883b56 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 2 Aug 2024 16:22:13 +0100 Subject: [PATCH 0475/1052] spi: Add empty versions of ACPI functions Provide empty versions of acpi_spi_count_resources(), acpi_spi_device_alloc() and acpi_spi_find_controller_by_adev() if the real functions are not being built. This commit fixes two problems with the original definitions: 1) There wasn't an empty version of these functions 2) The #if only depended on CONFIG_ACPI. But the functions are implemented in the core spi.c so CONFIG_SPI_MASTER must also be enabled for the real functions to exist. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20240802152215.20831-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e4f3f3d30a03..d47d5f14ff99 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -902,12 +902,29 @@ extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); extern void spi_unregister_controller(struct spi_controller *ctlr); -#if IS_ENABLED(CONFIG_ACPI) +#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SPI_MASTER) extern struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev); extern struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, struct acpi_device *adev, int index); int acpi_spi_count_resources(struct acpi_device *adev); +#else +static inline struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev) +{ + return NULL; +} + +static inline struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, + struct acpi_device *adev, + int index) +{ + return ERR_PTR(-ENODEV); +} + +static inline int acpi_spi_count_resources(struct acpi_device *adev) +{ + return 0; +} #endif /* From 32b9a52f88a5713bf8a02dae66f2ad69705de69f Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 23 Jul 2024 16:20:52 +0200 Subject: [PATCH 0476/1052] KVM: arm64: free kvm->arch.nested_mmus with kvfree() kvm->arch.nested_mmus is allocated with kvrealloc(), hence free it with kvfree() instead of kfree(). 
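As background for why the pairing matters, here is a minimal sketch under the usual kvmalloc rules; it is not code from KVM, and the helper name and element type are illustrative. kvmalloc()/kvrealloc() may satisfy large allocations from vmalloc space, and vmalloc memory must never be handed to kfree(); kvfree() checks the address and releases it through whichever allocator backed it.

#include <linux/slab.h>		/* kvmalloc_array(), kvfree() */

/* Sketch only: the buffer may be slab- or vmalloc-backed. */
static u64 *example_alloc_table(size_t entries)
{
	u64 *tbl = kvmalloc_array(entries, sizeof(*tbl), GFP_KERNEL);

	return tbl;	/* must later be released with kvfree(tbl), not kfree(tbl) */
}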
Fixes: 4f128f8e1aaa ("KVM: arm64: nv: Support multiple nested Stage-2 mmu structures") Signed-off-by: Danilo Krummrich Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240723142204.758796-1-dakr@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index de789e0f1ae9..bab27f9d8cc6 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -786,7 +786,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) if (!WARN_ON(atomic_read(&mmu->refcnt))) kvm_free_stage2_pgd(mmu); } - kfree(kvm->arch.nested_mmus); + kvfree(kvm->arch.nested_mmus); kvm->arch.nested_mmus = NULL; kvm->arch.nested_mmus_size = 0; kvm_uninit_stage2_mmu(kvm); From 963a08e586bd45fd55f4c1752e98029ce83fc091 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 23 Jul 2024 12:12:02 +0200 Subject: [PATCH 0477/1052] KVM: arm64: fix override-init warnings in W=1 builds Add -Wno-override-init to the build flags for sys_regs.c, handle_exit.c, and switch.c to fix warnings like the following: arch/arm64/kvm/hyp/vhe/switch.c:271:43: warning: initialized field overwritten [-Woverride-init] 271 | [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, | Signed-off-by: Sebastian Ott Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240723101204.7356-2-sebott@redhat.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/Makefile | 3 +++ arch/arm64/kvm/hyp/nvhe/Makefile | 2 ++ arch/arm64/kvm/hyp/vhe/Makefile | 2 ++ 3 files changed, 7 insertions(+) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index a6497228c5a8..86a629aaf0a1 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -10,6 +10,9 @@ include $(srctree)/virt/kvm/Makefile.kvm obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM) += hyp/ +CFLAGS_sys_regs.o += -Wno-override-init +CFLAGS_handle_exit.o += -Wno-override-init + kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 782b34b004be..b43426a493df 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -20,6 +20,8 @@ HOST_EXTRACFLAGS += -I$(objtree)/include lib-objs := clear_page.o copy_page.o memcpy.o memset.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) +CFLAGS_switch.nvhe.o += -Wno-override-init + hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile index 3b9e5464b5b3..afc4aed9231a 100644 --- a/arch/arm64/kvm/hyp/vhe/Makefile +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -6,6 +6,8 @@ asflags-y := -D__KVM_VHE_HYPERVISOR__ ccflags-y := -D__KVM_VHE_HYPERVISOR__ +CFLAGS_switch.o += -Wno-override-init + obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o From 0aa34b37a78d063da58838b84b20a68a94d919fd Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 23 Jul 2024 12:12:03 +0200 Subject: [PATCH 0478/1052] KVM: arm64: fix kdoc warnings in W=1 builds Fix kdoc warnings by adding missing function parameter descriptions or by conversion to a normal comment. 
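For readers unfamiliar with W=1 kernel-doc checking, the two remedies the commit mentions look roughly like this; both fragments mirror the diff below, and the trailing annotation is editorial. A comment opened with "/**" is treated as kernel-doc and must describe every parameter, while demoting the opener to "/*" makes it an ordinary comment the checker skips.

/**
 * kvm_arch_init_vm - initializes a VM data structure
 * @kvm: pointer to the KVM struct
 * @type: kvm device type
 */

/*
 * vgic_irqfd_set_irq: inject the IRQ corresponding to the
 * irqchip routing entry
 * (plain comment: no parameter descriptions required)
 */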
Signed-off-by: Sebastian Ott Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240723101204.7356-3-sebott@redhat.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/vgic/vgic-irqfd.c | 7 ++++--- arch/arm64/kvm/vgic/vgic-its.c | 18 +++++++++++------- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a7ca776b51ec..23e1fa56c02d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -164,6 +164,7 @@ static int kvm_arm_default_max_vcpus(void) /** * kvm_arch_init_vm - initializes a VM data structure * @kvm: pointer to the KVM struct + * @type: kvm device type */ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 8c711deb25aa..c314c016659a 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -9,7 +9,7 @@ #include #include "vgic.h" -/** +/* * vgic_irqfd_set_irq: inject the IRQ corresponding to the * irqchip routing entry * @@ -75,7 +75,8 @@ static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, msi->flags = e->msi.flags; msi->devid = e->msi.devid; } -/** + +/* * kvm_set_msi: inject the MSI corresponding to the * MSI routing entry * @@ -98,7 +99,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return vgic_its_inject_msi(kvm, &msi); } -/** +/* * kvm_arch_set_irq_inatomic: fast-path for irqfd injection */ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 40bb43f20bf3..ba945ba78cc7 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2040,6 +2040,7 @@ typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, * @start_id: the ID of the first entry in the table * (non zero for 2d level tables) * @fn: function to apply on each entry + * @opaque: pointer to opaque data * * Return: < 0 on error, 0 if last element was identified, 1 otherwise * (the last element may not be found on second level tables) @@ -2079,7 +2080,7 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, return 1; } -/** +/* * vgic_its_save_ite - Save an interrupt translation entry at @gpa */ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, @@ -2099,6 +2100,8 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, /** * vgic_its_restore_ite - restore an interrupt translation entry + * + * @its: its handle * @event_id: id used for indexing * @ptr: pointer to the ITE entry * @opaque: pointer to the its_device @@ -2231,6 +2234,7 @@ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) * @its: ITS handle * @dev: ITS device * @ptr: GPA + * @dte_esz: device table entry size */ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, gpa_t ptr, int dte_esz) @@ -2313,7 +2317,7 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a, return 1; } -/** +/* * vgic_its_save_device_tables - Save the device table and all ITT * into guest RAM * @@ -2386,7 +2390,7 @@ static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, return ret; } -/** +/* * vgic_its_restore_device_tables - Restore the device table and all ITT * from guest RAM to internal data structs */ @@ -2478,7 +2482,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) return 1; } -/** 
+/* * vgic_its_save_collection_table - Save the collection table into * guest RAM */ @@ -2518,7 +2522,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) return ret; } -/** +/* * vgic_its_restore_collection_table - reads the collection table * in guest memory and restores the ITS internal state. Requires the * BASER registers to be restored before. @@ -2556,7 +2560,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) return ret; } -/** +/* * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM * according to v0 ABI */ @@ -2571,7 +2575,7 @@ static int vgic_its_save_tables_v0(struct vgic_its *its) return vgic_its_save_collection_table(its); } -/** +/* * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM * to internal data structs according to V0 ABI * diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index ed6e412cd74b..3eecdd2f4b8f 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -370,7 +370,7 @@ static void map_all_vpes(struct kvm *kvm) dist->its_vm.vpes[i]->irq)); } -/** +/* * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu lock must be held */ From 19d837bc881b2f9f72f9eb506b46c2e2d983896d Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 23 Jul 2024 12:12:04 +0200 Subject: [PATCH 0479/1052] KVM: arm64: vgic: fix unexpected unlock sparse warnings Get rid of unexpected unlock sparse warnings in vgic code by adding an annotation to vgic_queue_irq_unlock(). arch/arm64/kvm/vgic/vgic.c:334:17: warning: context imbalance in 'vgic_queue_irq_unlock' - unexpected unlock arch/arm64/kvm/vgic/vgic.c:419:5: warning: context imbalance in 'kvm_vgic_inject_irq' - different lock contexts for basic block Signed-off-by: Sebastian Ott Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240723101204.7356-4-sebott@redhat.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 2 +- arch/arm64/kvm/vgic/vgic.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index f07b3ddff7d4..974849ea7101 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -313,7 +313,7 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne * with all locks dropped. */ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags) + unsigned long flags) __releases(&irq->irq_lock) { struct kvm_vcpu *vcpu; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 03d356a12377..ba8f790431bd 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -186,7 +186,7 @@ bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags); + unsigned long flags) __releases(&irq->irq_lock); void vgic_kick_vcpus(struct kvm *kvm); void vgic_irq_handle_resampling(struct vgic_irq *irq, bool lr_deactivated, bool lr_pending); From 3d650ab5e7d9c4d7306e4c116f8aa9980bf13295 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 2 Aug 2024 11:54:34 -0700 Subject: [PATCH 0480/1052] selftests/bpf: Fix a btf_dump selftest failure Jakub reported bpf selftest "btf_dump" failure after forwarding to v6.11-rc1 with netdev. 
Error: #33 btf_dump Error: #33/15 btf_dump/btf_dump: var_data btf_dump_data:FAIL:find type id unexpected find type id: actual -2 < expected 0 The reason for the failure is due to commit 94ede2a3e913 ("profiling: remove stale percpu flip buffer variables") where percpu static variable "cpu_profile_flip" is removed. Let us replace "cpu_profile_flip" with a variable in bpf subsystem so whenever that variable gets deleted or renamed, we can detect the failure immediately. In this case, I picked a static percpu variable "bpf_cgrp_storage_busy" which is defined in kernel/bpf/bpf_cgrp_storage.c. Reported-by: Jakub Kicinski Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240802185434.1749056-1-yonghong.song@linux.dev --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 09a8e6f9b379..b293b8501fd6 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -805,8 +805,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, "int cpu_number = (int)100", 100); #endif - TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, - "static int cpu_profile_flip = (int)2", 2); + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, + "static int bpf_cgrp_storage_busy = (int)2", 2); } static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, From e0391e92f9ab4fb3dbdeb139c967dcfa7ac4b115 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 2 Aug 2024 09:38:51 +0100 Subject: [PATCH 0481/1052] btrfs: fix double inode unlock for direct IO sync writes If we do a direct IO sync write, at btrfs_sync_file(), and we need to skip inode logging or we get an error starting a transaction or an error when flushing delalloc, we end up unlocking the inode when we shouldn't under the 'out_release_extents' label, and then unlock it again at btrfs_direct_write(). Fix that by checking if we have to skip inode unlocking under that label. 
Reported-by: syzbot+7dbbb74af6291b5a5a8b@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/000000000000dfd631061eaeb4bc@google.com/ Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write") Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9f10a9f23fcc..9914419f3b7d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1868,7 +1868,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } From 12653ec36112ab55fa06c01db7c4432653d30a8d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 19 Jul 2024 18:56:46 +0930 Subject: [PATCH 0482/1052] btrfs: avoid using fixed char array size for tree names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a bug report that using the latest trunk GCC 15, btrfs would cause an unterminated-string-initialization warning: linux-6.6/fs/btrfs/print-tree.c:29:49: error: initializer-string for array of ‘char’ is too long [-Werror=unterminated-string-initialization] 29 | { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, | ^~~~~~~~~~~~~~~~~~ [CAUSE] To print tree names we have an array of root_name_map structures, which uses "char name[16];" to store the name string of a tree. But the following trees have names that are exactly 16 chars long: - "BLOCK_GROUP_TREE" - "RAID_STRIPE_TREE" This means we will have no space for the terminating '\0', and can lead to unexpected access when printing the name. [FIX] Instead of "char name[16];" use "const char *". Since the name strings are all read-only data, and are all NULL terminated by default, there is not much need to bother with the length at all. Reported-by: Sam James Reported-by: Alejandro Colomar Fixes: edde81f1abf29 ("btrfs: add raid stripe tree pretty printer") Fixes: 9c54e80ddc6bd ("btrfs: add code to support the block group root") CC: stable@vger.kernel.org # 6.1+ Suggested-by: Alejandro Colomar Reviewed-by: Johannes Thumshirn Reviewed-by: Alejandro Colomar Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 32dcea662da3..fc821aa446f0 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -14,7 +14,7 @@ struct root_name_map { u64 id; - char name[16]; + const char *name; }; static const struct root_name_map root_map[] = { From eeef5f183f1c1bdc3ea42b26ded1da2a8a5c69d9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 1 Aug 2024 09:28:42 -0700 Subject: [PATCH 0483/1052] MAINTAINERS: update status of sky2 and skge drivers The old SysKonnect NICs are not used or actively maintained anymore. My sky2 NICs are all in a box in the back corner of the attic.
Signed-off-by: Stephen Hemminger Link: https://patch.msgid.link/20240801162930.212299-1-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8766f3e5e87e..714e113d6eed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13539,7 +13539,7 @@ MARVELL GIGABIT ETHERNET DRIVERS (skge/sky2) M: Mirko Lindner M: Stephen Hemminger L: netdev@vger.kernel.org -S: Maintained +S: Odd fixes F: drivers/net/ethernet/marvell/sk* MARVELL LIBERTAS WIRELESS DRIVER From 9ab0faa7f9ffe31296dbb9bbe6f76c72c14eea18 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 31 Jul 2024 16:46:24 -0700 Subject: [PATCH 0484/1052] sctp: Fix null-ptr-deref in reuseport_add_sock(). syzbot reported a null-ptr-deref while accessing sk2->sk_reuseport_cb in reuseport_add_sock(). [0] The repro first creates a listener with SO_REUSEPORT. Then, it creates another listener on the same port and concurrently closes the first listener. The second listen() calls reuseport_add_sock() with the first listener as sk2, where sk2->sk_reuseport_cb is not expected to be cleared concurrently, but the close() does clear it by reuseport_detach_sock(). The problem is SCTP does not properly synchronise reuseport_alloc(), reuseport_add_sock(), and reuseport_detach_sock(). The caller of reuseport_alloc() and reuseport_{add,detach}_sock() must provide synchronisation for sockets that are classified into the same reuseport group. Otherwise, such sockets form multiple identical reuseport groups, and all groups except one would be silently dead. 1. Two sockets call listen() concurrently 2. No socket in the same group found in sctp_ep_hashtable[] 3. Two sockets call reuseport_alloc() and form two reuseport groups 4. Only one group hit first in __sctp_rcv_lookup_endpoint() receives incoming packets Also, the reported null-ptr-deref could occur. TCP/UDP guarantees that would not happen by holding the hash bucket lock. Let's apply the locking strategy to __sctp_hash_endpoint() and __sctp_unhash_endpoint(). 
[0]: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 1 UID: 0 PID: 10230 Comm: syz-executor119 Not tainted 6.10.0-syzkaller-12585-g301927d2d2eb #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/27/2024 RIP: 0010:reuseport_add_sock+0x27e/0x5e0 net/core/sock_reuseport.c:350 Code: 00 0f b7 5d 00 bf 01 00 00 00 89 de e8 1b a4 ff f7 83 fb 01 0f 85 a3 01 00 00 e8 6d a0 ff f7 49 8d 7e 12 48 89 f8 48 c1 e8 03 <42> 0f b6 04 28 84 c0 0f 85 4b 02 00 00 41 0f b7 5e 12 49 8d 7e 14 RSP: 0018:ffffc9000b947c98 EFLAGS: 00010202 RAX: 0000000000000002 RBX: ffff8880252ddf98 RCX: ffff888079478000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000012 RBP: 0000000000000001 R08: ffffffff8993e18d R09: 1ffffffff1fef385 R10: dffffc0000000000 R11: fffffbfff1fef386 R12: ffff8880252ddac0 R13: dffffc0000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f24e45b96c0(0000) GS:ffff8880b9300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffcced5f7b8 CR3: 00000000241be000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __sctp_hash_endpoint net/sctp/input.c:762 [inline] sctp_hash_endpoint+0x52a/0x600 net/sctp/input.c:790 sctp_listen_start net/sctp/socket.c:8570 [inline] sctp_inet_listen+0x767/0xa20 net/sctp/socket.c:8625 __sys_listen_socket net/socket.c:1883 [inline] __sys_listen+0x1b7/0x230 net/socket.c:1894 __do_sys_listen net/socket.c:1902 [inline] __se_sys_listen net/socket.c:1900 [inline] __x64_sys_listen+0x5a/0x70 net/socket.c:1900 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f24e46039b9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 91 1a 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f24e45b9228 EFLAGS: 00000246 ORIG_RAX: 0000000000000032 RAX: ffffffffffffffda RBX: 00007f24e468e428 RCX: 00007f24e46039b9 RDX: 00007f24e46039b9 RSI: 0000000000000003 RDI: 0000000000000004 RBP: 00007f24e468e420 R08: 00007f24e45b96c0 R09: 00007f24e45b96c0 R10: 00007f24e45b96c0 R11: 0000000000000246 R12: 00007f24e468e42c R13: 00007f24e465a5dc R14: 0020000000000001 R15: 00007ffcced5f7d8 Modules linked in: Fixes: 6ba845740267 ("sctp: process sk_reuseport in sctp_get_port_local") Reported-by: syzbot+e6979a5d2f10ecb700e4@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=e6979a5d2f10ecb700e4 Tested-by: syzbot+e6979a5d2f10ecb700e4@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Acked-by: Xin Long Link: https://patch.msgid.link/20240731234624.94055-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/sctp/input.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 17fcaa9b0df9..a8a254a5008e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -735,15 +735,19 @@ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); struct sctp_hashbucket *head; + int err = 0; ep->hashent = sctp_ep_hashfn(net, ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; + write_lock(&head->lock); 
if (sk->sk_reuseport) { bool any = sctp_is_ep_boundall(sk); struct sctp_endpoint *ep2; struct list_head *list; - int cnt = 0, err = 1; + int cnt = 0; + + err = 1; list_for_each(list, &ep->base.bind_addr.address_list) cnt++; @@ -761,24 +765,24 @@ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) if (!err) { err = reuseport_add_sock(sk, sk2, any); if (err) - return err; + goto out; break; } else if (err < 0) { - return err; + goto out; } } if (err) { err = reuseport_alloc(sk, any); if (err) - return err; + goto out; } } - write_lock(&head->lock); hlist_add_head(&ep->node, &head->chain); +out: write_unlock(&head->lock); - return 0; + return err; } /* Add an endpoint to the hash. Local BH-safe. */ @@ -803,10 +807,9 @@ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) head = &sctp_ep_hashtable[ep->hashent]; + write_lock(&head->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - - write_lock(&head->lock); hlist_del_init(&ep->node); write_unlock(&head->lock); } From 89108cb5c28527c1882df2987394e5c261a1f4aa Mon Sep 17 00:00:00 2001 From: Kyle Swenson Date: Wed, 31 Jul 2024 15:42:14 +0000 Subject: [PATCH 0485/1052] net: pse-pd: tps23881: Fix the device ID check The DEVID register contains two pieces of information: the device ID in the upper nibble, and the silicon revision number in the lower nibble. The driver should work fine with any silicon revision, so let's mask that out in the device ID check. Fixes: 20e6d190ffe1 ("net: pse-pd: Add TI TPS23881 PSE controller driver") Signed-off-by: Kyle Swenson Reviewed-by: Thomas Petazzoni Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20240731154152.4020668-1-kyle.swenson@est.tech Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/tps23881.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/pse-pd/tps23881.c b/drivers/net/pse-pd/tps23881.c index 61f6ad9c1934..f90db758554b 100644 --- a/drivers/net/pse-pd/tps23881.c +++ b/drivers/net/pse-pd/tps23881.c @@ -29,6 +29,8 @@ #define TPS23881_REG_TPON BIT(0) #define TPS23881_REG_FWREV 0x41 #define TPS23881_REG_DEVID 0x43 +#define TPS23881_REG_DEVID_MASK 0xF0 +#define TPS23881_DEVICE_ID 0x02 #define TPS23881_REG_SRAM_CTRL 0x60 #define TPS23881_REG_SRAM_DATA 0x61 @@ -750,7 +752,7 @@ static int tps23881_i2c_probe(struct i2c_client *client) if (ret < 0) return ret; - if (ret != 0x22) { + if (FIELD_GET(TPS23881_REG_DEVID_MASK, ret) != TPS23881_DEVICE_ID) { dev_err(dev, "Wrong device ID\n"); return -ENXIO; } From fba917b169bea5f8f2ee300e19d5f7a6341a5251 Mon Sep 17 00:00:00 2001 From: Praveen Kaligineedi Date: Thu, 1 Aug 2024 13:56:19 -0700 Subject: [PATCH 0486/1052] gve: Fix use of netif_carrier_ok() GVE driver wrongly relies on netif_carrier_ok() to check the interface administrative state when resources are being allocated/deallocated for queue(s). netif_carrier_ok() needs to be replaced with netif_running() for all such cases. Administrative state is the result of "ip link set dev up/down". It reflects whether the administrator wants to use the device for traffic and the corresponding resources have been allocated. 
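To spell out the distinction the fix relies on, here is an illustrative helper; it is not code from the driver and the function name is made up:

#include <linux/netdevice.h>

/*
 * netif_running():    administrative state -- has the interface been brought
 *                     up with "ip link set dev X up"?  Queue resources follow
 *                     this state.
 * netif_carrier_ok(): operational link state -- is the carrier currently up?
 *                     It can be false on an "up" interface whose queues are
 *                     fully allocated, which is why testing it here was wrong.
 */
static bool gve_may_touch_queue_resources(const struct net_device *netdev)
{
	return netif_running(netdev);
}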
Fixes: 5f08cd3d6423 ("gve: Alloc before freeing when adjusting queues") Signed-off-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20240801205619.987396-1-pkaligineedi@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_ethtool.c | 2 +- drivers/net/ethernet/google/gve/gve_main.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index 3480ff5c7ed6..5a8b490ab3ad 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -495,7 +495,7 @@ static int gve_set_channels(struct net_device *netdev, return -EINVAL; } - if (!netif_carrier_ok(netdev)) { + if (!netif_running(netdev)) { priv->tx_cfg.num_queues = new_tx; priv->rx_cfg.num_queues = new_rx; return 0; diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 9744b426940e..661566db68c8 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1566,7 +1566,7 @@ static int gve_set_xdp(struct gve_priv *priv, struct bpf_prog *prog, u32 status; old_prog = READ_ONCE(priv->xdp_prog); - if (!netif_carrier_ok(priv->dev)) { + if (!netif_running(priv->dev)) { WRITE_ONCE(priv->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); @@ -1847,7 +1847,7 @@ int gve_adjust_queues(struct gve_priv *priv, rx_alloc_cfg.qcfg = &new_rx_config; tx_alloc_cfg.num_rings = new_tx_config.num_queues; - if (netif_carrier_ok(priv->dev)) { + if (netif_running(priv->dev)) { err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg); return err; } @@ -2064,7 +2064,7 @@ static int gve_set_features(struct net_device *netdev, if ((netdev->features & NETIF_F_LRO) != (features & NETIF_F_LRO)) { netdev->features ^= NETIF_F_LRO; - if (netif_carrier_ok(netdev)) { + if (netif_running(netdev)) { err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg); if (err) goto revert_features; @@ -2359,7 +2359,7 @@ static int gve_reset_recovery(struct gve_priv *priv, bool was_up) int gve_reset(struct gve_priv *priv, bool attempt_teardown) { - bool was_up = netif_carrier_ok(priv->dev); + bool was_up = netif_running(priv->dev); int err; dev_info(&priv->pdev->dev, "Performing reset\n"); @@ -2700,7 +2700,7 @@ static void gve_shutdown(struct pci_dev *pdev) { struct net_device *netdev = pci_get_drvdata(pdev); struct gve_priv *priv = netdev_priv(netdev); - bool was_up = netif_carrier_ok(priv->dev); + bool was_up = netif_running(priv->dev); rtnl_lock(); if (was_up && gve_close(priv->dev)) { @@ -2718,7 +2718,7 @@ static int gve_suspend(struct pci_dev *pdev, pm_message_t state) { struct net_device *netdev = pci_get_drvdata(pdev); struct gve_priv *priv = netdev_priv(netdev); - bool was_up = netif_carrier_ok(priv->dev); + bool was_up = netif_running(priv->dev); priv->suspend_cnt++; rtnl_lock(); From f874d7210d882cb1c58a8e3da66f61cdc63cd4b4 Mon Sep 17 00:00:00 2001 From: Li Feng Date: Thu, 18 Jul 2024 16:07:22 +0800 Subject: [PATCH 0487/1052] scsi: sd: Keep the discard mode stable There is a scenario where a large number of discard commands are issued when the iscsi initiator connects to the target and then performs a session rescan operation. There is a time window, most of the commands are in UNMAP mode, and some discard commands become WRITE SAME with UNMAP. The discard mode has been negotiated during the SCSI probe. 
If the mode is temporarily changed from UNMAP to WRITE SAME with UNMAP, an I/O ERROR may occur because the target may not implement WRITE SAME with UNMAP. Keep the discard mode stable to fix this issue. Signed-off-by: Li Feng Link: https://lore.kernel.org/r/20240718080751.313102-2-fengli@smartx.com Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 718eb91ba9a5..699f4f9674d9 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2711,8 +2711,6 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; - - sd_config_discard(sdkp, lim, SD_LBP_WS16); } sdkp->capacity = lba + 1; @@ -3365,8 +3363,6 @@ static void sd_read_block_limits(struct scsi_disk *sdkp, sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); - sd_config_discard(sdkp, lim, sd_discard_mode(sdkp)); - config_atomic: sdkp->max_atomic = get_unaligned_be32(&vpd->data[44]); sdkp->atomic_alignment = get_unaligned_be32(&vpd->data[48]); @@ -3755,6 +3751,8 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_zbc_read_zones(sdkp, &lim, buffer); } + sd_config_discard(sdkp, &lim, sd_discard_mode(sdkp)); + sd_print_capacity(sdkp, old_capacity); sd_read_write_protect_flag(sdkp, buffer); From b6547e54864b998af21fdcaa0a88cf8e7efe641a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 2 Aug 2024 18:12:06 -0700 Subject: [PATCH 0488/1052] runtime constants: deal with old decrepit linkers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The runtime constants linker script depended on documented linker behavior [1]: "If an output section’s name is the same as the input section’s name and is representable as a C identifier, then the linker will automatically PROVIDE two symbols: __start_SECNAME and __stop_SECNAME, where SECNAME is the name of the section. These indicate the start address and end address of the output section respectively" to just automatically define the symbol names for the bounds of the runtime constant arrays. It turns out that this isn't actually something we can rely on, with old linkers not generating these automatic symbols. It looks to have been introduced in binutils-2.29 back in 2017, and we still support building with versions all the way back to binutils-2.25 (from 2015). And yes, Oleg actually seems to be using such ancient versions of binutils. So instead of depending on the implicit symbols from "section names match and are representable C identifiers", just do this all manually. It's not like it causes us any extra pain, we already have to do that for all the other sections that we use that often have special characters in them. 
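For reference, once the linker script defines the bounds symbols by hand they are consumed from C in the usual way (a sketch under assumed names; the section and table here are hypothetical, not the kernel's actual runtime-constant tables):

/* The linker script is expected to define these for a section named
 * "runtime_example_table"; each entry records a location to patch.
 */
extern const unsigned long __start_runtime_example_table[];
extern const unsigned long __stop_runtime_example_table[];

static void ex_walk_table(void)
{
        const unsigned long *entry;

        for (entry = __start_runtime_example_table;
             entry < __stop_runtime_example_table; entry++) {
                /* patch the instruction recorded at *entry */
        }
}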
Reported-and-tested-by: Oleg Nesterov Link: https://sourceware.org/binutils/docs/ld/Input-Section-Example.html [1] Link: https://lore.kernel.org/all/20240802114518.GA20924@redhat.com/ Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ad6afc5c4918..1ae44793132a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -911,13 +911,12 @@ #define CON_INITCALL \ BOUNDED_SECTION_POST_LABEL(.con_initcall.init, __con_initcall, _start, _end) -#define RUNTIME_NAME(t,x) runtime_##t##_##x +#define NAMED_SECTION(name) \ + . = ALIGN(8); \ + name : AT(ADDR(name) - LOAD_OFFSET) \ + { BOUNDED_SECTION_PRE_LABEL(name, name, __start_, __stop_) } -#define RUNTIME_CONST(t,x) \ - . = ALIGN(8); \ - RUNTIME_NAME(t,x) : AT(ADDR(RUNTIME_NAME(t,x)) - LOAD_OFFSET) { \ - *(RUNTIME_NAME(t,x)); \ - } +#define RUNTIME_CONST(t,x) NAMED_SECTION(runtime_##t##_##x) /* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */ #define KUNIT_TABLE() \ From 14ab4792ee120c022f276a7e4768f4dcb08f0cdd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Thu, 1 Aug 2024 01:13:28 +0100 Subject: [PATCH 0489/1052] net/tcp: Disable TCP-AO static key after RCU grace period The lifetime of TCP-AO static_key is the same as the last tcp_ao_info. On the socket destruction tcp_ao_info ceases to be with RCU grace period, while tcp-ao static branch is currently deferred destructed. The static key definition is : DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_ao_needed, HZ); which means that if RCU grace period is delayed by more than a second and tcp_ao_needed is in the process of disablement, other CPUs may yet see tcp_ao_info which atent dead, but soon-to-be. And that breaks the assumption of static_key_fast_inc_not_disabled(). See the comment near the definition: > * The caller must make sure that the static key can't get disabled while > * in this function. It doesn't patch jump labels, only adds a user to > * an already enabled static key. Originally it was introduced in commit eb8c507296f6 ("jump_label: Prevent key->enabled int overflow"), which is needed for the atomic contexts, one of which would be the creation of a full socket from a request socket. In that atomic context, it's known by the presence of the key (md5/ao) that the static branch is already enabled. So, the ref counter for that static branch is just incremented instead of holding the proper mutex. static_key_fast_inc_not_disabled() is just a helper for such usage case. But it must not be used if the static branch could get disabled in parallel as it's not protected by jump_label_mutex and as a result, races with jump_label_update() implementation details. Happened on netdev test-bot[1], so not a theoretical issue: [] jump_label: Fatal kernel bug, unexpected op at tcp_inbound_hash+0x1a7/0x870 [ffffffffa8c4e9b7] (eb 50 0f 1f 44 != 66 90 0f 1f 00)) size:2 type:1 [] ------------[ cut here ]------------ [] kernel BUG at arch/x86/kernel/jump_label.c:73! [] Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI [] CPU: 3 PID: 243 Comm: kworker/3:3 Not tainted 6.10.0-virtme #1 [] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [] Workqueue: events jump_label_update_timeout [] RIP: 0010:__jump_label_patch+0x2f6/0x350 ... 
[] Call Trace: [] [] arch_jump_label_transform_queue+0x6c/0x110 [] __jump_label_update+0xef/0x350 [] __static_key_slow_dec_cpuslocked.part.0+0x3c/0x60 [] jump_label_update_timeout+0x2c/0x40 [] process_one_work+0xe3b/0x1670 [] worker_thread+0x587/0xce0 [] kthread+0x28a/0x350 [] ret_from_fork+0x31/0x70 [] ret_from_fork_asm+0x1a/0x30 [] [] Modules linked in: veth [] ---[ end trace 0000000000000000 ]--- [] RIP: 0010:__jump_label_patch+0x2f6/0x350 [1]: https://netdev-3.bots.linux.dev/vmksft-tcp-ao-dbg/results/696681/5-connect-deny-ipv6/stderr Cc: stable@kernel.org Fixes: 67fa83f7c86a ("net/tcp: Add static_key for TCP-AO") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ao.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 85531437890c..db6516092daf 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -267,32 +267,49 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head) kfree_sensitive(key); } -void tcp_ao_destroy_sock(struct sock *sk, bool twsk) +static void tcp_ao_info_free_rcu(struct rcu_head *head) { - struct tcp_ao_info *ao; + struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu); struct tcp_ao_key *key; struct hlist_node *n; + hlist_for_each_entry_safe(key, n, &ao->head, node) { + hlist_del(&key->node); + tcp_sigpool_release(key->tcp_sigpool_id); + kfree_sensitive(key); + } + kfree(ao); + static_branch_slow_dec_deferred(&tcp_ao_needed); +} + +static void tcp_ao_sk_omem_free(struct sock *sk, struct tcp_ao_info *ao) +{ + size_t total_ao_sk_mem = 0; + struct tcp_ao_key *key; + + hlist_for_each_entry(key, &ao->head, node) + total_ao_sk_mem += tcp_ao_sizeof_key(key); + atomic_sub(total_ao_sk_mem, &sk->sk_omem_alloc); +} + +void tcp_ao_destroy_sock(struct sock *sk, bool twsk) +{ + struct tcp_ao_info *ao; + if (twsk) { ao = rcu_dereference_protected(tcp_twsk(sk)->ao_info, 1); - tcp_twsk(sk)->ao_info = NULL; + rcu_assign_pointer(tcp_twsk(sk)->ao_info, NULL); } else { ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1); - tcp_sk(sk)->ao_info = NULL; + rcu_assign_pointer(tcp_sk(sk)->ao_info, NULL); } if (!ao || !refcount_dec_and_test(&ao->refcnt)) return; - hlist_for_each_entry_safe(key, n, &ao->head, node) { - hlist_del_rcu(&key->node); - if (!twsk) - atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc); - call_rcu(&key->rcu, tcp_ao_key_free_rcu); - } - - kfree_rcu(ao, rcu); - static_branch_slow_dec_deferred(&tcp_ao_needed); + if (!twsk) + tcp_ao_sk_omem_free(sk, ao); + call_rcu(&ao->rcu, tcp_ao_info_free_rcu); } void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) From f6c29f710c1ff2590109f83be3e212b86c01e0f3 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 30 Jul 2024 07:19:41 -0700 Subject: [PATCH 0490/1052] i2c: smbus: Send alert notifications to all devices if source not found If a SMBus alert is received and the originating device is not found, the reason may be that the address reported on the SMBus alert address is corrupted, for example because multiple devices asserted alert and do not correctly implement SMBus arbitration. If this happens, call alert handlers on all devices connected to the given I2C bus, in the hope that this cleans up the situation. This change reliably fixed the problem on a system with multiple devices on a single bus. 
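For reference, the callback that both the targeted and the new catch-all paths end up invoking is the i2c_driver .alert() handler; a minimal hedged example of the client side follows (driver and device names are made up, only the callback signature comes from include/linux/i2c.h):

#include <linux/i2c.h>

static void ex_sensor_alert(struct i2c_client *client,
                            enum i2c_alert_protocol protocol,
                            unsigned int data)
{
        if (protocol != I2C_PROTOCOL_SMBUS_ALERT)
                return;

        dev_warn(&client->dev, "SMBALERT# asserted, data 0x%x\n", data);
        /* read and clear the device's status register here so the alert
         * line is deasserted and the handling loop can terminate
         */
}

static struct i2c_driver ex_sensor_driver = {
        .driver = { .name = "ex-sensor" },
        .alert  = ex_sensor_alert,
};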
Example log where the device on address 0x18 (ADM1021) and on address 0x4c (ADT7461A) both had the alert line asserted: smbus_alert 3-000c: SMBALERT# from dev 0x0c, flag 0 smbus_alert 3-000c: no driver alert()! smbus_alert 3-000c: SMBALERT# from dev 0x0c, flag 0 smbus_alert 3-000c: no driver alert()! lm90 3-0018: temp1 out of range, please check! lm90 3-0018: Disabling ALERT# lm90 3-0029: Everything OK lm90 3-002a: Everything OK lm90 3-004c: temp1 out of range, please check! lm90 3-004c: temp2 out of range, please check! lm90 3-004c: Disabling ALERT# Fixes: b5527a7766f0 ("i2c: Add SMBus alert support") Signed-off-by: Guenter Roeck [wsa: fixed a typo in the commit message] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-smbus.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c index 836c247e7684..8256f7aed0cf 100644 --- a/drivers/i2c/i2c-smbus.c +++ b/drivers/i2c/i2c-smbus.c @@ -65,6 +65,32 @@ static int smbus_do_alert(struct device *dev, void *addrp) return ret; } +/* Same as above, but call back all drivers with alert handler */ + +static int smbus_do_alert_force(struct device *dev, void *addrp) +{ + struct i2c_client *client = i2c_verify_client(dev); + struct alert_data *data = addrp; + struct i2c_driver *driver; + + if (!client || (client->flags & I2C_CLIENT_TEN)) + return 0; + + /* + * Drivers should either disable alerts, or provide at least + * a minimal handler. Lock so the driver won't change. + */ + device_lock(dev); + if (client->dev.driver) { + driver = to_i2c_driver(client->dev.driver); + if (driver->alert) + driver->alert(client, data->type, data->data); + } + device_unlock(dev); + + return 0; +} + /* * The alert IRQ handler needs to hand work off to a task which can issue * SMBus calls, because those sleeping calls can't be made in IRQ context. @@ -106,13 +132,19 @@ static irqreturn_t smbus_alert(int irq, void *d) /* * If we read the same address more than once, and the alert * was not handled by a driver, it won't do any good to repeat - * the loop because it will never terminate. - * Bail out in this case. + * the loop because it will never terminate. Try again, this + * time calling the alert handlers of all devices connected to + * the bus, and abort the loop afterwards. If this helps, we + * are all set. If it doesn't, there is nothing else we can do, + * so we might as well abort the loop. * Note: This assumes that a driver with alert handler handles * the alert properly and clears it if necessary. */ - if (data.addr == prev_addr && status != -EBUSY) + if (data.addr == prev_addr && status != -EBUSY) { + device_for_each_child(&ara->adapter->dev, &data, + smbus_do_alert_force); break; + } prev_addr = data.addr; } From f17c06c6608ad4ecd2ccf321753fb511812d821b Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 2 Aug 2024 16:22:14 +0100 Subject: [PATCH 0491/1052] i2c: Fix conditional for substituting empty ACPI functions Add IS_ENABLED(CONFIG_I2C) to the conditional around a bunch of ACPI functions. The conditional around these functions depended only on CONFIG_ACPI. But the functions are implemented in I2C core, so are only present if CONFIG_I2C is enabled. 
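The pattern being adjusted looks roughly like this (illustrative, not the literal i2c.h contents): the real prototypes are only visible when the implementation can exist, and empty static inline stubs take over otherwise so that callers compile in either configuration. The fix simply makes the condition match where the implementation actually lives.

#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_I2C)
int i2c_acpi_client_count(struct acpi_device *adev);
#else
static inline int i2c_acpi_client_count(struct acpi_device *adev)
{
        return 0;
}
#endif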
Signed-off-by: Richard Fitzgerald Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 07e33bbc9256..7eedd0c662da 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1066,7 +1066,7 @@ static inline int of_i2c_get_board_info(struct device *dev, struct acpi_resource; struct acpi_resource_i2c_serialbus; -#if IS_ENABLED(CONFIG_ACPI) +#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_I2C) bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); int i2c_acpi_client_count(struct acpi_device *adev); From ff58838015c14a12cb8c003b9d6fc062b49e8d9e Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 30 Jul 2024 15:00:30 -0600 Subject: [PATCH 0492/1052] arm: dts: arm: versatile-ab: Fix duplicate clock node name Commit 04f08ef291d4 ("arm/arm64: dts: arm: Use generic clock and regulator nodenames") renamed nodes and created 2 "clock-24000000" nodes (at different paths). The kernel can't handle these duplicate names even though they are at different paths. Fix this by renaming one of the nodes to "clock-pclk". This name is aligned with other Arm boards (those didn't have a known frequency to use in the node name). Fixes: 04f08ef291d4 ("arm/arm64: dts: arm: Use generic clock and regulator nodenames") Reported-by: Guenter Roeck Signed-off-by: Rob Herring (Arm) Tested-by: Guenter Roeck Reviewed-by: Linus Walleij Tested-by: Linus Walleij Signed-off-by: Linus Torvalds --- arch/arm/boot/dts/arm/versatile-ab.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/arm/versatile-ab.dts b/arch/arm/boot/dts/arm/versatile-ab.dts index 6fe6b49f5d8e..635ab9268899 100644 --- a/arch/arm/boot/dts/arm/versatile-ab.dts +++ b/arch/arm/boot/dts/arm/versatile-ab.dts @@ -157,7 +157,7 @@ timclk: clock-1000000 { clocks = <&xtal24mhz>; }; - pclk: clock-24000000 { + pclk: clock-pclk { #clock-cells = <0>; compatible = "fixed-factor-clock"; clock-div = <1>; From b88f55389ad27f05ed84af9e1026aa64dbfabc9a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 4 Aug 2024 18:48:10 +0900 Subject: [PATCH 0493/1052] profiling: remove profile=sleep support The kernel sleep profile is no longer working due to a recursive locking bug introduced by commit 42a20f86dc19 ("sched: Add wrapper for get_wchan() to keep task blocked") Booting with the 'profile=sleep' kernel command line option added or executing # echo -n sleep > /sys/kernel/profiling after boot causes the system to lock up. Lockdep reports kthreadd/3 is trying to acquire lock: ffff93ac82e08d58 (&p->pi_lock){....}-{2:2}, at: get_wchan+0x32/0x70 but task is already holding lock: ffff93ac82e08d58 (&p->pi_lock){....}-{2:2}, at: try_to_wake_up+0x53/0x370 with the call trace being lock_acquire+0xc8/0x2f0 get_wchan+0x32/0x70 __update_stats_enqueue_sleeper+0x151/0x430 enqueue_entity+0x4b0/0x520 enqueue_task_fair+0x92/0x6b0 ttwu_do_activate+0x73/0x140 try_to_wake_up+0x213/0x370 swake_up_locked+0x20/0x50 complete+0x2f/0x40 kthread+0xfb/0x180 However, since nobody noticed this regression for more than two years, let's remove 'profile=sleep' support based on the assumption that nobody needs this functionality. 
Fixes: 42a20f86dc19 ("sched: Add wrapper for get_wchan() to keep task blocked") Cc: stable@vger.kernel.org # v5.16+ Signed-off-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 4 +--- include/linux/profile.h | 1 - kernel/profile.c | 11 +---------- kernel/sched/stats.c | 10 ---------- 4 files changed, 2 insertions(+), 24 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1384c7b59c9..09126bb8cc9f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4798,11 +4798,9 @@ profile= [KNL] Enable kernel profiling via /proc/profile Format: [,] - Param: : "schedule", "sleep", or "kvm" + Param: : "schedule" or "kvm" [defaults to kernel profiling] Param: "schedule" - profile schedule points. - Param: "sleep" - profile D-state sleeping (millisecs). - Requires CONFIG_SCHEDSTATS Param: "kvm" - profile VM exits. Param: - step/bucket size as a power of 2 for statistical time based profiling. diff --git a/include/linux/profile.h b/include/linux/profile.h index 2fb487f61d12..3f53cdb0c27c 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -10,7 +10,6 @@ #define CPU_PROFILING 1 #define SCHED_PROFILING 2 -#define SLEEP_PROFILING 3 #define KVM_PROFILING 4 struct proc_dir_entry; diff --git a/kernel/profile.c b/kernel/profile.c index ff68d3816182..1fcf1adcf4eb 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -50,20 +50,11 @@ EXPORT_SYMBOL_GPL(prof_on); int profile_setup(char *str) { static const char schedstr[] = "schedule"; - static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; const char *select = NULL; int par; - if (!strncmp(str, sleepstr, strlen(sleepstr))) { -#ifdef CONFIG_SCHEDSTATS - force_schedstat_enabled(); - prof_on = SLEEP_PROFILING; - select = sleepstr; -#else - pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); -#endif /* CONFIG_SCHEDSTATS */ - } else if (!strncmp(str, schedstr, strlen(schedstr))) { + if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 78e48f5426ee..eb0cdcd4d921 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -92,16 +92,6 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, trace_sched_stat_blocked(p, delta); - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(p), - delta >> 20); - } account_scheduler_latency(p, delta >> 10, 0); } } From de9c2c66ad8e787abec7c9d7eff4f8c3cdd28aed Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 4 Aug 2024 13:50:53 -0700 Subject: [PATCH 0494/1052] Linux 6.11-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8ad55d6e7b60..44c02a6f60a1 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* From 0e8b53979ac86eddb3fd76264025a70071a25574 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 5 Aug 2024 14:01:21 +0900 Subject: [PATCH 0495/1052] bpf: kprobe: remove unused declaring of bpf_kprobe_override After the 
commit 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one"), "bpf_kprobe_override" is not used anywhere anymore, and we can remove it now. Link: https://lore.kernel.org/all/20240710085939.11520-1-dongml2@chinatelecom.cn/ Fixes: 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one") Signed-off-by: Menglong Dong Acked-by: Jiri Olsa Signed-off-by: Masami Hiramatsu (Google) --- include/linux/trace_events.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9df3e2973626..9435185c10ef 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -880,7 +880,6 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); -DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); From 8c8acb8f26cbde665b233dd1b9bbcbb9b86822dc Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 2 Aug 2024 22:53:15 +0900 Subject: [PATCH 0496/1052] kprobes: Fix to check symbol prefixes correctly Since str_has_prefix() takes the prefix as the 2nd argument and the string as the first, is_cfi_preamble_symbol() always fails to check the prefix. Fix the function parameter order so that it correctly check the prefix. Link: https://lore.kernel.org/all/172260679559.362040.7360872132937227206.stgit@devnote2/ Fixes: de02f2ac5d8c ("kprobes: Prohibit probing on CFI preamble symbol") Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e85de37d9e1e..da59c68df841 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1557,8 +1557,8 @@ static bool is_cfi_preamble_symbol(unsigned long addr) if (lookup_symbol_name(addr, symbuf)) return false; - return str_has_prefix("__cfi_", symbuf) || - str_has_prefix("__pfx_", symbuf); + return str_has_prefix(symbuf, "__cfi_") || + str_has_prefix(symbuf, "__pfx_"); } static int check_kprobe_address_safe(struct kprobe *p, From 34e1b1bb73244219b3b3e24911e56c6e7b2b679e Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 1 Aug 2024 14:31:39 +0000 Subject: [PATCH 0497/1052] ALSA: hda: cs35l56: Stop creating ALSA controls for firmware coefficients A number of laptops have gone to market with old firmware versions that export controls that have since been hidden, but we can't just install a newer firmware because the firmware for each product is customized and qualified by the OEM. The issue is that alsactl save and restore has no idea what controls are good to persist which can lead to misconfiguration. There is no reason that the UCM or user should need to interact with any of the ALSA controls for the firmware coefficients so they can be removed entirely, this also simplifies the driver. 
Signed-off-by: Simon Trimmer Link: https://patch.msgid.link/20240801143139.34549-1-simont@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda.c | 38 +------------------------------------ sound/pci/hda/cs35l56_hda.h | 1 - 2 files changed, 1 insertion(+), 38 deletions(-) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index 96d3f13c5abf..31cc92bac89a 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -559,18 +559,6 @@ static void cs35l56_hda_release_firmware_files(const struct firmware *wmfw_firmw kfree(coeff_filename); } -static void cs35l56_hda_create_dsp_controls_work(struct work_struct *work) -{ - struct cs35l56_hda *cs35l56 = container_of(work, struct cs35l56_hda, control_work); - struct hda_cs_dsp_ctl_info info; - - info.device_name = cs35l56->amp_name; - info.fw_type = HDA_CS_DSP_FW_MISC; - info.card = cs35l56->codec->card; - - hda_cs_dsp_add_controls(&cs35l56->cs_dsp, &info); -} - static void cs35l56_hda_apply_calibration(struct cs35l56_hda *cs35l56) { int ret; @@ -595,26 +583,15 @@ static void cs35l56_hda_fw_load(struct cs35l56_hda *cs35l56) char *wmfw_filename = NULL; unsigned int preloaded_fw_ver; bool firmware_missing; - bool add_dsp_controls_required = false; int ret; - /* - * control_work must be flushed before proceeding, but we can't do that - * here as it would create a deadlock on controls_rwsem so it must be - * performed before queuing dsp_work. - */ - WARN_ON_ONCE(work_busy(&cs35l56->control_work)); - /* * Prepare for a new DSP power-up. If the DSP has had firmware * downloaded previously then it needs to be powered down so that it - * can be updated and if hadn't been patched before then the controls - * will need to be added once firmware download succeeds. + * can be updated. */ if (cs35l56->base.fw_patched) cs_dsp_power_down(&cs35l56->cs_dsp); - else - add_dsp_controls_required = true; cs35l56->base.fw_patched = false; @@ -698,15 +675,6 @@ static void cs35l56_hda_fw_load(struct cs35l56_hda *cs35l56) CS35L56_FIRMWARE_MISSING); cs35l56->base.fw_patched = true; - /* - * Adding controls is deferred to prevent a lock inversion - ALSA takes - * the controls_rwsem when adding a control, the get() / put() - * functions of a control are called holding controls_rwsem and those - * that depend on running firmware wait for dsp_work() to complete. 
- */ - if (add_dsp_controls_required) - queue_work(system_long_wq, &cs35l56->control_work); - ret = cs_dsp_run(&cs35l56->cs_dsp); if (ret) dev_dbg(cs35l56->base.dev, "%s: cs_dsp_run ret %d\n", __func__, ret); @@ -753,7 +721,6 @@ static int cs35l56_hda_bind(struct device *dev, struct device *master, void *mas strscpy(comp->name, dev_name(dev), sizeof(comp->name)); comp->playback_hook = cs35l56_hda_playback_hook; - flush_work(&cs35l56->control_work); queue_work(system_long_wq, &cs35l56->dsp_work); cs35l56_hda_create_controls(cs35l56); @@ -775,7 +742,6 @@ static void cs35l56_hda_unbind(struct device *dev, struct device *master, void * struct hda_component *comp; cancel_work_sync(&cs35l56->dsp_work); - cancel_work_sync(&cs35l56->control_work); cs35l56_hda_remove_controls(cs35l56); @@ -806,7 +772,6 @@ static int cs35l56_hda_system_suspend(struct device *dev) struct cs35l56_hda *cs35l56 = dev_get_drvdata(dev); cs35l56_hda_wait_dsp_ready(cs35l56); - flush_work(&cs35l56->control_work); if (cs35l56->playing) cs35l56_hda_pause(cs35l56); @@ -1026,7 +991,6 @@ int cs35l56_hda_common_probe(struct cs35l56_hda *cs35l56, int hid, int id) dev_set_drvdata(cs35l56->base.dev, cs35l56); INIT_WORK(&cs35l56->dsp_work, cs35l56_hda_dsp_work); - INIT_WORK(&cs35l56->control_work, cs35l56_hda_create_dsp_controls_work); ret = cs35l56_hda_read_acpi(cs35l56, hid, id); if (ret) diff --git a/sound/pci/hda/cs35l56_hda.h b/sound/pci/hda/cs35l56_hda.h index c40d159507c2..38d94fb213a5 100644 --- a/sound/pci/hda/cs35l56_hda.h +++ b/sound/pci/hda/cs35l56_hda.h @@ -23,7 +23,6 @@ struct cs35l56_hda { struct cs35l56_base base; struct hda_codec *codec; struct work_struct dsp_work; - struct work_struct control_work; int index; const char *system_name; From 312c04cee408a8448ec8b639fe7f0434017d7161 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 1 Aug 2024 16:50:44 +0100 Subject: [PATCH 0498/1052] ALSA: hda: cs35l41: Stop creating ALSA Controls for firmware coefficients When the CS35L41 loads its firmware, it has a number of controls to affect its behaviour. Currently, these controls are exposed as ALSA Controls. These controls were never intended to be exposed to users but the firmware doesn't mark them hidden, so make the driver ignore them. Any changes in the coefficients handled by these controls needs to be matched to the individual system by SSID, which is already handled using the tuning file, when firmware is loaded, so UCM should not be setting these controls anyway. 
Signed-off-by: Stefan Binding Link: https://patch.msgid.link/20240801155047.456540-1-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 4b411ed8c3fe..3a92e98da72d 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -133,17 +133,6 @@ static const struct reg_sequence cs35l41_hda_mute[] = { { CS35L41_AMP_DIG_VOL_CTRL, 0x0000A678 }, // AMP_HPF_PCM_EN = 1, AMP_VOL_PCM Mute }; -static void cs35l41_add_controls(struct cs35l41_hda *cs35l41) -{ - struct hda_cs_dsp_ctl_info info; - - info.device_name = cs35l41->amp_name; - info.fw_type = cs35l41->firmware_type; - info.card = cs35l41->codec->card; - - hda_cs_dsp_add_controls(&cs35l41->cs_dsp, &info); -} - static const struct cs_dsp_client_ops client_ops = { .control_remove = hda_cs_dsp_control_remove, }; @@ -603,8 +592,6 @@ static int cs35l41_init_dsp(struct cs35l41_hda *cs35l41) if (ret) goto err; - cs35l41_add_controls(cs35l41); - cs35l41_hda_apply_calibration(cs35l41); err: From 38055789d15155109b41602ad719d770af507030 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 1 Aug 2024 18:04:07 +0300 Subject: [PATCH 0499/1052] wifi: ath12k: use 128 bytes aligned iova in transmit path for WCN7850 In transmit path, it is likely that the iova is not aligned to PCIe TLP max payload size, which is 128 for WCN7850. Normally in such cases hardware is expected to split the packet into several parts in a manner such that they, other than the first one, have aligned iova. However due to hardware limitations, WCN7850 does not behave like that properly with some specific unaligned iova in transmit path. This easily results in target hang in a KPI transmit test: packet send/receive failure, WMI command send timeout etc. Also fatal error seen in PCIe level: ... Capabilities: ... ... DevSta: ... FatalErr+ ... ... ... Work around this by manually moving/reallocating payload buffer such that we can map it to a 128 bytes aligned iova. The moving requires sufficient head room or tail room in skb: for the former we can do ourselves a favor by asking some extra bytes when registering with mac80211, while for the latter we can do nothing. Moving/reallocating buffer consumes additional CPU cycles, but the good news is that an aligned iova increases PCIe efficiency. In my tests on some X86 platforms the KPI results are almost consistent. Since this is seen only with WCN7850, add a new hardware parameter to differentiate from others. 
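A worked example of the alignment arithmetic used by the workaround (a userspace-style sketch; the address is made up): with a 128-byte boundary, a payload that starts 72 bytes past a boundary can be made aligned either by moving it 72 bytes toward the head of the buffer or 56 bytes toward the tail, whichever side has enough room; only if neither does is the buffer reallocated.

#include <stdio.h>

int main(void)
{
        unsigned long iova_mask = 128 - 1;        /* ATH12K_PCIE_MAX_PAYLOAD_SIZE - 1 */
        unsigned long iova = 0x10048;             /* hypothetical skb->data address */
        unsigned long offset = iova & iova_mask;  /* 0x48 = 72 bytes past alignment */

        /* prints "need headroom 72 or tailroom 56" */
        printf("need headroom %lu or tailroom %lu\n",
               offset, iova_mask - offset + 1);
        return 0;
}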
Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Baochen Qiang Cc: Tested-by: Mark Pearson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240715023814.20242-1-quic_bqiang@quicinc.com --- drivers/net/wireless/ath/ath12k/dp_tx.c | 72 +++++++++++++++++++++++++ drivers/net/wireless/ath/ath12k/hw.c | 6 +++ drivers/net/wireless/ath/ath12k/hw.h | 4 ++ drivers/net/wireless/ath/ath12k/mac.c | 1 + 4 files changed, 83 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/dp_tx.c b/drivers/net/wireless/ath/ath12k/dp_tx.c index d08c04343e90..44406e0b4a34 100644 --- a/drivers/net/wireless/ath/ath12k/dp_tx.c +++ b/drivers/net/wireless/ath/ath12k/dp_tx.c @@ -162,6 +162,60 @@ static int ath12k_dp_prepare_htt_metadata(struct sk_buff *skb) return 0; } +static void ath12k_dp_tx_move_payload(struct sk_buff *skb, + unsigned long delta, + bool head) +{ + unsigned long len = skb->len; + + if (head) { + skb_push(skb, delta); + memmove(skb->data, skb->data + delta, len); + skb_trim(skb, len); + } else { + skb_put(skb, delta); + memmove(skb->data + delta, skb->data, len); + skb_pull(skb, delta); + } +} + +static int ath12k_dp_tx_align_payload(struct ath12k_base *ab, + struct sk_buff **pskb) +{ + u32 iova_mask = ab->hw_params->iova_mask; + unsigned long offset, delta1, delta2; + struct sk_buff *skb2, *skb = *pskb; + unsigned int headroom = skb_headroom(skb); + int tailroom = skb_tailroom(skb); + int ret = 0; + + offset = (unsigned long)skb->data & iova_mask; + delta1 = offset; + delta2 = iova_mask - offset + 1; + + if (headroom >= delta1) { + ath12k_dp_tx_move_payload(skb, delta1, true); + } else if (tailroom >= delta2) { + ath12k_dp_tx_move_payload(skb, delta2, false); + } else { + skb2 = skb_realloc_headroom(skb, iova_mask); + if (!skb2) { + ret = -ENOMEM; + goto out; + } + + dev_kfree_skb_any(skb); + + offset = (unsigned long)skb2->data & iova_mask; + if (offset) + ath12k_dp_tx_move_payload(skb2, offset, true); + *pskb = skb2; + } + +out: + return ret; +} + int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif, struct sk_buff *skb) { @@ -184,6 +238,7 @@ int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif, bool tcl_ring_retry; bool msdu_ext_desc = false; bool add_htt_metadata = false; + u32 iova_mask = ab->hw_params->iova_mask; if (test_bit(ATH12K_FLAG_CRASH_FLUSH, &ar->ab->dev_flags)) return -ESHUTDOWN; @@ -279,6 +334,23 @@ int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif, goto fail_remove_tx_buf; } + if (iova_mask && + (unsigned long)skb->data & iova_mask) { + ret = ath12k_dp_tx_align_payload(ab, &skb); + if (ret) { + ath12k_warn(ab, "failed to align TX buffer %d\n", ret); + /* don't bail out, give original buffer + * a chance even unaligned. + */ + goto map; + } + + /* hdr is pointing to a wrong place after alignment, + * so refresh it for later use. 
+ */ + hdr = (void *)skb->data; + } +map: ti.paddr = dma_map_single(ab->dev, skb->data, skb->len, DMA_TO_DEVICE); if (dma_mapping_error(ab->dev, ti.paddr)) { atomic_inc(&ab->soc_stats.tx_err.misc_fail);
diff --git a/drivers/net/wireless/ath/ath12k/hw.c b/drivers/net/wireless/ath/ath12k/hw.c
index 2e11ea763574..7b0b6a7f4701 100644
--- a/drivers/net/wireless/ath/ath12k/hw.c
+++ b/drivers/net/wireless/ath/ath12k/hw.c
@@ -924,6 +924,8 @@ static const struct ath12k_hw_params ath12k_hw_params[] = { .acpi_guid = NULL, .supports_dynamic_smps_6ghz = true, + + .iova_mask = 0, }, { .name = "wcn7850 hw2.0",
@@ -1000,6 +1002,8 @@ static const struct ath12k_hw_params ath12k_hw_params[] = { .acpi_guid = &wcn7850_uuid, .supports_dynamic_smps_6ghz = false, + + .iova_mask = ATH12K_PCIE_MAX_PAYLOAD_SIZE - 1, }, { .name = "qcn9274 hw2.0",
@@ -1072,6 +1076,8 @@ static const struct ath12k_hw_params ath12k_hw_params[] = { .acpi_guid = NULL, .supports_dynamic_smps_6ghz = true, + + .iova_mask = 0, }, };
diff --git a/drivers/net/wireless/ath/ath12k/hw.h b/drivers/net/wireless/ath/ath12k/hw.h
index e792eb6b249b..b1d302c48326 100644
--- a/drivers/net/wireless/ath/ath12k/hw.h
+++ b/drivers/net/wireless/ath/ath12k/hw.h
@@ -96,6 +96,8 @@ #define ATH12K_M3_FILE "m3.bin" #define ATH12K_REGDB_FILE_NAME "regdb.bin" +#define ATH12K_PCIE_MAX_PAYLOAD_SIZE 128 + enum ath12k_hw_rate_cck { ATH12K_HW_RATE_CCK_LP_11M = 0, ATH12K_HW_RATE_CCK_LP_5_5M,
@@ -215,6 +217,8 @@ struct ath12k_hw_params { const guid_t *acpi_guid; bool supports_dynamic_smps_6ghz; + + u32 iova_mask; }; struct ath12k_hw_ops {
diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c
index 8106297f0bc1..ce41c8153080 100644
--- a/drivers/net/wireless/ath/ath12k/mac.c
+++ b/drivers/net/wireless/ath/ath12k/mac.c
@@ -9193,6 +9193,7 @@ static int ath12k_mac_hw_register(struct ath12k_hw *ah) hw->vif_data_size = sizeof(struct ath12k_vif); hw->sta_data_size = sizeof(struct ath12k_sta); + hw->extra_tx_headroom = ab->hw_params->iova_mask; wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_STA_TX_PWR);
From 1b85bdb0fadb42f5ef75ddcd259fc1ef13ec04de Mon Sep 17 00:00:00 2001
From: Dnyaneshwar Bhadane
Date: Thu, 1 Aug 2024 16:41:41 +0530
Subject: [PATCH 0500/1052] drm/i915/display: correct dual pps handling for MTL_PCH+
On the PCH side the second PPS was introduced in ICP+. Add a condition so that MTL_PCH and later platforms are also treated as having the second PPS.
Note that DG1/2 south block only has the single PPS, so need to exclude the fake DG1/2 PCHs Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11488 Fixes: 93cbc1accbce ("drm/i915/mtl: Add fake PCH for Meteor Lake") Cc: # v6.9+ Signed-off-by: Dnyaneshwar Bhadane Reviewed-by: Jani Nikula Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240801111141.574854-1-dnyaneshwar.bhadane@intel.com (cherry picked from commit da1878b61c8d480c361ba6a39ce8a31c80b65826) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_backlight.c | 3 +++ drivers/gpu/drm/i915/display/intel_pps.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_backlight.c b/drivers/gpu/drm/i915/display/intel_backlight.c index 071668bfe5d1..6c3333136737 100644 --- a/drivers/gpu/drm/i915/display/intel_backlight.c +++ b/drivers/gpu/drm/i915/display/intel_backlight.c @@ -1449,6 +1449,9 @@ bxt_setup_backlight(struct intel_connector *connector, enum pipe unused) static int cnp_num_backlight_controllers(struct drm_i915_private *i915) { + if (INTEL_PCH_TYPE(i915) >= PCH_MTL) + return 2; + if (INTEL_PCH_TYPE(i915) >= PCH_DG1) return 1; diff --git a/drivers/gpu/drm/i915/display/intel_pps.c b/drivers/gpu/drm/i915/display/intel_pps.c index 42306bc4ba86..7ce926241e83 100644 --- a/drivers/gpu/drm/i915/display/intel_pps.c +++ b/drivers/gpu/drm/i915/display/intel_pps.c @@ -351,6 +351,9 @@ static int intel_num_pps(struct drm_i915_private *i915) if (IS_GEMINILAKE(i915) || IS_BROXTON(i915)) return 2; + if (INTEL_PCH_TYPE(i915) >= PCH_MTL) + return 2; + if (INTEL_PCH_TYPE(i915) >= PCH_DG1) return 1; From b50f2af9fbc5c00103ca8b72752b15310bd77762 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Thu, 1 Aug 2024 21:23:37 +0800 Subject: [PATCH 0501/1052] virtio-net: check feature before configuring the vq coalescing command Virtio spec says: The driver MUST have negotiated the VIRTIO_NET_F_VQ_NOTF_COAL feature when issuing commands VIRTIO_NET_CTRL_NOTF_COAL_VQ_SET and VIRTIO_NET_CTRL_NOTF_COAL_VQ_GET. So we add the feature negotiation check to virtnet_send_{r,t}x_ctrl_coal_vq_cmd as a basis for the next bugfix patch. Suggested-by: Michael S. Tsirkin Signed-off-by: Heng Qi Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 0383a3e136d6..b1176be8fcfd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3658,6 +3658,9 @@ static int virtnet_send_rx_ctrl_coal_vq_cmd(struct virtnet_info *vi, { int err; + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) + return -EOPNOTSUPP; + err = virtnet_send_ctrl_coal_vq_cmd(vi, rxq2vq(queue), max_usecs, max_packets); if (err) @@ -3675,6 +3678,9 @@ static int virtnet_send_tx_ctrl_coal_vq_cmd(struct virtnet_info *vi, { int err; + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) + return -EOPNOTSUPP; + err = virtnet_send_ctrl_coal_vq_cmd(vi, txq2vq(queue), max_usecs, max_packets); if (err) From 4ba8d97083707409822264fd1776aad7233f353e Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Thu, 1 Aug 2024 21:23:38 +0800 Subject: [PATCH 0502/1052] virtio-net: unbreak vq resizing when coalescing is not negotiated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't break the resize action if the vq coalescing feature named VIRTIO_NET_F_VQ_NOTF_COAL is not negotiated. 
Fixes: f61fe5f081cf ("virtio-net: fix the vq coalescing setting for vq resize") Signed-off-by: Heng Qi Reviewed-by: Xuan Zhuo Acked-by: Eugenio Pé rez Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b1176be8fcfd..3f10c72743e9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3749,7 +3749,11 @@ static int virtnet_set_ringparam(struct net_device *dev, err = virtnet_send_tx_ctrl_coal_vq_cmd(vi, i, vi->intr_coal_tx.max_usecs, vi->intr_coal_tx.max_packets); - if (err) + + /* Don't break the tx resize action if the vq coalescing is not + * supported. The same is true for rx resize below. + */ + if (err && err != -EOPNOTSUPP) return err; } @@ -3764,7 +3768,7 @@ static int virtnet_set_ringparam(struct net_device *dev, vi->intr_coal_rx.max_usecs, vi->intr_coal_rx.max_packets); mutex_unlock(&vi->rq[i].dim_lock); - if (err) + if (err && err != -EOPNOTSUPP) return err; } } From 7ab107544b777c3bd7feb9fe447367d8edd5b202 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Thu, 1 Aug 2024 15:55:12 +0200 Subject: [PATCH 0503/1052] net: usb: qmi_wwan: fix memory leak for not ip packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Free the unused skb when not ip packets arrive. Fixes: c6adf77953bc ("net: usb: qmi_wwan: add qmap mux protocol support") Signed-off-by: Daniele Palmas Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 386d62769ded..cfda32047cff 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -201,6 +201,7 @@ static int qmimux_rx_fixup(struct usbnet *dev, struct sk_buff *skb) break; default: /* not ip - do not know what to do */ + kfree_skb(skbn); goto skip; } From 45b4acab4cac79503663f0a4be9eb3752db04d4b Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 5 Aug 2024 10:27:20 +0000 Subject: [PATCH 0504/1052] ASoC: wm_adsp: Add control_add callback and export wm_adsp_control_add() The callback allows codec drivers to affect how firmware coefficients are added as controls. For example a codec driver may selectively add controls by choosing to call wm_adsp_control_add() based on some filter logic. 
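A hedged example of how a codec driver might use the new hook (the name prefix, function name and wiring are illustrative; only the callback signature and wm_adsp_control_add() come from this patch): expose just the coefficient controls whose name carries an agreed prefix and silently skip everything else.

static int ex_codec_control_add(struct wm_adsp *dsp,
                                struct cs_dsp_coeff_ctl *cs_ctl)
{
        /* only coefficients explicitly meant for user space get a control */
        if (cs_ctl->subname && cs_ctl->subname_len >= 3 &&
            !strncmp(cs_ctl->subname, "UI_", 3))
                return wm_adsp_control_add(cs_ctl);

        return 0;       /* anything else: no ALSA control is created */
}

/* wired up once at probe time, before the firmware is booted:
 *      dsp->control_add = ex_codec_control_add;
 */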
Signed-off-by: Simon Trimmer Link: https://patch.msgid.link/20240805102721.30102-2-simont@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm_adsp.c | 17 ++++++++++++++--- sound/soc/codecs/wm_adsp.h | 3 +++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 9f8549b34e30..e69283195f36 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -583,7 +583,7 @@ static void wm_adsp_ctl_work(struct work_struct *work) kfree(kcontrol); } -static int wm_adsp_control_add(struct cs_dsp_coeff_ctl *cs_ctl) +int wm_adsp_control_add(struct cs_dsp_coeff_ctl *cs_ctl) { struct wm_adsp *dsp = container_of(cs_ctl->dsp, struct wm_adsp, cs_dsp); struct cs_dsp *cs_dsp = &dsp->cs_dsp; @@ -658,6 +658,17 @@ static int wm_adsp_control_add(struct cs_dsp_coeff_ctl *cs_ctl) return ret; } +EXPORT_SYMBOL_GPL(wm_adsp_control_add); + +static int wm_adsp_control_add_cb(struct cs_dsp_coeff_ctl *cs_ctl) +{ + struct wm_adsp *dsp = container_of(cs_ctl->dsp, struct wm_adsp, cs_dsp); + + if (dsp->control_add) + return (dsp->control_add)(dsp, cs_ctl); + else + return wm_adsp_control_add(cs_ctl); +} static void wm_adsp_control_remove(struct cs_dsp_coeff_ctl *cs_ctl) { @@ -2072,12 +2083,12 @@ irqreturn_t wm_halo_wdt_expire(int irq, void *data) EXPORT_SYMBOL_GPL(wm_halo_wdt_expire); static const struct cs_dsp_client_ops wm_adsp1_client_ops = { - .control_add = wm_adsp_control_add, + .control_add = wm_adsp_control_add_cb, .control_remove = wm_adsp_control_remove, }; static const struct cs_dsp_client_ops wm_adsp2_client_ops = { - .control_add = wm_adsp_control_add, + .control_add = wm_adsp_control_add_cb, .control_remove = wm_adsp_control_remove, .pre_run = wm_adsp_pre_run, .post_run = wm_adsp_event_post_run, diff --git a/sound/soc/codecs/wm_adsp.h b/sound/soc/codecs/wm_adsp.h index e53dfcf1f78f..edc5b02ae765 100644 --- a/sound/soc/codecs/wm_adsp.h +++ b/sound/soc/codecs/wm_adsp.h @@ -37,6 +37,7 @@ struct wm_adsp { bool wmfw_optional; struct work_struct boot_work; + int (*control_add)(struct wm_adsp *dsp, struct cs_dsp_coeff_ctl *cs_ctl); int (*pre_run)(struct wm_adsp *dsp); bool preloaded; @@ -132,6 +133,8 @@ int wm_adsp_compr_pointer(struct snd_soc_component *component, int wm_adsp_compr_copy(struct snd_soc_component *component, struct snd_compr_stream *stream, char __user *buf, size_t count); + +int wm_adsp_control_add(struct cs_dsp_coeff_ctl *cs_ctl); int wm_adsp_write_ctl(struct wm_adsp *dsp, const char *name, int type, unsigned int alg, void *buf, size_t len); int wm_adsp_read_ctl(struct wm_adsp *dsp, const char *name, int type, From 2c3640b82213cf2beb7c1cc3cfce2ecf5349b0de Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 5 Aug 2024 10:27:21 +0000 Subject: [PATCH 0505/1052] ASoC: cs35l56: Stop creating ALSA controls for firmware coefficients A number of laptops have gone to market with old firmware versions that export controls that have since been hidden, but we can't just install a newer firmware because the firmware for each product is customized and qualified by the OEM. The issue is that alsactl save and restore has no idea what controls are good to persist which can lead to misconfiguration. There is no reason that the UCM or user should need to interact with any of the ALSA controls for the firmware coefficients so they can be removed entirely. 
Fixes: e49611252900 ("ASoC: cs35l56: Add driver for Cirrus Logic CS35L56") Signed-off-by: Simon Trimmer Link: https://patch.msgid.link/20240805102721.30102-3-simont@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c index 84c34f5b1a51..757ade6373ed 100644 --- a/sound/soc/codecs/cs35l56.c +++ b/sound/soc/codecs/cs35l56.c @@ -1095,6 +1095,11 @@ int cs35l56_system_resume(struct device *dev) } EXPORT_SYMBOL_GPL(cs35l56_system_resume); +static int cs35l56_control_add_nop(struct wm_adsp *dsp, struct cs_dsp_coeff_ctl *cs_ctl) +{ + return 0; +} + static int cs35l56_dsp_init(struct cs35l56_private *cs35l56) { struct wm_adsp *dsp; @@ -1117,6 +1122,12 @@ static int cs35l56_dsp_init(struct cs35l56_private *cs35l56) dsp->fw = 12; dsp->wmfw_optional = true; + /* + * None of the firmware controls need to be exported so add a no-op + * callback that suppresses creating an ALSA control. + */ + dsp->control_add = &cs35l56_control_add_nop; + dev_dbg(cs35l56->base.dev, "DSP system name: '%s'\n", dsp->system_name); ret = wm_halo_init(dsp); From dc268085e499666b9f4f0fcb4c5a94e1c0b193b3 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 5 Aug 2024 12:42:22 +0100 Subject: [PATCH 0506/1052] ASoC: cs-amp-lib: Fix NULL pointer crash if efi.get_variable is NULL Call efi_rt_services_supported() to check that efi.get_variable exists before calling it. Signed-off-by: Richard Fitzgerald Fixes: 1cad8725f2b9 ("ASoC: cs-amp-lib: Add helpers for factory calibration data") Link: https://patch.msgid.link/20240805114222.15722-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index 287ac01a3873..605964af8afa 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -108,7 +108,7 @@ static efi_status_t cs_amp_get_efi_variable(efi_char16_t *name, KUNIT_STATIC_STUB_REDIRECT(cs_amp_get_efi_variable, name, guid, size, buf); - if (IS_ENABLED(CONFIG_EFI)) + if (efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) return efi.get_variable(name, guid, &attr, size, buf); return EFI_NOT_FOUND; From ce4a995884ecedb98ba00e2e0b8ce94cde2060ce Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 11:59:31 +0200 Subject: [PATCH 0507/1052] drm/omap: add CONFIG_MMU dependency Compile-testing with CONFIG_MMU disabled causes a link error in omapdrm: arm-linux-gnueabi-ld: drivers/gpu/drm/omapdrm/omap_gem.o: in function `omap_gem_fault_2d': omap_gem.c:(.text+0x36e): undefined reference to `vmf_insert_mixed' arm-linux-gnueabi-ld: drivers/gpu/drm/omapdrm/omap_gem.o: in function `omap_gem_fault': omap_gem.c:(.text+0x74a): undefined reference to `vmf_insert_mixed' Avoid this by adding a Kconfig dependency. 
Fixes: dc6fcaaba5a5 ("drm/omap: Allow build with COMPILE_TEST=y") Signed-off-by: Arnd Bergmann Signed-off-by: Tomi Valkeinen Link: https://patchwork.freedesktop.org/patch/msgid/20240719095942.3841009-1-arnd@kernel.org --- drivers/gpu/drm/omapdrm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/omapdrm/Kconfig b/drivers/gpu/drm/omapdrm/Kconfig index 3f7139e211d2..64e440a2649b 100644 --- a/drivers/gpu/drm/omapdrm/Kconfig +++ b/drivers/gpu/drm/omapdrm/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config DRM_OMAP tristate "OMAP DRM" + depends on MMU depends on DRM && OF depends on ARCH_OMAP2PLUS || (COMPILE_TEST && PAGE_SIZE_LESS_THAN_64KB) select DRM_KMS_HELPER From 15b7a03205b31bc5623378c190d22b7ff60026f1 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 5 Aug 2024 15:01:28 +0200 Subject: [PATCH 0508/1052] ALSA: line6: Fix racy access to midibuf There can be concurrent accesses to line6 midibuf from both the URB completion callback and the rawmidi API access. This could be a cause of KMSAN warning triggered by syzkaller below (so put as reported-by here). This patch protects the midibuf call of the former code path with a spinlock for avoiding the possible races. Reported-by: syzbot+78eccfb8b3c9a85fc6c5@syzkaller.appspotmail.com Closes: https://lore.kernel.org/00000000000000949c061df288c5@google.com Cc: Link: https://patch.msgid.link/20240805130129.10872-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/line6/driver.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c index f4437015d43a..9df49a880b75 100644 --- a/sound/usb/line6/driver.c +++ b/sound/usb/line6/driver.c @@ -286,12 +286,14 @@ static void line6_data_received(struct urb *urb) { struct usb_line6 *line6 = (struct usb_line6 *)urb->context; struct midi_buffer *mb = &line6->line6midi->midibuf_in; + unsigned long flags; int done; if (urb->status == -ESHUTDOWN) return; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { + spin_lock_irqsave(&line6->line6midi->lock, flags); done = line6_midibuf_write(mb, urb->transfer_buffer, urb->actual_length); @@ -300,12 +302,15 @@ static void line6_data_received(struct urb *urb) dev_dbg(line6->ifcdev, "%d %d buffer overflow - message skipped\n", done, urb->actual_length); } + spin_unlock_irqrestore(&line6->line6midi->lock, flags); for (;;) { + spin_lock_irqsave(&line6->line6midi->lock, flags); done = line6_midibuf_read(mb, line6->buffer_message, LINE6_MIDI_MESSAGE_MAXLEN, LINE6_MIDIBUF_READ_RX); + spin_unlock_irqrestore(&line6->line6midi->lock, flags); if (done <= 0) break; From 042b8711a0beafb2c3b888bebe3c300ab4c817fa Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 18 Jul 2024 10:24:10 +0200 Subject: [PATCH 0509/1052] drm/mediatek: Set sensible cursor width/height values to fix crash Hardware-speaking, there is no feature-reduced cursor specific plane, so this driver reserves the last all Overlay plane as a Cursor plane, but sets the maximum cursor width/height to the maximum value that the full overlay plane can use. While this could be ok, it raises issues with common userspace using libdrm (especially Mutter, but other compositors too) which will crash upon performing allocations and/or using said cursor plane. Reduce the maximum width/height for the cursor to 512x512 pixels, value taken from IGT's maximum cursor size test, which succeeds. 
Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Fei Shao Tested-by: Fei Shao Reviewed-by: Daniel Stone Reviewed-by: CK Hu Link: https://patchwork.kernel.org/project/dri-devel/patch/20240718082410.204459-1-angelogioacchino.delregno@collabora.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index ae5c6ec24a1e..77b50c56c124 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -539,8 +539,8 @@ static int mtk_drm_kms_init(struct drm_device *drm) } /* IGT will check if the cursor size is configured */ - drm->mode_config.cursor_width = drm->mode_config.max_width; - drm->mode_config.cursor_height = drm->mode_config.max_height; + drm->mode_config.cursor_width = 512; + drm->mode_config.cursor_height = 512; /* Use OVL device for all DMA memory allocations */ crtc = drm_crtc_from_index(drm, 0); From 9438f970296f9c3a6dd340ae0ad01d2f056c88e6 Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Wed, 31 Jul 2024 07:48:04 +0200 Subject: [PATCH 0510/1052] arm64: dts: ti: k3-am62-verdin-dahlia: Keep CTRL_SLEEP_MOCI# regulator on This reverts commit 3935fbc87ddebea5439f3ab6a78b1e83e976bf88. CTRL_SLEEP_MOCI# is a signal that is defined for all the SoM implementing the Verdin family specification, this signal is supposed to control the power enable in the carrier board when the system is in deep sleep mode. However this is not possible with Texas Instruments AM62 SoC, IOs output buffer is disabled in deep sleep and IOs are in tri-state mode. Given that we cannot properly control this pin, force it to be always high to minimize potential issues. 
Fixes: 3935fbc87dde ("arm64: dts: ti: k3-am62-verdin-dahlia: support sleep-moci") Cc: Link: https://e2e.ti.com/support/processors-group/processors/f/processors-forum/1361669/am625-gpio-output-state-in-deep-sleep/5244802 Signed-off-by: Francesco Dolcini Link: https://lore.kernel.org/r/20240731054804.6061-1-francesco@dolcini.it Signed-off-by: Nishanth Menon --- .../boot/dts/ti/k3-am62-verdin-dahlia.dtsi | 22 ------------------- arch/arm64/boot/dts/ti/k3-am62-verdin.dtsi | 6 ----- 2 files changed, 28 deletions(-) diff --git a/arch/arm64/boot/dts/ti/k3-am62-verdin-dahlia.dtsi b/arch/arm64/boot/dts/ti/k3-am62-verdin-dahlia.dtsi index e8f4d136e5df..9202181fbd65 100644 --- a/arch/arm64/boot/dts/ti/k3-am62-verdin-dahlia.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am62-verdin-dahlia.dtsi @@ -43,15 +43,6 @@ simple-audio-card,cpu { sound-dai = <&mcasp0>; }; }; - - reg_usb_hub: regulator-usb-hub { - compatible = "regulator-fixed"; - enable-active-high; - /* Verdin CTRL_SLEEP_MOCI# (SODIMM 256) */ - gpio = <&main_gpio0 31 GPIO_ACTIVE_HIGH>; - regulator-boot-on; - regulator-name = "HUB_PWR_EN"; - }; }; /* Verdin ETHs */ @@ -193,11 +184,6 @@ &ospi0 { status = "okay"; }; -/* Do not force CTRL_SLEEP_MOCI# always enabled */ -®_force_sleep_moci { - status = "disabled"; -}; - /* Verdin SD_1 */ &sdhci1 { status = "okay"; @@ -218,15 +204,7 @@ &usbss1 { }; &usb1 { - #address-cells = <1>; - #size-cells = <0>; status = "okay"; - - usb-hub@1 { - compatible = "usb424,2744"; - reg = <1>; - vdd-supply = <®_usb_hub>; - }; }; /* Verdin CTRL_WAKE1_MICO# */ diff --git a/arch/arm64/boot/dts/ti/k3-am62-verdin.dtsi b/arch/arm64/boot/dts/ti/k3-am62-verdin.dtsi index 359f53f3e019..5bef31b8577b 100644 --- a/arch/arm64/boot/dts/ti/k3-am62-verdin.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am62-verdin.dtsi @@ -138,12 +138,6 @@ reg_1v8_eth: regulator-1v8-eth { vin-supply = <®_1v8>; }; - /* - * By default we enable CTRL_SLEEP_MOCI#, this is required to have - * peripherals on the carrier board powered. - * If more granularity or power saving is required this can be disabled - * in the carrier board device tree files. - */ reg_force_sleep_moci: regulator-force-sleep-moci { compatible = "regulator-fixed"; enable-active-high; From 87d571d6fb77ec342a985afa8744bb9bb75b3622 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Fri, 17 May 2024 20:22:44 +0000 Subject: [PATCH 0511/1052] ntp: Clamp maxerror and esterror to operating range Using syzkaller alongside the newly reintroduced signed integer overflow sanitizer spits out this report: UBSAN: signed-integer-overflow in ../kernel/time/ntp.c:461:16 9223372036854775807 + 500 cannot be represented in type 'long' Call Trace: handle_overflow+0x171/0x1b0 second_overflow+0x2d6/0x500 accumulate_nsecs_to_secs+0x60/0x160 timekeeping_advance+0x1fe/0x890 update_wall_time+0x10/0x30 time_maxerror is unconditionally incremented and the result is checked against NTP_PHASE_LIMIT, but the increment itself can overflow, resulting in wrap-around to negative space. Before commit eea83d896e31 ("ntp: NTP4 user space bits update") the user supplied value was sanity checked to be in the operating range. That change removed the sanity check and relied on clamping in handle_overflow() which does not work correctly when the user supplied value is in the overflow zone of the '+ 500' operation. The operation requires CAP_SYS_TIME and the side effect of the overflow is NTP getting out of sync. Miroslav confirmed that the input value should be clamped to the operating range and the same applies to time_esterror. 
The latter is not used by the kernel, but the value still should be in the operating range as it was before the sanity check got removed. Clamp them to the operating range. [ tglx: Changed it to clamping and included time_esterror ] Fixes: eea83d896e31 ("ntp: NTP4 user space bits update") Signed-off-by: Justin Stitt Signed-off-by: Thomas Gleixner Cc: Miroslav Lichvar Link: https://lore.kernel.org/all/20240517-b4-sio-ntp-usec-v2-1-d539180f2b79@google.com Closes: https://github.com/KSPP/linux/issues/354 --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 406dccb79c2b..502e1e5b7f7f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -727,10 +727,10 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, } if (txc->modes & ADJ_MAXERROR) - time_maxerror = txc->maxerror; + time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_ESTERROR) - time_esterror = txc->esterror; + time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { time_constant = txc->constant; From 06c03c8edce333b9ad9c6b207d93d3a5ae7c10c0 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Fri, 17 May 2024 00:47:10 +0000 Subject: [PATCH 0512/1052] ntp: Safeguard against time_constant overflow Using syzkaller with the recently reintroduced signed integer overflow sanitizer produces this UBSAN report: UBSAN: signed-integer-overflow in ../kernel/time/ntp.c:738:18 9223372036854775806 + 4 cannot be represented in type 'long' Call Trace: handle_overflow+0x171/0x1b0 __do_adjtimex+0x1236/0x1440 do_adjtimex+0x2be/0x740 The user supplied time_constant value is incremented by four and then clamped to the operating range. Before commit eea83d896e31 ("ntp: NTP4 user space bits update") the user supplied value was sanity checked to be in the operating range. That change removed the sanity check and relied on clamping after incrementing which does not work correctly when the user supplied value is in the overflow zone of the '+ 4' operation. The operation requires CAP_SYS_TIME and the side effect of the overflow is NTP getting out of sync. Similar to the fixups for time_maxerror and time_esterror, clamp the user space supplied value to the operating range. 
[ tglx: Switch to clamping ] Fixes: eea83d896e31 ("ntp: NTP4 user space bits update") Signed-off-by: Justin Stitt Signed-off-by: Thomas Gleixner Cc: Miroslav Lichvar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240517-b4-sio-ntp-c-v2-1-f3a80096f36f@google.com Closes: https://github.com/KSPP/linux/issues/352 --- kernel/time/ntp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 502e1e5b7f7f..8d2dd214ec68 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -733,11 +733,10 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { - time_constant = txc->constant; + time_constant = clamp(txc->constant, 0, MAXTC); if (!(time_status & STA_NANO)) time_constant += 4; - time_constant = min(time_constant, (long)MAXTC); - time_constant = max(time_constant, 0l); + time_constant = clamp(time_constant, 0, MAXTC); } if (txc->modes & ADJ_TAI && From 5916be8a53de6401871bdd953f6c60237b47d6d3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 3 Aug 2024 17:07:51 +0200 Subject: [PATCH 0513/1052] timekeeping: Fix bogus clock_was_set() invocation in do_adjtimex() The addition of the bases argument to clock_was_set() fixed up all call sites correctly except for do_adjtimex(). This uses CLOCK_REALTIME instead of CLOCK_SET_WALL as argument. CLOCK_REALTIME is 0. As a result the effect of that clock_was_set() notification is incomplete and might result in timers expiring late because the hrtimer code does not re-evaluate the affected clock bases. Use CLOCK_SET_WALL instead of CLOCK_REALTIME to tell the hrtimers code which clock bases need to be re-evaluated. Fixes: 17a1b8826b45 ("hrtimer: Add bases argument to clock_was_set()") Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/877ccx7igo.ffs@tglx --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2fa87dcfeda9..5391e4167d60 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2606,7 +2606,7 @@ int do_adjtimex(struct __kernel_timex *txc) clock_set |= timekeeping_advance(TK_ADV_FREQ); if (clock_set) - clock_was_set(CLOCK_REALTIME); + clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(); From 1fb0847392e220890c9cf8908e3ab8e7e1227ff6 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Thu, 11 Jul 2024 14:26:55 +0300 Subject: [PATCH 0514/1052] drm/bridge-connector: Fix double free in error handling paths The recent switch to drmm allocation in drm_bridge_connector_init() may cause double free on bridge_connector in some of the error handling paths. Drop the explicit kfree() calls on bridge_connector. 
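As background for the hunks that follow, which drop the kfree() calls, here is a hedged kernel-style sketch of the drmm ownership rule the fix relies on: memory obtained with drmm_kzalloc() is released by the DRM core together with the drm_device, so error paths simply return. The validation step is a made-up placeholder, not code from the driver.

	#include <linux/err.h>
	#include <linux/slab.h>
	#include <drm/drm_managed.h>
	#include <drm/drm_connector.h>

	static struct drm_connector *example_connector_alloc(struct drm_device *drm,
							     bool valid)
	{
		struct drm_connector *connector;

		connector = drmm_kzalloc(drm, sizeof(*connector), GFP_KERNEL);
		if (!connector)
			return ERR_PTR(-ENOMEM);

		if (!valid)			/* placeholder for a real check */
			return ERR_PTR(-EINVAL);	/* no kfree(): drmm frees it */

		return connector;
	}
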
Fixes: c12907be57b1 ("drm/bridge-connector: switch to using drmm allocations") Signed-off-by: Cristian Ciocaltea Signed-off-by: Robert Foss Reviewed-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20240711-bridge-connector-fix-dbl-free-v1-1-d558b2d0eb93@collabora.com --- drivers/gpu/drm/drm_bridge_connector.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/drm_bridge_connector.c b/drivers/gpu/drm/drm_bridge_connector.c index 0869b663f17e..a4fbf1eb7ac5 100644 --- a/drivers/gpu/drm/drm_bridge_connector.c +++ b/drivers/gpu/drm/drm_bridge_connector.c @@ -443,10 +443,8 @@ struct drm_connector *drm_bridge_connector_init(struct drm_device *drm, panel_bridge = bridge; } - if (connector_type == DRM_MODE_CONNECTOR_Unknown) { - kfree(bridge_connector); + if (connector_type == DRM_MODE_CONNECTOR_Unknown) return ERR_PTR(-EINVAL); - } if (bridge_connector->bridge_hdmi) ret = drmm_connector_hdmi_init(drm, connector, @@ -461,10 +459,8 @@ struct drm_connector *drm_bridge_connector_init(struct drm_device *drm, ret = drmm_connector_init(drm, connector, &drm_bridge_connector_funcs, connector_type, ddc); - if (ret) { - kfree(bridge_connector); + if (ret) return ERR_PTR(ret); - } drm_connector_helper_add(connector, &drm_bridge_connector_helper_funcs);
From b93d16bee557302d4e588375ececd833cc048acc Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Sat, 3 Aug 2024 14:10:41 +0800 Subject: [PATCH 0515/1052] i2c: qcom-geni: Add missing clk_disable_unprepare in geni_i2c_runtime_resume Add the missing clk_disable_unprepare() before return in geni_i2c_runtime_resume(). Fixes: 14d02fbadb5d ("i2c: qcom-geni: add desc struct to prepare support for I2C Master Hub variant") Signed-off-by: Gaosheng Cui Reviewed-by: Vladimir Zapolskiy Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-qcom-geni.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index 0a8b95ce35f7..78f43648e9f3 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -990,8 +990,10 @@ static int __maybe_unused geni_i2c_runtime_resume(struct device *dev) return ret; ret = geni_se_resources_on(&gi2c->se); - if (ret) + if (ret) { + clk_disable_unprepare(gi2c->core_clk); return ret; + } enable_irq(gi2c->irq); gi2c->suspended = 0;
From 9a1af1e218779724ff29ca75f2b9397dc3ed11e7 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 29 Jul 2024 15:13:51 +0200 Subject: [PATCH 0516/1052] ASoC: codecs: lpass-macro: fix missing codec version Recent changes that started checking the codec version broke audio on the Lenovo ThinkPad X13s: wsa_macro 3240000.codec: Unsupported Codec version (0) wsa_macro 3240000.codec: probe with driver wsa_macro failed with error -22 rx_macro 3200000.rxmacro: Unsupported Codec version (0) rx_macro 3200000.rxmacro: probe with driver rx_macro failed with error -22 Add the missing codec version to the lookup table so that the codec drivers probe successfully. Note that I'm just assuming that this is a 2.0 codec based on the fact that this device uses the older register layout.
Fixes: 378918d59181 ("ASoC: codecs: lpass-macro: add helpers to get codec version") Fixes: dbacef05898d ("ASoC: codec: lpass-rx-macro: prepare driver to accomdate new codec versions") Fixes: 727de4fbc546 ("ASoC: codecs: lpass-wsa-macro: Correct support for newer v2.5 version") Signed-off-by: Johan Hovold Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240729131351.27886-1-johan+linaro@kernel.org Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-va-macro.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/codecs/lpass-va-macro.c b/sound/soc/codecs/lpass-va-macro.c index b852cc7ffad9..a62ccd09bacd 100644 --- a/sound/soc/codecs/lpass-va-macro.c +++ b/sound/soc/codecs/lpass-va-macro.c @@ -1472,6 +1472,8 @@ static void va_macro_set_lpass_codec_version(struct va_macro *va) if ((core_id_0 == 0x01) && (core_id_1 == 0x0F)) version = LPASS_CODEC_VERSION_2_0; + if ((core_id_0 == 0x02) && (core_id_1 == 0x0F) && core_id_2 == 0x01) + version = LPASS_CODEC_VERSION_2_0; if ((core_id_0 == 0x02) && (core_id_1 == 0x0E)) version = LPASS_CODEC_VERSION_2_1; if ((core_id_0 == 0x02) && (core_id_1 == 0x0F) && (core_id_2 == 0x50 || core_id_2 == 0x51)) From e42066df07c0fcedebb32ed56f8bc39b4bf86337 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 5 Aug 2024 15:08:39 +0100 Subject: [PATCH 0517/1052] ASoC: cs35l56: Handle OTP read latency over SoundWire Use the late-read buffer in the CS35L56 SoundWire interface to read OTP memory. The OTP memory has a longer access latency than chip registers and cannot guarantee to return the data value in the SoundWire control response if the bus clock is >4.8 MHz. The Cirrus SoundWire peripheral IP exposes the bridge-to-bus read buffer and status bits. For a read from OTP the bridge status bits are polled to wait for the OTP data to be loaded into the read buffer and the data is then read from there. 
Signed-off-by: Richard Fitzgerald Fixes: e1830f66f6c6 ("ASoC: cs35l56: Add helper functions for amp calibration") Link: https://patch.msgid.link/20240805140839.26042-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 5 +++ sound/soc/codecs/cs35l56-sdw.c | 77 ++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index a6aa112e5741..a51acefa785f 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -277,6 +277,11 @@ static inline int cs35l56_force_sync_asp1_registers_from_cache(struct cs35l56_ba return 0; } +static inline bool cs35l56_is_otp_register(unsigned int reg) +{ + return (reg >> 16) == 3; +} + extern struct regmap_config cs35l56_regmap_i2c; extern struct regmap_config cs35l56_regmap_spi; extern struct regmap_config cs35l56_regmap_sdw; diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c index fc03bb7ecae1..7c9a17fe2195 100644 --- a/sound/soc/codecs/cs35l56-sdw.c +++ b/sound/soc/codecs/cs35l56-sdw.c @@ -23,6 +23,79 @@ /* Register addresses are offset when sent over SoundWire */ #define CS35L56_SDW_ADDR_OFFSET 0x8000 +/* Cirrus bus bridge registers */ +#define CS35L56_SDW_MEM_ACCESS_STATUS 0xd0 +#define CS35L56_SDW_MEM_READ_DATA 0xd8 + +#define CS35L56_SDW_LAST_LATE BIT(3) +#define CS35L56_SDW_CMD_IN_PROGRESS BIT(2) +#define CS35L56_SDW_RDATA_RDY BIT(0) + +#define CS35L56_LATE_READ_POLL_US 10 +#define CS35L56_LATE_READ_TIMEOUT_US 1000 + +static int cs35l56_sdw_poll_mem_status(struct sdw_slave *peripheral, + unsigned int mask, + unsigned int match) +{ + int ret, val; + + ret = read_poll_timeout(sdw_read_no_pm, val, + (val < 0) || ((val & mask) == match), + CS35L56_LATE_READ_POLL_US, CS35L56_LATE_READ_TIMEOUT_US, + false, peripheral, CS35L56_SDW_MEM_ACCESS_STATUS); + if (ret < 0) + return ret; + + if (val < 0) + return val; + + return 0; +} + +static int cs35l56_sdw_slow_read(struct sdw_slave *peripheral, unsigned int reg, + u8 *buf, size_t val_size) +{ + int ret, i; + + reg += CS35L56_SDW_ADDR_OFFSET; + + for (i = 0; i < val_size; i += sizeof(u32)) { + /* Poll for bus bridge idle */ + ret = cs35l56_sdw_poll_mem_status(peripheral, + CS35L56_SDW_CMD_IN_PROGRESS, + 0); + if (ret < 0) { + dev_err(&peripheral->dev, "!CMD_IN_PROGRESS fail: %d\n", ret); + return ret; + } + + /* Reading LSByte triggers read of register to holding buffer */ + sdw_read_no_pm(peripheral, reg + i); + + /* Wait for data available */ + ret = cs35l56_sdw_poll_mem_status(peripheral, + CS35L56_SDW_RDATA_RDY, + CS35L56_SDW_RDATA_RDY); + if (ret < 0) { + dev_err(&peripheral->dev, "RDATA_RDY fail: %d\n", ret); + return ret; + } + + /* Read data from buffer */ + ret = sdw_nread_no_pm(peripheral, CS35L56_SDW_MEM_READ_DATA, + sizeof(u32), &buf[i]); + if (ret) { + dev_err(&peripheral->dev, "Late read @%#x failed: %d\n", reg + i, ret); + return ret; + } + + swab32s((u32 *)&buf[i]); + } + + return 0; +} + static int cs35l56_sdw_read_one(struct sdw_slave *peripheral, unsigned int reg, void *buf) { int ret; @@ -48,6 +121,10 @@ static int cs35l56_sdw_read(void *context, const void *reg_buf, int ret; reg = le32_to_cpu(*(const __le32 *)reg_buf); + + if (cs35l56_is_otp_register(reg)) + return cs35l56_sdw_slow_read(peripheral, reg, buf8, val_size); + reg += CS35L56_SDW_ADDR_OFFSET; if (val_size == 4) From 4e436f6fb95e507131df78c0d98052237db60ecc Mon Sep 17 00:00:00 2001 From: Jared McArthur Date: Thu, 1 Aug 2024 16:04:12 -0500 Subject: [PATCH 0518/1052] arm64: dts: ti: k3-am62p: Add 
gpio-ranges for mcu_gpio0 Commit d72d73a44c3c ("arm64: dts: ti: k3-am62p: Add gpio-ranges properties") introduced pinmux range definition for gpio-ranges, however missed introducing the range description for the mcu_gpio node. As a result, automatic mapping of GPIO to pin control for mcu gpios is broken. Fix this by introducing the proper ranges. Fixes: d72d73a44c3c ("arm64: dts: ti: k3-am62p: Add gpio-ranges properties") Signed-off-by: Jared McArthur Link: https://lore.kernel.org/r/20240801210414.715306-2-j-mcarthur@ti.com Signed-off-by: Nishanth Menon --- arch/arm64/boot/dts/ti/k3-am62p-j722s-common-mcu.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/ti/k3-am62p-j722s-common-mcu.dtsi b/arch/arm64/boot/dts/ti/k3-am62p-j722s-common-mcu.dtsi index e65db6ce02bf..df7945156397 100644 --- a/arch/arm64/boot/dts/ti/k3-am62p-j722s-common-mcu.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am62p-j722s-common-mcu.dtsi @@ -146,6 +146,8 @@ mcu_gpio0: gpio@4201000 { power-domains = <&k3_pds 79 TI_SCI_PD_EXCLUSIVE>; clocks = <&k3_clks 79 0>; clock-names = "gpio"; + gpio-ranges = <&mcu_pmx0 0 0 21>, <&mcu_pmx0 21 23 1>, + <&mcu_pmx0 22 32 2>; }; mcu_rti0: watchdog@4880000 { From 98897a300859dca62f834a5d1f60267032a9fe54 Mon Sep 17 00:00:00 2001 From: Jared McArthur Date: Thu, 1 Aug 2024 16:04:13 -0500 Subject: [PATCH 0519/1052] arm64: dts: ti: k3-am62p: Fix gpio-range for main_pmx0 Commit d72d73a44c3c ("arm64: dts: ti: k3-am62p: Add gpio-ranges properties") introduced pinmux range definition for gpio-ranges, however missed a hole within gpio-range for main_pmx0. As a result, automatic mapping of GPIO to pin control for gpios within the main_pmx0 domain is broken. Fix this by correcting the gpio-range. Fixes: d72d73a44c3c ("arm64: dts: ti: k3-am62p: Add gpio-ranges properties") Signed-off-by: Jared McArthur Link: https://lore.kernel.org/r/20240801210414.715306-3-j-mcarthur@ti.com Signed-off-by: Nishanth Menon --- arch/arm64/boot/dts/ti/k3-am62p-main.dtsi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/ti/k3-am62p-main.dtsi b/arch/arm64/boot/dts/ti/k3-am62p-main.dtsi index 57383bd2eaeb..0ce9721b4176 100644 --- a/arch/arm64/boot/dts/ti/k3-am62p-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am62p-main.dtsi @@ -45,7 +45,8 @@ &inta_main_dmss { &main_pmx0 { pinctrl-single,gpio-range = <&main_pmx0_range 0 32 PIN_GPIO_RANGE_IOPAD>, - <&main_pmx0_range 33 92 PIN_GPIO_RANGE_IOPAD>, + <&main_pmx0_range 33 38 PIN_GPIO_RANGE_IOPAD>, + <&main_pmx0_range 72 22 PIN_GPIO_RANGE_IOPAD>, <&main_pmx0_range 137 5 PIN_GPIO_RANGE_IOPAD>, <&main_pmx0_range 143 3 PIN_GPIO_RANGE_IOPAD>, <&main_pmx0_range 149 2 PIN_GPIO_RANGE_IOPAD>; From 04c90681144c40619524367c69e40736a6fa690c Mon Sep 17 00:00:00 2001 From: Jared McArthur Date: Thu, 1 Aug 2024 16:04:14 -0500 Subject: [PATCH 0520/1052] arm64: dts: ti: k3-j722s: Fix gpio-range for main_pmx0 Commit 5e5c50964e2e ("arm64: dts: ti: k3-j722s: Add gpio-ranges properties") introduced pinmux range definition for gpio-ranges, however missed a hole within gpio-range for main_pmx0. As a result, automatic mapping of GPIO to pin control for gpios within the main_pmx0 domain is broken. Fix this by correcting the gpio-range. 
Fixes: 5e5c50964e2e ("arm64: dts: ti: k3-j722s: Add gpio-ranges properties") Signed-off-by: Jared McArthur Link: https://lore.kernel.org/r/20240801210414.715306-4-j-mcarthur@ti.com Signed-off-by: Nishanth Menon --- arch/arm64/boot/dts/ti/k3-j722s-main.dtsi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/ti/k3-j722s-main.dtsi b/arch/arm64/boot/dts/ti/k3-j722s-main.dtsi index c797980528ec..dde4bd5c6645 100644 --- a/arch/arm64/boot/dts/ti/k3-j722s-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j722s-main.dtsi @@ -193,7 +193,8 @@ &inta_main_dmss { &main_pmx0 { pinctrl-single,gpio-range = <&main_pmx0_range 0 32 PIN_GPIO_RANGE_IOPAD>, - <&main_pmx0_range 33 55 PIN_GPIO_RANGE_IOPAD>, + <&main_pmx0_range 33 38 PIN_GPIO_RANGE_IOPAD>, + <&main_pmx0_range 72 17 PIN_GPIO_RANGE_IOPAD>, <&main_pmx0_range 101 25 PIN_GPIO_RANGE_IOPAD>, <&main_pmx0_range 137 5 PIN_GPIO_RANGE_IOPAD>, <&main_pmx0_range 143 3 PIN_GPIO_RANGE_IOPAD>, From 7fef1eb0b013eaa42019a95a08f71368e5a22dba Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Tue, 6 Jun 2023 16:46:28 +0100 Subject: [PATCH 0521/1052] docs: KVM: Fix register ID of SPSR_FIQ Fixes the register ID of SPSR_FIQ. SPSR_FIQ is a 64-bit register and the 64-bit register size mask is 0x0030000000000000ULL. Fixes: fd3bc912d3d1 ("KVM: Documentation: Document arm64 core registers in detail") Signed-off-by: Takahiro Itazuri Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20230606154628.95498-1-itazur@amazon.com Signed-off-by: Oliver Upton --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index fe722c5dada9..87865c8897ca 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2592,7 +2592,7 @@ Specifically: 0x6030 0000 0010 004a SPSR_ABT 64 spsr[KVM_SPSR_ABT] 0x6030 0000 0010 004c SPSR_UND 64 spsr[KVM_SPSR_UND] 0x6030 0000 0010 004e SPSR_IRQ 64 spsr[KVM_SPSR_IRQ] - 0x6060 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] + 0x6030 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] 0x6040 0000 0010 0054 V0 128 fp_regs.vregs[0] [1]_ 0x6040 0000 0010 0058 V1 128 fp_regs.vregs[1] [1]_ ... From 26fef9d0bbeba6bf5d18386bd20aff2c83caa0ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 5 Aug 2024 22:35:43 +0200 Subject: [PATCH 0522/1052] syscalls: fix fstat() entry again The previous patch to fix the newfstatat() syscall entry ended up breaking fstat() instead. Unfortunately these two are not handled the same way, so I messed this one up the exact opposite way. 
Fixes: 343416f0c11c ("syscalls: fix syscall macros for newfstat/newfstatat") Signed-off-by: Arnd Bergmann --- scripts/syscall.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl index 4586a18dfe9b..b93d43561a2c 100644 --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl @@ -100,7 +100,7 @@ 79 stat64 fstatat64 sys_fstatat64 79 64 newfstatat sys_newfstatat 80 stat64 fstat64 sys_fstat64 -80 64 newfstat sys_newfstat +80 64 fstat sys_newfstat 81 common sync sys_sync 82 common fsync sys_fsync 83 common fdatasync sys_fdatasync From f91f7ac900e7342e0fd66093dfbf7cb8cb585a99 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 17 Jul 2024 15:00:23 +0200 Subject: [PATCH 0523/1052] refcount: Report UAF for refcount_sub_and_test(0) when counter==0 When a reference counter is at zero and refcount_sub_and_test() is invoked to subtract zero, the function accepts this request without any warning and returns true. This behavior does not seem ideal because the counter being already at zero indicates a use-after-free. Furthermore, returning true by refcount_sub_and_test() in this case potentially results in a double-free done by its caller. Modify the underlying function __refcount_sub_and_test() to warn about this case as a use-after-free and have it return false to avoid the potential double-free. Signed-off-by: Petr Pavlu Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240717130023.5675-1-petr.pavlu@suse.com Signed-off-by: Kees Cook --- drivers/misc/lkdtm/refcount.c | 16 ++++++++++++++++ include/linux/refcount.h | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/misc/lkdtm/refcount.c b/drivers/misc/lkdtm/refcount.c index 5cd488f54cfa..8f744bee6fbd 100644 --- a/drivers/misc/lkdtm/refcount.c +++ b/drivers/misc/lkdtm/refcount.c @@ -182,6 +182,21 @@ static void lkdtm_REFCOUNT_SUB_AND_TEST_NEGATIVE(void) check_negative(&neg, 3); } +/* + * A refcount_sub_and_test() by zero when the counter is at zero should act like + * refcount_sub_and_test() above when going negative. 
+ */ +static void lkdtm_REFCOUNT_SUB_AND_TEST_ZERO(void) +{ + refcount_t neg = REFCOUNT_INIT(0); + + pr_info("attempting bad refcount_sub_and_test() at zero\n"); + if (refcount_sub_and_test(0, &neg)) + pr_warn("Weird: refcount_sub_and_test() reported zero\n"); + + check_negative(&neg, 0); +} + static void check_from_zero(refcount_t *ref) { switch (refcount_read(ref)) { @@ -400,6 +415,7 @@ static struct crashtype crashtypes[] = { CRASHTYPE(REFCOUNT_DEC_NEGATIVE), CRASHTYPE(REFCOUNT_DEC_AND_TEST_NEGATIVE), CRASHTYPE(REFCOUNT_SUB_AND_TEST_NEGATIVE), + CRASHTYPE(REFCOUNT_SUB_AND_TEST_ZERO), CRASHTYPE(REFCOUNT_INC_ZERO), CRASHTYPE(REFCOUNT_ADD_ZERO), CRASHTYPE(REFCOUNT_INC_SATURATED), diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 59b3b752394d..35f039ecb272 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -266,12 +266,12 @@ bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) if (oldp) *oldp = old; - if (old == i) { + if (old > 0 && old == i) { smp_acquire__after_ctrl_dep(); return true; } - if (unlikely(old < 0 || old - i < 0)) + if (unlikely(old <= 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; From f32e90c0688a3d1f8079ac18ed39b752d22e92bd Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 23 Jul 2024 18:53:31 +0200 Subject: [PATCH 0524/1052] gcc-plugins: randstruct: Remove GCC 4.7 or newer requirement Since the kernel currently requires GCC 5.1 as a minimum, remove the unnecessary GCC version >= 4.7 check. Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20240723165332.1947-1-thorsten.blum@toblux.com Signed-off-by: Kees Cook --- scripts/gcc-plugins/randomize_layout_plugin.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c index 746ff2d272f2..5694df3da2e9 100644 --- a/scripts/gcc-plugins/randomize_layout_plugin.c +++ b/scripts/gcc-plugins/randomize_layout_plugin.c @@ -19,10 +19,6 @@ #include "gcc-common.h" #include "randomize_layout_seed.h" -#if BUILDING_GCC_MAJOR < 4 || (BUILDING_GCC_MAJOR == 4 && BUILDING_GCC_MINOR < 7) -#error "The RANDSTRUCT plugin requires GCC 4.7 or newer." -#endif - #define ORIG_TYPE_NAME(node) \ (TYPE_NAME(TYPE_MAIN_VARIANT(node)) != NULL_TREE ? ((const unsigned char *)IDENTIFIER_POINTER(TYPE_NAME(TYPE_MAIN_VARIANT(node)))) : (const unsigned char *)"anonymous") From 9a2fa1472083580b6c66bdaf291f591e1170123a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Aug 2024 18:02:00 -0400 Subject: [PATCH 0525/1052] fix bitmap corruption on close_range() with CLOSE_RANGE_UNSHARE copy_fd_bitmaps(new, old, count) is expected to copy the first count/BITS_PER_LONG bits from old->full_fds_bits[] and fill the rest with zeroes. What it does is copying enough words (BITS_TO_LONGS(count/BITS_PER_LONG)), then memsets the rest. That works fine, *if* all bits past the cutoff point are clear. Otherwise we are risking garbage from the last word we'd copied. For most of the callers that is true - expand_fdtable() has count equal to old->max_fds, so there's no open descriptors past count, let alone fully occupied words in ->open_fds[], which is what bits in ->full_fds_bits[] correspond to. The other caller (dup_fd()) passes sane_fdtable_size(old_fdt, max_fds), which is the smallest multiple of BITS_PER_LONG that covers all opened descriptors below max_fds. 
In the common case (copying on fork()) max_fds is ~0U, so all opened descriptors will be below it and we are fine, by the same reasons why the call in expand_fdtable() is safe. Unfortunately, there is a case where max_fds is less than that and where we might, indeed, end up with junk in ->full_fds_bits[] - close_range(from, to, CLOSE_RANGE_UNSHARE) with * descriptor table being currently shared * 'to' being above the current capacity of descriptor table * 'from' being just under some chunk of opened descriptors. In that case we end up with observably wrong behaviour - e.g. spawn a child with CLONE_FILES, get all descriptors in range 0..127 open, then close_range(64, ~0U, CLOSE_RANGE_UNSHARE) and watch dup(0) ending up with descriptor #128, despite #64 being observably not open. The minimally invasive fix would be to deal with that in dup_fd(). If this proves to add measurable overhead, we can go that way, but let's try to fix copy_fd_bitmaps() first. * new helper: bitmap_copy_and_expand(to, from, bits_to_copy, size). * make copy_fd_bitmaps() take the bitmap size in words, rather than bits; it's 'count' argument is always a multiple of BITS_PER_LONG, so we are not losing any information, and that way we can use the same helper for all three bitmaps - compiler will see that count is a multiple of BITS_PER_LONG for the large ones, so it'll generate plain memcpy()+memset(). Reproducer added to tools/testing/selftests/core/close_range_test.c Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/file.c | 28 +++++++-------- include/linux/bitmap.h | 12 +++++++ .../testing/selftests/core/close_range_test.c | 35 +++++++++++++++++++ 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/fs/file.c b/fs/file.c index a11e59b5d602..655338effe9c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -46,27 +46,23 @@ static void free_fdtable_rcu(struct rcu_head *rcu) #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) +#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. 
*/ -static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, - unsigned int count) +static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, + unsigned int copy_words) { - unsigned int cpy, set; + unsigned int nwords = fdt_words(nfdt); - cpy = count / BITS_PER_BYTE; - set = (nfdt->max_fds - count) / BITS_PER_BYTE; - memcpy(nfdt->open_fds, ofdt->open_fds, cpy); - memset((char *)nfdt->open_fds + cpy, 0, set); - memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); - memset((char *)nfdt->close_on_exec + cpy, 0, set); - - cpy = BITBIT_SIZE(count); - set = BITBIT_SIZE(nfdt->max_fds) - cpy; - memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); - memset((char *)nfdt->full_fds_bits + cpy, 0, set); + bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, + copy_words, nwords); } /* @@ -84,7 +80,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); - copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); + copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* @@ -379,7 +375,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int open_files = sane_fdtable_size(old_fdt, max_fds); } - copy_fd_bitmaps(new_fdt, old_fdt, open_files); + copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 8c4768c44a01..d3b66d77df7a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -270,6 +270,18 @@ static inline void bitmap_copy_clear_tail(unsigned long *dst, dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); } +static inline void bitmap_copy_and_extend(unsigned long *to, + const unsigned long *from, + unsigned int count, unsigned int size) +{ + unsigned int copy = BITS_TO_LONGS(count); + + memcpy(to, from, copy * sizeof(long)); + if (count % BITS_PER_LONG) + to[copy - 1] &= BITMAP_LAST_WORD_MASK(count); + memset(to + copy, 0, bitmap_size(size) - copy * sizeof(long)); +} + /* * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 * machines the order of hi and lo parts of numbers match the bitmap structure. diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 991c473e3859..12b4eb9d0434 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -589,4 +589,39 @@ TEST(close_range_cloexec_unshare_syzbot) EXPECT_EQ(close(fd3), 0); } +TEST(close_range_bitmap_corruption) +{ + pid_t pid; + int status; + struct __clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + + /* get the first 128 descriptors open */ + for (int i = 2; i < 128; i++) + EXPECT_GE(dup2(0, i), 0); + + /* get descriptor table shared */ + pid = sys_clone3(&args, sizeof(args)); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* unshare and truncate descriptor table down to 64 */ + if (sys_close_range(64, ~0U, CLOSE_RANGE_UNSHARE)) + exit(EXIT_FAILURE); + + ASSERT_EQ(fcntl(64, F_GETFD), -1); + /* ... 
and verify that the range 64..127 is not + stuck "fully used" according to secondary bitmap */ + EXPECT_EQ(dup(0), 64) + exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + TEST_HARNESS_MAIN From 92c4ee25208d0f35dafc3213cdf355fbe449e078 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 2 Aug 2024 11:07:30 +0300 Subject: [PATCH 0526/1052] net: bridge: mcast: wait for previous gc cycles when removing port syzbot hit a use-after-free[1] which is caused because the bridge doesn't make sure that all previous garbage has been collected when removing a port. What happens is: CPU 1 CPU 2 start gc cycle remove port acquire gc lock first wait for lock call br_multicasg_gc() directly acquire lock now but free port the port can be freed while grp timers still running Make sure all previous gc cycles have finished by using flush_work before freeing the port. [1] BUG: KASAN: slab-use-after-free in br_multicast_port_group_expired+0x4c0/0x550 net/bridge/br_multicast.c:861 Read of size 8 at addr ffff888071d6d000 by task syz.5.1232/9699 CPU: 1 PID: 9699 Comm: syz.5.1232 Not tainted 6.10.0-rc5-syzkaller-00021-g24ca36a562d6 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/07/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0xc3/0x620 mm/kasan/report.c:488 kasan_report+0xd9/0x110 mm/kasan/report.c:601 br_multicast_port_group_expired+0x4c0/0x550 net/bridge/br_multicast.c:861 call_timer_fn+0x1a3/0x610 kernel/time/timer.c:1792 expire_timers kernel/time/timer.c:1843 [inline] __run_timers+0x74b/0xaf0 kernel/time/timer.c:2417 __run_timer_base kernel/time/timer.c:2428 [inline] __run_timer_base kernel/time/timer.c:2421 [inline] run_timer_base+0x111/0x190 kernel/time/timer.c:2437 Reported-by: syzbot+263426984509be19c9a0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=263426984509be19c9a0 Fixes: e12cec65b554 ("net: bridge: mcast: destroy all entries via gc") Signed-off-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20240802080730.3206303-1-razor@blackwall.org Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 9a1cb5079a7a..b2ae0d2434d2 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2045,16 +2045,14 @@ void br_multicast_del_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; struct net_bridge_port_group *pg; - HLIST_HEAD(deleted_head); struct hlist_node *n; /* Take care of the remaining groups, only perm ones should be left */ spin_lock_bh(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) br_multicast_find_del_pg(br, pg); - hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); - br_multicast_gc(&deleted_head); + flush_work(&br->mcast_gc_work); br_multicast_port_ctx_deinit(&port->multicast_ctx); free_percpu(port->mcast_stats); } From e2006140ad2e01a02ed0aff49cc2ae3ceeb11f8d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 13 Jun 2024 15:05:03 +0300 Subject: [PATCH 0527/1052] thunderbolt: Mark XDomain as unplugged when router is removed I noticed that when we do discrete host router NVM upgrade and it gets hot-removed from the PCIe side as a 
result of NVM firmware authentication, if there is another host connected with enabled paths we hang in tearing them down. This is due to the fact that the Thunderbolt networking driver also tries to clean up the paths and ends up blocking in tb_disconnect_xdomain_paths() waiting for the domain lock. However, at this point we have already cleaned the paths in tb_stop(), so there is really no need for tb_disconnect_xdomain_paths() to do that anymore. Furthermore it already checks if the XDomain is unplugged and bails out early, so take advantage of that and mark the XDomain as unplugged when we remove the parent router. Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/switch.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 326433df5880..6a2116cbb06f 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -3392,6 +3392,7 @@ void tb_switch_remove(struct tb_switch *sw) tb_switch_remove(port->remote->sw); port->remote = NULL; } else if (port->xdomain) { + port->xdomain->is_unplugged = true; tb_xdomain_remove(port->xdomain); port->xdomain = NULL; }
From 5a44bb061d04b0306f2aa8add761d86d152b9377 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 1 Aug 2024 14:31:09 +0200 Subject: [PATCH 0528/1052] KVM: s390: fix validity interception issue when gisa is switched off We might run into a SIE validity intercept if gisa has been disabled, either via the kernel parameter "kvm.use_gisa=0" or by setting the related sysfs attribute to N (echo N >/sys/module/kvm/parameters/use_gisa). The validity is caused by an invalid value in the SIE control block's gisa designation. That happens because we pass the uninitialized gisa origin to virt_to_phys() before writing it to the gisa designation. To fix this we return 0 in kvm_s390_get_gisa_desc() if the origin is 0. kvm_s390_get_gisa_desc() is used to determine which gisa designation to set in the SIE control block. A value of 0 in the gisa designation disables gisa usage. The issue surfaces in the host kernel with the following kernel message as soon as a new kvm guest start is attempted.
kvm: unhandled validity intercept 0x1011 WARNING: CPU: 0 PID: 781237 at arch/s390/kvm/intercept.c:101 kvm_handle_sie_intercept+0x42e/0x4d0 [kvm] Modules linked in: vhost_net tap tun xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT xt_tcpudp nft_compat x_tables nf_nat_tftp nf_conntrack_tftp vfio_pci_core irqbypass vhost_vsock vmw_vsock_virtio_transport_common vsock vhost vhost_iotlb kvm nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables sunrpc mlx5_ib ib_uverbs ib_core mlx5_core uvdevice s390_trng eadm_sch vfio_ccw zcrypt_cex4 mdev vfio_iommu_type1 vfio sch_fq_codel drm i2c_core loop drm_panel_orientation_quirks configfs nfnetlink lcs ctcm fsm dm_service_time ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common dm_mirror dm_region_hash dm_log zfcp scsi_transport_fc scsi_dh_rdac scsi_dh_emc scsi_dh_alua pkey zcrypt dm_multipath rng_core autofs4 [last unloaded: vfio_pci] CPU: 0 PID: 781237 Comm: CPU 0/KVM Not tainted 6.10.0-08682-gcad9f11498ea #6 Hardware name: IBM 3931 A01 701 (LPAR) Krnl PSW : 0704c00180000000 000003d93deb0122 (kvm_handle_sie_intercept+0x432/0x4d0 [kvm]) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 Krnl GPRS: 000003d900000027 000003d900000023 0000000000000028 000002cd00000000 000002d063a00900 00000359c6daf708 00000000000bebb5 0000000000001eff 000002cfd82e9000 000002cfd80bc000 0000000000001011 000003d93deda412 000003ff8962df98 000003d93de77ce0 000003d93deb011e 00000359c6daf960 Krnl Code: 000003d93deb0112: c020fffe7259 larl %r2,000003d93de7e5c4 000003d93deb0118: c0e53fa8beac brasl %r14,000003d9bd3c7e70 #000003d93deb011e: af000000 mc 0,0 >000003d93deb0122: a728ffea lhi %r2,-22 000003d93deb0126: a7f4fe24 brc 15,000003d93deafd6e 000003d93deb012a: 9101f0b0 tm 176(%r15),1 000003d93deb012e: a774fe48 brc 7,000003d93deafdbe 000003d93deb0132: 40a0f0ae sth %r10,174(%r15) Call Trace: [<000003d93deb0122>] kvm_handle_sie_intercept+0x432/0x4d0 [kvm] ([<000003d93deb011e>] kvm_handle_sie_intercept+0x42e/0x4d0 [kvm]) [<000003d93deacc10>] vcpu_post_run+0x1d0/0x3b0 [kvm] [<000003d93deaceda>] __vcpu_run+0xea/0x2d0 [kvm] [<000003d93dead9da>] kvm_arch_vcpu_ioctl_run+0x16a/0x430 [kvm] [<000003d93de93ee0>] kvm_vcpu_ioctl+0x190/0x7c0 [kvm] [<000003d9bd728b4e>] vfs_ioctl+0x2e/0x70 [<000003d9bd72a092>] __s390x_sys_ioctl+0xc2/0xd0 [<000003d9be0e9222>] __do_syscall+0x1f2/0x2e0 [<000003d9be0f9a90>] system_call+0x70/0x98 Last Breaking-Event-Address: [<000003d9bd3c7f58>] __warn_printk+0xe8/0xf0 Cc: stable@vger.kernel.org Reported-by: Christian Borntraeger Fixes: fe0ef0030463 ("KVM: s390: sort out physical vs virtual pointers usage") Signed-off-by: Michael Mueller Tested-by: Christian Borntraeger Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240801123109.2782155-1-mimu@linux.ibm.com Message-ID: <20240801123109.2782155-1-mimu@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index bf8534218af3..e680c6bf0c9d 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -267,7 +267,12 @@ static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots) static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) { - u32 gd = virt_to_phys(kvm->arch.gisa_int.origin); + u32 gd; + + if 
(!kvm->arch.gisa_int.origin) + return 0; + + gd = virt_to_phys(kvm->arch.gisa_int.origin); if (gd && sclp.has_gisaf) gd |= GISA_FORMAT1; From 7e1e206b99f4b3345aeb49d94584a420b7887f1d Mon Sep 17 00:00:00 2001 From: Steven 'Steve' Kendall Date: Tue, 6 Aug 2024 00:08:24 +0000 Subject: [PATCH 0529/1052] ALSA: hda: Add HP MP9 G4 Retail System AMS to force connect list In recent HP UEFI firmware (likely v2.15 and above, tested on 2.27), these pins are incorrectly set for HDMI/DP audio. Tested on HP MP9 G4 Retail System AMS. Tested audio with two monitors connected via DisplayPort. Link: https://forum.manjaro.org/t/intel-cannon-lake-pch-cavs-conexant-cx20632-no-sound-at-hdmi-or-displayport/133494 Link: https://bbs.archlinux.org/viewtopic.php?id=270523 Signed-off-by: Steven 'Steve' Kendall Cc: Link: https://patch.msgid.link/20240806-hdmi-audio-hp-wrongpins-v2-1-d9eb4ad41043@chromium.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 707d203ba652..4e7361d1d518 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -1989,6 +1989,7 @@ static int hdmi_add_cvt(struct hda_codec *codec, hda_nid_t cvt_nid) } static const struct snd_pci_quirk force_connect_list[] = { + SND_PCI_QUIRK(0x103c, 0x83ef, "HP MP9 G4 Retail System AMS", 1), SND_PCI_QUIRK(0x103c, 0x870f, "HP", 1), SND_PCI_QUIRK(0x103c, 0x871a, "HP", 1), SND_PCI_QUIRK(0x103c, 0x8711, "HP", 1), From b82c1d235a30622177ce10dcb94dfd691a49922f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 5 Aug 2024 22:38:29 +0200 Subject: [PATCH 0530/1052] syscalls: add back legacy __NR_nfsservctl macro The conversion from the old unistd.h file to syscall.tbl dropped the nfsservctl macro. This one was handled inconsistently across architectures in the original introduction of the syscall.tbl format, and I went the other way on this. The syscall was already gone in linux-3.1 before the current users of the generic table (other than openrisc) first appeared, so nobody could actally use it, but putting the number back helps for consistency since there are build scripts that check the presence of all these macros. Link: https://bugzilla.redhat.com/show_bug.cgi?id=2301919 Signed-off-by: Arnd Bergmann --- scripts/syscall.tbl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl index b93d43561a2c..845e24eb372e 100644 --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl @@ -53,6 +53,7 @@ 39 common umount2 sys_umount 40 common mount sys_mount 41 common pivot_root sys_pivot_root +42 common nfsservctl sys_ni_syscall 43 32 statfs64 sys_statfs64 compat_sys_statfs64 43 64 statfs sys_statfs 44 32 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 From 176fd1511dd9086ab4fa9323cb232177c6235288 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 6 Aug 2024 08:49:16 +0200 Subject: [PATCH 0531/1052] ALSA: hda/hdmi: Yet more pin fix for HP EliteDesk 800 G4 HP EliteDesk 800 G4 (PCI SSID 103c:83e2) is another Kabylake machine where BIOS misses the HDMI pin initializations. Add the quirk entry. 
Cc: Link: https://patch.msgid.link/20240806064918.11132-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 4e7361d1d518..78042ac2b71f 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -1989,6 +1989,7 @@ static int hdmi_add_cvt(struct hda_codec *codec, hda_nid_t cvt_nid) } static const struct snd_pci_quirk force_connect_list[] = { + SND_PCI_QUIRK(0x103c, 0x83e2, "HP EliteDesk 800 G4", 1), SND_PCI_QUIRK(0x103c, 0x83ef, "HP MP9 G4 Retail System AMS", 1), SND_PCI_QUIRK(0x103c, 0x870f, "HP", 1), SND_PCI_QUIRK(0x103c, 0x871a, "HP", 1), From 402d336053a5d827c70ec11109e079811e86e0e8 Mon Sep 17 00:00:00 2001 From: Parth Pancholi Date: Tue, 30 Jul 2024 11:37:54 +0200 Subject: [PATCH 0532/1052] arm64: dts: ti: k3-j784s4-main: Correct McASP DMAs Correct the McASP nodes - mcasp3 and mcasp4 with the right DMAs thread IDs as per TISCI documentation [1] for J784s4. This fixes the related McASPs probe failure due to incorrect DMA IDs. Link: http://downloads.ti.com/tisci/esd/latest/5_soc_doc/j784s4/psil_cfg.html#psi-l-source-and-destination-thread-ids/ [1] Fixes: 5095ec4aa1ea ("arm64: dts: ti: k3-j784s4-main: Add McASP nodes") Signed-off-by: Parth Pancholi Reviewed-by: Jayesh Choudhary Link: https://lore.kernel.org/r/20240730093754.1659782-1-parth105105@gmail.com Signed-off-by: Nishanth Menon --- arch/arm64/boot/dts/ti/k3-j784s4-main.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/ti/k3-j784s4-main.dtsi b/arch/arm64/boot/dts/ti/k3-j784s4-main.dtsi index f170f80f00c1..d4ac1c9872a5 100644 --- a/arch/arm64/boot/dts/ti/k3-j784s4-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j784s4-main.dtsi @@ -2755,7 +2755,7 @@ mcasp3: mcasp@2b30000 { interrupts = , ; interrupt-names = "tx", "rx"; - dmas = <&main_udmap 0xc500>, <&main_udmap 0x4500>; + dmas = <&main_udmap 0xc403>, <&main_udmap 0x4403>; dma-names = "tx", "rx"; clocks = <&k3_clks 268 0>; clock-names = "fck"; @@ -2773,7 +2773,7 @@ mcasp4: mcasp@2b40000 { interrupts = , ; interrupt-names = "tx", "rx"; - dmas = <&main_udmap 0xc501>, <&main_udmap 0x4501>; + dmas = <&main_udmap 0xc404>, <&main_udmap 0x4404>; dma-names = "tx", "rx"; clocks = <&k3_clks 269 0>; clock-names = "fck"; From 1ac5167b3a90c9820daa64cc65e319b2d958d686 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Fri, 2 Aug 2024 10:38:49 +0200 Subject: [PATCH 0533/1052] drm/i915/gem: Adjust vma offset for framebuffer mmap offset When mapping a framebuffer object, the virtual memory area (VMA) offset ('vm_pgoff') should be adjusted by the start of the 'vma_node' associated with the object. This ensures that the VMA offset is correctly aligned with the corresponding offset within the GGTT aperture. Increment vm_pgoff by the start of the vma_node with the offset= provided by the user. 
Suggested-by: Chris Wilson Signed-off-by: Andi Shyti Reviewed-by: Jonathan Cavitt Reviewed-by: Rodrigo Vivi Cc: # v4.9+ [Joonas: Add Cc: stable] Signed-off-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20240802083850.103694-2-andi.shyti@linux.intel.com (cherry picked from commit 60a2066c50058086510c91f404eb582029650970) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index a2195e28b625..ce10dd259812 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -1084,6 +1084,8 @@ int i915_gem_fb_mmap(struct drm_i915_gem_object *obj, struct vm_area_struct *vma mmo = mmap_offset_attach(obj, mmap_type, NULL); if (IS_ERR(mmo)) return PTR_ERR(mmo); + + vma->vm_pgoff += drm_vma_node_start(&mmo->vma_node); } /* From 8bdd9ef7e9b1b2a73e394712b72b22055e0e26c3 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Fri, 2 Aug 2024 10:38:50 +0200 Subject: [PATCH 0534/1052] drm/i915/gem: Fix Virtual Memory mapping boundaries calculation Calculating the size of the mapped area as the lesser value between the requested size and the actual size does not consider the partial mapping offset. This can cause page fault access. Fix the calculation of the starting and ending addresses, the total size is now deduced from the difference between the end and start addresses. Additionally, the calculations have been rewritten in a clearer and more understandable form. Fixes: c58305af1835 ("drm/i915: Use remap_io_mapping() to prefault all PTE in a single pass") Reported-by: Jann Horn Co-developed-by: Chris Wilson Signed-off-by: Chris Wilson Signed-off-by: Andi Shyti Cc: Joonas Lahtinen Cc: Matthew Auld Cc: Rodrigo Vivi Cc: # v4.9+ Reviewed-by: Jann Horn Reviewed-by: Jonathan Cavitt [Joonas: Add Requires: tag] Requires: 60a2066c5005 ("drm/i915/gem: Adjust vma offset for framebuffer mmap offset") Signed-off-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20240802083850.103694-3-andi.shyti@linux.intel.com (cherry picked from commit 97b6784753da06d9d40232328efc5c5367e53417) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 53 +++++++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index ce10dd259812..cac6d4184506 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -290,6 +290,41 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf) return i915_error_to_vmf_fault(err); } +static void set_address_limits(struct vm_area_struct *area, + struct i915_vma *vma, + unsigned long obj_offset, + unsigned long *start_vaddr, + unsigned long *end_vaddr) +{ + unsigned long vm_start, vm_end, vma_size; /* user's memory parameters */ + long start, end; /* memory boundaries */ + + /* + * Let's move into the ">> PAGE_SHIFT" + * domain to be sure not to lose bits + */ + vm_start = area->vm_start >> PAGE_SHIFT; + vm_end = area->vm_end >> PAGE_SHIFT; + vma_size = vma->size >> PAGE_SHIFT; + + /* + * Calculate the memory boundaries by considering the offset + * provided by the user during memory mapping and the offset + * provided for the partial mapping. 
+ */ + start = vm_start; + start -= obj_offset; + start += vma->gtt_view.partial.offset; + end = start + vma_size; + + start = max_t(long, start, vm_start); + end = min_t(long, end, vm_end); + + /* Let's move back into the "<< PAGE_SHIFT" domain */ + *start_vaddr = (unsigned long)start << PAGE_SHIFT; + *end_vaddr = (unsigned long)end << PAGE_SHIFT; +} + static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) { #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT) @@ -302,14 +337,18 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) struct i915_ggtt *ggtt = to_gt(i915)->ggtt; bool write = area->vm_flags & VM_WRITE; struct i915_gem_ww_ctx ww; + unsigned long obj_offset; + unsigned long start, end; /* memory boundaries */ intel_wakeref_t wakeref; struct i915_vma *vma; pgoff_t page_offset; + unsigned long pfn; int srcu; int ret; - /* We don't use vmf->pgoff since that has the fake offset */ + obj_offset = area->vm_pgoff - drm_vma_node_start(&mmo->vma_node); page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; + page_offset += obj_offset; trace_i915_gem_object_fault(obj, page_offset, true, write); @@ -402,12 +441,14 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) if (ret) goto err_unpin; + set_address_limits(area, vma, obj_offset, &start, &end); + + pfn = (ggtt->gmadr.start + i915_ggtt_offset(vma)) >> PAGE_SHIFT; + pfn += (start - area->vm_start) >> PAGE_SHIFT; + pfn += obj_offset - vma->gtt_view.partial.offset; + /* Finally, remap it using the new GTT offset */ - ret = remap_io_mapping(area, - area->vm_start + (vma->gtt_view.partial.offset << PAGE_SHIFT), - (ggtt->gmadr.start + i915_ggtt_offset(vma)) >> PAGE_SHIFT, - min_t(u64, vma->size, area->vm_end - area->vm_start), - &ggtt->iomap); + ret = remap_io_mapping(area, start, pfn, end - start, &ggtt->iomap); if (ret) goto err_fence; From ee9a68394b4bea8b9044ec4bfdbaacf45297ecef Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 1 Aug 2024 21:14:04 +0200 Subject: [PATCH 0535/1052] riscv: Re-introduce global icache flush in patch_text_XXX() commit edf2d546bfd6 ("riscv: patch: Flush the icache right after patching to avoid illegal insns") mistakenly removed the global icache flush in patch_text_nosync() and patch_text_set_nosync() functions, so reintroduce them. 
Fixes: edf2d546bfd6 ("riscv: patch: Flush the icache right after patching to avoid illegal insns") Reported-by: Samuel Holland Closes: https://lore.kernel.org/linux-riscv/a28ddc26-d77a-470a-a33f-88144f717e86@sifive.com/ Signed-off-by: Alexandre Ghiti Reviewed-by: Samuel Holland Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240801191404.55181-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/patch.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 69e5796fc51f..34ef522f07a8 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -205,6 +205,8 @@ int patch_text_set_nosync(void *addr, u8 c, size_t len) int ret; ret = patch_insn_set(addr, c, len); + if (!ret) + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); return ret; } @@ -239,6 +241,8 @@ int patch_text_nosync(void *addr, const void *insns, size_t len) int ret; ret = patch_insn_write(addr, insns, len); + if (!ret) + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); return ret; }
From d08e2045ebf0f5f2a97ad22cc7dae398b35354ba Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 31 Jul 2024 11:08:31 +0000 Subject: [PATCH 0536/1052] bpf: introduce new VFS based BPF kfuncs Add a new variant of bpf_d_path() named bpf_path_d_path() which takes the form of a BPF kfunc and enforces KF_TRUSTED_ARGS semantics onto its arguments. This new d_path() based BPF kfunc variant is intended to address the legacy bpf_d_path() BPF helper's susceptibility to memory corruption issues [0, 1, 2] by ensuring it only operates on supplied arguments which are deemed trusted by the BPF verifier. Typically, this means that only pointers to a struct path which have been reference counted may be supplied. In addition to the new bpf_path_d_path() BPF kfunc, we also add a KF_ACQUIRE based BPF kfunc bpf_get_task_exe_file() and KF_RELEASE counterpart BPF kfunc bpf_put_file(). This is so that the new bpf_path_d_path() BPF kfunc can be used more flexibly from within the context of a BPF LSM program. It's rather common to ascertain the backing executable file for the calling process by walking current->mm->exe_file while instrumenting a given operation from the context of the BPF LSM program. However, walking current->mm->exe_file directly is never deemed to be OK, and doing so from both inside and outside of BPF LSM program context should be considered a bug. Using bpf_get_task_exe_file() and in turn bpf_put_file() will allow BPF LSM programs to reliably get and put references to current->mm->exe_file. As of now, all the newly introduced BPF kfuncs within this patch are limited to BPF LSM program types. These can be either sleepable or non-sleepable variants of BPF LSM program types.
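To make the intended calling convention concrete, here is a hedged sketch of a sleepable BPF LSM program combining the three kfuncs in the way this commit describes; the hook, section name and buffer size are one plausible choice in the style of the accompanying selftests, not a drop-in program.

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	extern struct file *bpf_get_task_exe_file(struct task_struct *task) __ksym;
	extern void bpf_put_file(struct file *file) __ksym;
	extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __ksym;

	char _license[] SEC("license") = "GPL";

	static char buf[256];

	SEC("lsm.s/file_open")
	int BPF_PROG(sketch_file_open, struct file *file)
	{
		struct file *exe;
		int len;

		/* The hook argument is trusted, so its embedded f_path can be
		 * handed straight to the KF_TRUSTED_ARGS kfunc.
		 */
		len = bpf_path_d_path(&file->f_path, buf, sizeof(buf));
		if (len < 0)
			return 0;

		/* Reference current->mm->exe_file the supported way ... */
		exe = bpf_get_task_exe_file(bpf_get_current_task_btf());
		if (!exe)
			return 0;

		/* ... and always pair the acquire with bpf_put_file(). */
		bpf_put_file(exe);
		return 0;
	}
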
[0] https://lore.kernel.org/bpf/CAG48ez0ppjcT=QxU-jtCUfb5xQb3mLr=5FcwddF_VKfEBPs_Dg@mail.gmail.com/ [1] https://lore.kernel.org/bpf/20230606181714.532998-1-jolsa@kernel.org/ [2] https://lore.kernel.org/bpf/20220219113744.1852259-1-memxor@gmail.com/ Acked-by: Christian Brauner Signed-off-by: Matt Bobrowski Acked-by: Song Liu Link: https://lore.kernel.org/r/20240731110833.1834742-2-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- fs/Makefile | 1 + fs/bpf_fs_kfuncs.c | 123 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 fs/bpf_fs_kfuncs.c diff --git a/fs/Makefile b/fs/Makefile index 6ecc9b0a53f2..61679fd587b7 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -129,3 +129,4 @@ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ obj-$(CONFIG_EROFS_FS) += erofs/ obj-$(CONFIG_VBOXSF_FS) += vboxsf/ obj-$(CONFIG_ZONEFS_FS) += zonefs/ +obj-$(CONFIG_BPF_LSM) += bpf_fs_kfuncs.o diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c new file mode 100644 index 000000000000..1e6e08667758 --- /dev/null +++ b/fs/bpf_fs_kfuncs.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC. */ + +#include +#include +#include +#include +#include +#include +#include + +__bpf_kfunc_start_defs(); + +/** + * bpf_get_task_exe_file - get a reference on the exe_file struct file member of + * the mm_struct that is nested within the supplied + * task_struct + * @task: task_struct of which the nested mm_struct exe_file member to get a + * reference on + * + * Get a reference on the exe_file struct file member field of the mm_struct + * nested within the supplied *task*. The referenced file pointer acquired by + * this BPF kfunc must be released using bpf_put_file(). Failing to call + * bpf_put_file() on the returned referenced struct file pointer that has been + * acquired by this BPF kfunc will result in the BPF program being rejected by + * the BPF verifier. + * + * This BPF kfunc may only be called from BPF LSM programs. + * + * Internally, this BPF kfunc leans on get_task_exe_file(), such that calling + * bpf_get_task_exe_file() would be analogous to calling get_task_exe_file() + * directly in kernel context. + * + * Return: A referenced struct file pointer to the exe_file member of the + * mm_struct that is nested within the supplied *task*. On error, NULL is + * returned. + */ +__bpf_kfunc struct file *bpf_get_task_exe_file(struct task_struct *task) +{ + return get_task_exe_file(task); +} + +/** + * bpf_put_file - put a reference on the supplied file + * @file: file to put a reference on + * + * Put a reference on the supplied *file*. Only referenced file pointers may be + * passed to this BPF kfunc. Attempting to pass an unreferenced file pointer, or + * any other arbitrary pointer for that matter, will result in the BPF program + * being rejected by the BPF verifier. + * + * This BPF kfunc may only be called from BPF LSM programs. + */ +__bpf_kfunc void bpf_put_file(struct file *file) +{ + fput(file); +} + +/** + * bpf_path_d_path - resolve the pathname for the supplied path + * @path: path to resolve the pathname for + * @buf: buffer to return the resolved pathname in + * @buf__sz: length of the supplied buffer + * + * Resolve the pathname for the supplied *path* and store it in *buf*. This BPF + * kfunc is the safer variant of the legacy bpf_d_path() helper and should be + * used in place of bpf_d_path() whenever possible. 
It enforces KF_TRUSTED_ARGS + * semantics, meaning that the supplied *path* must itself hold a valid + * reference, or else the BPF program will be outright rejected by the BPF + * verifier. + * + * This BPF kfunc may only be called from BPF LSM programs. + * + * Return: A positive integer corresponding to the length of the resolved + * pathname in *buf*, including the NUL termination character. On error, a + * negative integer is returned. + */ +__bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) +{ + int len; + char *ret; + + if (!buf__sz) + return -EINVAL; + + ret = d_path(path, buf, buf__sz); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + len = buf + buf__sz - ret; + memmove(buf, ret, len); + return len; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) +BTF_ID_FLAGS(func, bpf_get_task_exe_file, + KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) + +static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&bpf_fs_kfunc_set_ids, kfunc_id) || + prog->type == BPF_PROG_TYPE_LSM) + return 0; + return -EACCES; +} + +static const struct btf_kfunc_id_set bpf_fs_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_fs_kfunc_set_ids, + .filter = bpf_fs_kfuncs_filter, +}; + +static int __init bpf_fs_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set); +} + +late_initcall(bpf_fs_kfuncs_init); From ff358ada070fa1b64a6368147b2b56cc6a921847 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 31 Jul 2024 11:08:32 +0000 Subject: [PATCH 0537/1052] selftests/bpf: add negative tests for new VFS based BPF kfuncs Add a bunch of negative selftests responsible for asserting that the BPF verifier successfully rejects a BPF program load when the underlying BPF program misuses one of the newly introduced VFS based BPF kfuncs. The following VFS based BPF kfuncs are extensively tested within this new selftest: * struct file *bpf_get_task_exe_file(struct task_struct *); * void bpf_put_file(struct file *); * int bpf_path_d_path(struct path *, char *, size_t); Acked-by: Christian Brauner Acked-by: Song Liu Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20240731110833.1834742-3-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/bpf_experimental.h | 26 +++ .../selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_vfs_reject.c | 161 ++++++++++++++++++ 3 files changed, 189 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_vfs_reject.c diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 828556cdc2f0..b0668f29f7b3 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -195,6 +195,32 @@ extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __ksym; */ extern void bpf_throw(u64 cookie) __ksym; +/* Description + * Acquire a reference on the exe_file member field belonging to the + * mm_struct that is nested within the supplied task_struct. The supplied + * task_struct must be trusted/referenced. + * Returns + * A referenced file pointer pointing to the exe_file member field of the + * mm_struct nested in the supplied task_struct, or NULL. 
+ */ +extern struct file *bpf_get_task_exe_file(struct task_struct *task) __ksym; + +/* Description + * Release a reference on the supplied file. The supplied file must be + * acquired. + */ +extern void bpf_put_file(struct file *file) __ksym; + +/* Description + * Resolve a pathname for the supplied path and store it in the supplied + * buffer. The supplied path must be trusted/referenced. + * Returns + * A positive integer corresponding to the length of the resolved pathname, + * including the NULL termination character, stored in the supplied + * buffer. On error, a negative integer is returned. + */ +extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __ksym; + /* This macro must be used to mark the exception callback corresponding to the * main program. For example: * diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 67a49d12472c..14d74ba2188e 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -85,6 +85,7 @@ #include "verifier_value_or_null.skel.h" #include "verifier_value_ptr_arith.skel.h" #include "verifier_var_off.skel.h" +#include "verifier_vfs_reject.skel.h" #include "verifier_xadd.skel.h" #include "verifier_xdp.skel.h" #include "verifier_xdp_direct_packet_access.skel.h" @@ -205,6 +206,7 @@ void test_verifier_value(void) { RUN(verifier_value); } void test_verifier_value_illegal_alu(void) { RUN(verifier_value_illegal_alu); } void test_verifier_value_or_null(void) { RUN(verifier_value_or_null); } void test_verifier_var_off(void) { RUN(verifier_var_off); } +void test_verifier_vfs_reject(void) { RUN(verifier_vfs_reject); } void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c new file mode 100644 index 000000000000..d6d3f4fcb24c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC. */ + +#include +#include +#include +#include + +#include "bpf_misc.h" +#include "bpf_experimental.h" + +static char buf[PATH_MAX]; + +SEC("lsm.s/file_open") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int BPF_PROG(get_task_exe_file_kfunc_null) +{ + struct file *acquired; + + /* Can't pass a NULL pointer to bpf_get_task_exe_file(). */ + acquired = bpf_get_task_exe_file(NULL); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/inode_getxattr") +__failure __msg("arg#0 pointer type STRUCT task_struct must point to scalar, or struct with scalar") +int BPF_PROG(get_task_exe_file_kfunc_fp) +{ + u64 x; + struct file *acquired; + struct task_struct *task; + + task = (struct task_struct *)&x; + /* Can't pass random frame pointer to bpf_get_task_exe_file(). */ + acquired = bpf_get_task_exe_file(task); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(get_task_exe_file_kfunc_untrusted) +{ + struct file *acquired; + struct task_struct *parent; + + /* Walking a trusted struct task_struct returned from + * bpf_get_current_task_btf() yields an untrusted pointer. 
+ */ + parent = bpf_get_current_task_btf()->parent; + /* Can't pass untrusted pointer to bpf_get_task_exe_file(). */ + acquired = bpf_get_task_exe_file(parent); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("Unreleased reference") +int BPF_PROG(get_task_exe_file_kfunc_unreleased) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!acquired) + return 0; + + /* Acquired but never released. */ + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("release kernel function bpf_put_file expects") +int BPF_PROG(put_file_kfunc_unacquired, struct file *file) +{ + /* Can't release an unacquired pointer. */ + bpf_put_file(file); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int BPF_PROG(path_d_path_kfunc_null) +{ + /* Can't pass NULL value to bpf_path_d_path() kfunc. */ + bpf_path_d_path(NULL, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/task_alloc") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(path_d_path_kfunc_untrusted_from_argument, struct task_struct *task) +{ + struct path *root; + + /* Walking a trusted argument typically yields an untrusted + * pointer. This is one example of that. + */ + root = &task->fs->root; + bpf_path_d_path(root, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(path_d_path_kfunc_untrusted_from_current) +{ + struct path *pwd; + struct task_struct *current; + + current = bpf_get_current_task_btf(); + /* Walking a trusted pointer returned from bpf_get_current_task_btf() + * yields an untrusted pointer. + */ + pwd = ¤t->fs->pwd; + bpf_path_d_path(pwd, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("kernel function bpf_path_d_path args#0 expected pointer to STRUCT path but R1 has a pointer to STRUCT file") +int BPF_PROG(path_d_path_kfunc_type_mismatch, struct file *file) +{ + bpf_path_d_path((struct path *)&file->f_task_work, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("invalid access to map value, value_size=4096 off=0 size=8192") +int BPF_PROG(path_d_path_kfunc_invalid_buf_sz, struct file *file) +{ + /* bpf_path_d_path() enforces a constraint on the buffer size supplied + * by the BPF LSM program via the __sz annotation. buf here is set to + * PATH_MAX, so let's ensure that the BPF verifier rejects BPF_PROG_LOAD + * attempts if the supplied size and the actual size of the buffer + * mismatches. + */ + bpf_path_d_path(&file->f_path, buf, PATH_MAX * 2); + return 0; +} + +SEC("fentry/vfs_open") +__failure __msg("calling kernel function bpf_path_d_path is not allowed") +int BPF_PROG(path_d_path_kfunc_non_lsm, struct path *path, struct file *f) +{ + /* Calling bpf_path_d_path() from a non-LSM BPF program isn't permitted. + */ + bpf_path_d_path(path, buf, sizeof(buf)); + return 0; +} + +char _license[] SEC("license") = "GPL"; From 2b399b9b1f995d71f53e94a41e5717db46c39459 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 31 Jul 2024 11:08:33 +0000 Subject: [PATCH 0538/1052] selftests/bpf: add positive tests for new VFS based BPF kfuncs Add a bunch of positive selftests which extensively cover the various contexts and parameters in which the new VFS based BPF kfuncs may be used from. 
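As a rough orientation, the acquire/use/release pattern that both the reject and accept selftests revolve around looks something like the hypothetical sketch below; the includes, buffer size and bpf_printk() call are illustrative assumptions rather than code taken from the selftests, only the three kfunc declarations come from bpf_experimental.h.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical usage sketch, not one of the actual selftests. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_experimental.h"

static char buf[256];

SEC("lsm.s/file_open")
int BPF_PROG(sketch_log_exe_path)
{
	struct task_struct *task = bpf_get_current_task_btf(); /* trusted */
	struct file *exe;
	int len;

	/* KF_ACQUIRE | KF_RET_NULL: must be NULL-checked and later released. */
	exe = bpf_get_task_exe_file(task);
	if (!exe)
		return 0;

	/* &exe->f_path inherits the trusted/referenced property of exe. */
	len = bpf_path_d_path(&exe->f_path, buf, sizeof(buf));
	if (len > 0)
		bpf_printk("exe: %s", buf);

	/* KF_RELEASE: pairs with the acquire above on every return path. */
	bpf_put_file(exe);
	return 0;
}

char _license[] SEC("license") = "GPL";
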
Again, the following VFS based BPF kfuncs are thoroughly tested within this new selftest: * struct file *bpf_get_task_exe_file(struct task_struct *); * void bpf_put_file(struct file *); * int bpf_path_d_path(struct path *, char *, size_t); Acked-by: Christian Brauner Acked-by: Song Liu Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20240731110833.1834742-4-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_vfs_accept.c | 85 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_vfs_accept.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 14d74ba2188e..f8f546eba488 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -85,6 +85,7 @@ #include "verifier_value_or_null.skel.h" #include "verifier_value_ptr_arith.skel.h" #include "verifier_var_off.skel.h" +#include "verifier_vfs_accept.skel.h" #include "verifier_vfs_reject.skel.h" #include "verifier_xadd.skel.h" #include "verifier_xdp.skel.h" @@ -206,6 +207,7 @@ void test_verifier_value(void) { RUN(verifier_value); } void test_verifier_value_illegal_alu(void) { RUN(verifier_value_illegal_alu); } void test_verifier_value_or_null(void) { RUN(verifier_value_or_null); } void test_verifier_var_off(void) { RUN(verifier_var_off); } +void test_verifier_vfs_accept(void) { RUN(verifier_vfs_accept); } void test_verifier_vfs_reject(void) { RUN(verifier_vfs_reject); } void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c new file mode 100644 index 000000000000..a7c0a553aa50 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC. */ + +#include +#include +#include + +#include "bpf_misc.h" +#include "bpf_experimental.h" + +static char buf[64]; + +SEC("lsm.s/file_open") +__success +int BPF_PROG(get_task_exe_file_and_put_kfunc_from_current_sleepable) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm/file_open") +__success +int BPF_PROG(get_task_exe_file_and_put_kfunc_from_current_non_sleepable, struct file *file) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/task_alloc") +__success +int BPF_PROG(get_task_exe_file_and_put_kfunc_from_argument, + struct task_struct *task) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(task); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/inode_getattr") +__success +int BPF_PROG(path_d_path_from_path_argument, struct path *path) +{ + int ret; + + ret = bpf_path_d_path(path, buf, sizeof(buf)); + __sink(ret); + return 0; +} + +SEC("lsm.s/file_open") +__success +int BPF_PROG(path_d_path_from_file_argument, struct file *file) +{ + int ret; + struct path *path; + + /* The f_path member is a path which is embedded directly within a + * file. 
Therefore, a pointer to such embedded members are still + * recognized by the BPF verifier as being PTR_TRUSTED as it's + * essentially PTR_TRUSTED w/ a non-zero fixed offset. + */ + path = &file->f_path; + ret = bpf_path_d_path(path, buf, sizeof(buf)); + __sink(ret); + return 0; +} + +char _license[] SEC("license") = "GPL"; From bd44ca3de49cc1badcff7a96010fa2c64f04868c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 6 Aug 2024 11:56:45 -0400 Subject: [PATCH 0539/1052] dma-debug: avoid deadlock between dma debug vs printk and netconsole Currently the dma debugging code can end up indirectly calling printk under the radix_lock. This happens when a radix tree node allocation fails. This is a problem because the printk code, when used together with netconsole, can end up inside the dma debugging code while trying to transmit a message over netcons. This creates the possibility of either a circular deadlock on the same CPU, with that CPU trying to grab the radix_lock twice, or an ABBA deadlock between different CPUs, where one CPU grabs the console lock first and then waits for the radix_lock, while the other CPU is holding the radix_lock and is waiting for the console lock. The trace captured by lockdep is of the ABBA variant. -> #2 (&dma_entry_hash[i].lock){-.-.}-{2:2}: _raw_spin_lock_irqsave+0x5a/0x90 debug_dma_map_page+0x79/0x180 dma_map_page_attrs+0x1d2/0x2f0 bnxt_start_xmit+0x8c6/0x1540 netpoll_start_xmit+0x13f/0x180 netpoll_send_skb+0x20d/0x320 netpoll_send_udp+0x453/0x4a0 write_ext_msg+0x1b9/0x460 console_flush_all+0x2ff/0x5a0 console_unlock+0x55/0x180 vprintk_emit+0x2e3/0x3c0 devkmsg_emit+0x5a/0x80 devkmsg_write+0xfd/0x180 do_iter_readv_writev+0x164/0x1b0 vfs_writev+0xf9/0x2b0 do_writev+0x6d/0x110 do_syscall_64+0x80/0x150 entry_SYSCALL_64_after_hwframe+0x4b/0x53 -> #0 (console_owner){-.-.}-{0:0}: __lock_acquire+0x15d1/0x31a0 lock_acquire+0xe8/0x290 console_flush_all+0x2ea/0x5a0 console_unlock+0x55/0x180 vprintk_emit+0x2e3/0x3c0 _printk+0x59/0x80 warn_alloc+0x122/0x1b0 __alloc_pages_slowpath+0x1101/0x1120 __alloc_pages+0x1eb/0x2c0 alloc_slab_page+0x5f/0x150 new_slab+0x2dc/0x4e0 ___slab_alloc+0xdcb/0x1390 kmem_cache_alloc+0x23d/0x360 radix_tree_node_alloc+0x3c/0xf0 radix_tree_insert+0xf5/0x230 add_dma_entry+0xe9/0x360 dma_map_page_attrs+0x1d2/0x2f0 __bnxt_alloc_rx_frag+0x147/0x180 bnxt_alloc_rx_data+0x79/0x160 bnxt_rx_skb+0x29/0xc0 bnxt_rx_pkt+0xe22/0x1570 __bnxt_poll_work+0x101/0x390 bnxt_poll+0x7e/0x320 __napi_poll+0x29/0x160 net_rx_action+0x1e0/0x3e0 handle_softirqs+0x190/0x510 run_ksoftirqd+0x4e/0x90 smpboot_thread_fn+0x1a8/0x270 kthread+0x102/0x120 ret_from_fork+0x2f/0x40 ret_from_fork_asm+0x11/0x20 This bug is more likely than it seems, because when one CPU has run out of memory, chances are the other has too. The good news is, this bug is hidden behind the CONFIG_DMA_API_DEBUG, so not many users are likely to trigger it. Signed-off-by: Rik van Riel Reported-by: Konstantin Ovsepian Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a6e3792b15f8..d570535342cb 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -416,8 +416,11 @@ static unsigned long long phys_addr(struct dma_debug_entry *entry) * dma_active_cacheline entry to track per event. dma_map_sg(), on the * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. 
+ * + * Use __GFP_NOWARN because the printk from an OOM, to netconsole, could end + * up right back in the DMA debugging code, leading to a deadlock. */ -static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC); +static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC | __GFP_NOWARN); static DEFINE_SPINLOCK(radix_lock); #define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) #define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) From 1c4a057d01f4432704c4dc8842b6e888a91d95df Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 29 Jul 2024 13:57:24 -0700 Subject: [PATCH 0540/1052] dt-bindings: display: panel: samsung,atna45dc02: Document ATNA45DC02 The Samsung ATNA45DC02 panel is an AMOLED eDP panel, similar to the existing ATNA45AF01 and ATNA33XC20 panel but with a higher resolution. Signed-off-by: Rob Clark Acked-by: Conor Dooley Reviewed-by: Douglas Anderson Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20240729205726.7923-1-robdclark@gmail.com --- .../bindings/display/panel/samsung,atna33xc20.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml index 5192c93fbd67..87c601bcf20a 100644 --- a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml +++ b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml @@ -17,10 +17,13 @@ properties: oneOf: # Samsung 13.3" FHD (1920x1080 pixels) eDP AMOLED panel - const: samsung,atna33xc20 - # Samsung 14.5" WQXGA+ (2880x1800 pixels) eDP AMOLED panel - items: - - const: samsung,atna45af01 - - const: samsung,atna33xc20 + - enum: + # Samsung 14.5" WQXGA+ (2880x1800 pixels) eDP AMOLED panel + - samsung,atna45af01 + # Samsung 14.5" 3K (2944x1840 pixels) eDP AMOLED panel + - samsung,atna45dc02 + - const: samsung,atna33xc20 enable-gpios: true port: true From 929725bd7eb4eea1f75197d9847f3f1ea5afdad1 Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Wed, 31 Jul 2024 19:10:20 +0000 Subject: [PATCH 0541/1052] drm/atomic: allow no-op FB_ID updates for async flips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User-space is allowed to submit any property in an async flip as long as the value doesn't change. However we missed one case: as things stand, the kernel rejects no-op FB_ID changes on non-primary planes. Fix this by changing the conditional and skipping drm_atomic_check_prop_changes() only for FB_ID on the primary plane (instead of skipping for FB_ID on any plane). 
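From the user-space side, the case that was being rejected is an async page flip that resubmits an overlay plane's current framebuffer unchanged. A rough libdrm sketch of that pattern (object and property IDs below are placeholders a real compositor would have looked up beforehand) is:

#include <errno.h>
#include <stdint.h>
#include <xf86drm.h>
#include <xf86drmMode.h>

/* Hypothetical sketch: all IDs are assumed to have been resolved already. */
static int async_flip_with_noop_overlay(int fd, uint32_t fb_id_prop,
					uint32_t primary_id, uint32_t new_fb,
					uint32_t overlay_id, uint32_t overlay_cur_fb)
{
	drmModeAtomicReq *req = drmModeAtomicAlloc();
	int ret;

	if (!req)
		return -ENOMEM;

	/* Real flip on the primary plane. */
	drmModeAtomicAddProperty(req, primary_id, fb_id_prop, new_fb);
	/* No-op FB_ID update on an overlay plane: same value it already
	 * has. This is what the kernel used to refuse with -EINVAL. */
	drmModeAtomicAddProperty(req, overlay_id, fb_id_prop, overlay_cur_fb);

	ret = drmModeAtomicCommit(fd, req,
				  DRM_MODE_ATOMIC_NONBLOCK |
				  DRM_MODE_PAGE_FLIP_ASYNC, NULL);
	drmModeAtomicFree(req);
	return ret;
}
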
Fixes: 0e26cc72c71c ("drm: Refuse to async flip with atomic prop changes") Signed-off-by: Simon Ser Reviewed-by: André Almeida Tested-by: Xaver Hugl Cc: Alex Deucher Cc: Christian König Cc: Michel Dänzer Cc: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240731191014.878320-1-contact@emersion.fr --- drivers/gpu/drm/drm_atomic_uapi.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c index 7609c798d73d..7936c2023955 100644 --- a/drivers/gpu/drm/drm_atomic_uapi.c +++ b/drivers/gpu/drm/drm_atomic_uapi.c @@ -1071,23 +1071,16 @@ int drm_atomic_set_property(struct drm_atomic_state *state, } if (async_flip && - prop != config->prop_fb_id && - prop != config->prop_in_fence_fd && - prop != config->prop_fb_damage_clips) { + (plane_state->plane->type != DRM_PLANE_TYPE_PRIMARY || + (prop != config->prop_fb_id && + prop != config->prop_in_fence_fd && + prop != config->prop_fb_damage_clips))) { ret = drm_atomic_plane_get_property(plane, plane_state, prop, &old_val); ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); break; } - if (async_flip && plane_state->plane->type != DRM_PLANE_TYPE_PRIMARY) { - drm_dbg_atomic(prop->dev, - "[OBJECT:%d] Only primary planes can be changed during async flip\n", - obj->id); - ret = -EINVAL; - break; - } - ret = drm_atomic_plane_set_property(plane, plane_state, file_priv, prop, prop_value); From 3e7917c0cdad835a5121520fc5686d954b7a61ab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2024 08:58:21 +0000 Subject: [PATCH 0542/1052] net: linkwatch: use system_unbound_wq linkwatch_event() grabs possibly very contended RTNL mutex. system_wq is not suitable for such work. Inspired by many noisy syzbot reports. 3 locks held by kworker/0:7/5266: #0: ffff888015480948 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work kernel/workqueue.c:3206 [inline] #0: ffff888015480948 ((wq_completion)events){+.+.}-{0:0}, at: process_scheduled_works+0x90a/0x1830 kernel/workqueue.c:3312 #1: ffffc90003f6fd00 ((linkwatch_work).work){+.+.}-{0:0}, at: process_one_work kernel/workqueue.c:3207 [inline] , at: process_scheduled_works+0x945/0x1830 kernel/workqueue.c:3312 #2: ffffffff8fa6f208 (rtnl_mutex){+.+.}-{3:3}, at: linkwatch_event+0xe/0x60 net/core/link_watch.c:276 Reported-by: syzbot Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240805085821.1616528-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/link_watch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 8ec35194bfcb..ab150641142a 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -148,9 +148,9 @@ static void linkwatch_schedule_work(int urgent) * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) - mod_delayed_work(system_wq, &linkwatch_work, 0); + mod_delayed_work(system_unbound_wq, &linkwatch_work, 0); else - schedule_delayed_work(&linkwatch_work, delay); + queue_delayed_work(system_unbound_wq, &linkwatch_work, delay); } From fbc05142ccdd0061f6d0e489608935943d2984a1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:01:40 -0700 Subject: [PATCH 0543/1052] perf tools: Add tools/include/uapi/README Write down the reason why we keep a copy of headers to the README file instead of adding it to every commit messages. 
Suggested-by: Jani Nikula Original-by: Arnaldo Carvalho de Melo Original-by: Ingo Molnar Signed-off-by: Namhyung Kim --- tools/include/uapi/README | 73 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tools/include/uapi/README diff --git a/tools/include/uapi/README b/tools/include/uapi/README new file mode 100644 index 000000000000..7147b1b2cb28 --- /dev/null +++ b/tools/include/uapi/README @@ -0,0 +1,73 @@ +Why we want a copy of kernel headers in tools? +============================================== + +There used to be no copies, with tools/ code using kernel headers +directly. From time to time tools/perf/ broke due to legitimate kernel +hacking. At some point Linus complained about such direct usage. Then we +adopted the current model. + +The way these headers are used in perf are not restricted to just +including them to compile something. + +There are sometimes used in scripts that convert defines into string +tables, etc, so some change may break one of these scripts, or new MSRs +may use some different #define pattern, etc. + +E.g.: + + $ ls -1 tools/perf/trace/beauty/*.sh | head -5 + tools/perf/trace/beauty/arch_errno_names.sh + tools/perf/trace/beauty/drm_ioctl.sh + tools/perf/trace/beauty/fadvise.sh + tools/perf/trace/beauty/fsconfig.sh + tools/perf/trace/beauty/fsmount.sh + $ + $ tools/perf/trace/beauty/fadvise.sh + static const char *fadvise_advices[] = { + [0] = "NORMAL", + [1] = "RANDOM", + [2] = "SEQUENTIAL", + [3] = "WILLNEED", + [4] = "DONTNEED", + [5] = "NOREUSE", + }; + $ + +The tools/perf/check-headers.sh script, part of the tools/ build +process, points out changes in the original files. + +So its important not to touch the copies in tools/ when doing changes in +the original kernel headers, that will be done later, when +check-headers.sh inform about the change to the perf tools hackers. + +Another explanation from Ingo Molnar: +It's better than all the alternatives we tried so far: + + - Symbolic links and direct #includes: this was the original approach but + was pushed back on from the kernel side, when tooling modified the + headers and broke them accidentally for kernel builds. + + - Duplicate self-defined ABI headers like glibc: double the maintenance + burden, double the chance for mistakes, plus there's no tech-driven + notification mechanism to look at new kernel side changes. + +What we are doing now is a third option: + + - A software-enforced copy-on-write mechanism of kernel headers to + tooling, driven by non-fatal warnings on the tooling side build when + kernel headers get modified: + + Warning: Kernel ABI header differences: + diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h + diff -u tools/include/uapi/linux/fs.h include/uapi/linux/fs.h + diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h + ... + + The tooling policy is to always pick up the kernel side headers as-is, + and integate them into the tooling build. The warnings above serve as a + notification to tooling maintainers that there's changes on the kernel + side. + +We've been using this for many years now, and it might seem hacky, but +works surprisingly well. 
+ From aef21f6b6a4aae648c890e74c2322d10ab267249 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 09:59:26 -0700 Subject: [PATCH 0544/1052] tools/include: Sync uapi/drm/i915_drm.h with the kernel sources To pick up changes from: 0f1bb41bf396 drm/i915: Support replaying GPU hangs with captured context image This should be used to beautify DRM syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: intel-gfx@lists.freedesktop.org Signed-off-by: Namhyung Kim --- tools/include/uapi/drm/i915_drm.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index d4d86e566e07..535cb68fdb5c 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -2163,6 +2163,15 @@ struct drm_i915_gem_context_param { * supports this per context flag. */ #define I915_CONTEXT_PARAM_LOW_LATENCY 0xe + +/* + * I915_CONTEXT_PARAM_CONTEXT_IMAGE: + * + * Allows userspace to provide own context images. + * + * Note that this is a debug API not available on production kernel builds. + */ +#define I915_CONTEXT_PARAM_CONTEXT_IMAGE 0xf /* Must be kept compact -- no holes and well documented */ /** @value: Context parameter value to be set or queried */ @@ -2564,6 +2573,24 @@ struct i915_context_param_engines { struct i915_engine_class_instance engines[N__]; \ } __attribute__((packed)) name__ +struct i915_gem_context_param_context_image { + /** @engine: Engine class & instance to be configured. */ + struct i915_engine_class_instance engine; + + /** @flags: One of the supported flags or zero. */ + __u32 flags; +#define I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX (1u << 0) + + /** @size: Size of the image blob pointed to by @image. */ + __u32 size; + + /** @mbz: Must be zero. */ + __u32 mbz; + + /** @image: Userspace memory containing the context image. */ + __u64 image; +} __attribute__((packed)); + /** * struct drm_i915_gem_context_create_ext_setparam - Context parameter * to set or query during context creation. 
From a625df3995c31a5d8cf46f2337b207e93bef9bdd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 0545/1052] tools/include: Sync uapi/linux/kvm.h with the kernel sources And other arch-specific UAPI headers to pick up changes from: 4b23e0c199b2 KVM: Ensure new code that references immediate_exit gets extra scrutiny 85542adb65ec KVM: x86: Add KVM_RUN_X86_GUEST_MODE kvm_run flag 6fef518594bc KVM: x86: Add a capability to configure bus frequency for APIC timer 34ff65901735 x86/sev: Use kernel provided SVSM Calling Areas 5dcc1e76144f Merge tag 'kvm-x86-misc-6.11' of https://github.com/kvm-x86/linux into HEAD 9a0d2f4995dd KVM: PPC: Book3S HV: Add one-reg interface for HASHPKEYR register e9eb790b2557 KVM: PPC: Book3S HV: Add one-reg interface for HASHKEYR register 1a1e6865f516 KVM: PPC: Book3S HV: Add one-reg interface for DEXCR register This should be used to beautify KVM syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h diff -u tools/arch/x86/include/uapi/asm/svm.h arch/x86/include/uapi/asm/svm.h diff -u tools/arch/powerpc/include/uapi/asm/kvm.h arch/powerpc/include/uapi/asm/kvm.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/arch/powerpc/include/uapi/asm/kvm.h | 3 ++ tools/arch/x86/include/uapi/asm/kvm.h | 49 +++++++++++++++++++++++ tools/arch/x86/include/uapi/asm/svm.h | 1 + tools/include/uapi/linux/kvm.h | 17 +++++++- 4 files changed, 69 insertions(+), 1 deletion(-) diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 1691297a766a..eaeda001784e 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -645,6 +645,9 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3) #define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4) #define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5) +#define KVM_REG_PPC_DEXCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc6) +#define KVM_REG_PPC_HASHKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc7) +#define KVM_REG_PPC_HASHPKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc8) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 9fae1b73b529..bf57a824f722 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -106,6 +106,7 @@ struct kvm_ioapic_state { #define KVM_RUN_X86_SMM (1 << 0) #define KVM_RUN_X86_BUS_LOCK (1 << 1) +#define KVM_RUN_X86_GUEST_MODE (1 << 2) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { @@ -697,6 +698,11 @@ enum sev_cmd_id { /* Second time is the charm; improved versions of the above ioctls. */ KVM_SEV_INIT2, + /* SNP-specific commands */ + KVM_SEV_SNP_LAUNCH_START = 100, + KVM_SEV_SNP_LAUNCH_UPDATE, + KVM_SEV_SNP_LAUNCH_FINISH, + KVM_SEV_NR_MAX, }; @@ -824,6 +830,48 @@ struct kvm_sev_receive_update_data { __u32 pad2; }; +struct kvm_sev_snp_launch_start { + __u64 policy; + __u8 gosvw[16]; + __u16 flags; + __u8 pad0[6]; + __u64 pad1[4]; +}; + +/* Kept in sync with firmware values for simplicity. 
*/ +#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 +#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 +#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 +#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 +#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6 + +struct kvm_sev_snp_launch_update { + __u64 gfn_start; + __u64 uaddr; + __u64 len; + __u8 type; + __u8 pad0; + __u16 flags; + __u32 pad1; + __u64 pad2[4]; +}; + +#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 +#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 +#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 + +struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; + __u8 pad0[3]; + __u16 flags; + __u64 pad1[4]; +}; + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) @@ -874,5 +922,6 @@ struct kvm_hyperv_eventfd { #define KVM_X86_SW_PROTECTED_VM 1 #define KVM_X86_SEV_VM 2 #define KVM_X86_SEV_ES_VM 3 +#define KVM_X86_SNP_VM 4 #endif /* _ASM_X86_KVM_H */ diff --git a/tools/arch/x86/include/uapi/asm/svm.h b/tools/arch/x86/include/uapi/asm/svm.h index 80e1df482337..1814b413fd57 100644 --- a/tools/arch/x86/include/uapi/asm/svm.h +++ b/tools/arch/x86/include/uapi/asm/svm.h @@ -115,6 +115,7 @@ #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index e5af8c692dc0..637efc055145 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -192,11 +192,24 @@ struct kvm_xen_exit { /* Flags that describe what fields in emulation_failure hold valid data. */ #define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0) +/* + * struct kvm_run can be modified by userspace at any time, so KVM must be + * careful to avoid TOCTOU bugs. In order to protect KVM, HINT_UNSAFE_IN_KVM() + * renames fields in struct kvm_run from to __unsafe when + * compiled into the kernel, ensuring that any use within KVM is obvious and + * gets extra scrutiny. + */ +#ifdef __KERNEL__ +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol##__unsafe +#else +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol +#endif + /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ __u8 request_interrupt_window; - __u8 immediate_exit; + __u8 HINT_UNSAFE_IN_KVM(immediate_exit); __u8 padding1[6]; /* out */ @@ -918,6 +931,8 @@ struct kvm_enable_cap { #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 #define KVM_CAP_PRE_FAULT_MEMORY 236 +#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 +#define KVM_CAP_X86_GUEST_MODE 238 struct kvm_irq_routing_irqchip { __u32 irqchip; From 8ec9497d3ef34fab216e277eca5035811f06b421 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 0546/1052] tools/include: Sync uapi/linux/perf.h with the kernel sources To pick up changes from: 608f6976c309 perf/x86/intel: Support new data source for Lunar Lake This should be used to beautify perf syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Please see tools/include/uapi/README for details (it's in the first patch of this series). 
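As a quick illustration of how tools consume these constants, a decoder for the sampled data source could map the new level numbers roughly as in the hypothetical snippet below (assuming it is built against the updated header):

#include <stdio.h>
#include <linux/perf_event.h>

/* Hypothetical sketch, not code from perf itself. */
static const char *mem_lvlnum_str(unsigned int lvlnum)
{
	switch (lvlnum) {
	case PERF_MEM_LVLNUM_L1:	return "L1";
	case PERF_MEM_LVLNUM_L2:	return "L2";
	case PERF_MEM_LVLNUM_L2_MHB:	return "L2 MHB";	/* new */
	case PERF_MEM_LVLNUM_MSC:	return "MSC";		/* new */
	case PERF_MEM_LVLNUM_LFB:	return "LFB / L1 MHB";
	case PERF_MEM_LVLNUM_RAM:	return "RAM";
	default:			return "other";
	}
}

static void print_data_src(union perf_mem_data_src dsrc)
{
	printf("mem level: %s\n", mem_lvlnum_str(dsrc.mem_lvl_num));
}
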
Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Ian Rogers Cc: Adrian Hunter Cc: "Liang, Kan" Cc: linux-perf-users@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/perf_event.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 3a64499b0f5d..4842c36fdf80 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1349,12 +1349,14 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ +/* 0x7 available */ #define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ #define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ #define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ From b9735006762677c2cd794bfcb1463a9a6ed558dd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 0547/1052] tools/include: Sync uapi/sound/asound.h with the kernel sources To pick up changes from: f05c1ffc2745 ALSA: pcm: reinvent the stream synchronization ID API This should be used to beautify sound syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/uapi/sound/asound.h include/uapi/sound/asound.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: linux-sound@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/perf/trace/beauty/include/uapi/sound/asound.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/trace/beauty/include/uapi/sound/asound.h b/tools/perf/trace/beauty/include/uapi/sound/asound.h index 628d46a0da92..8bf7e8a0eb6f 100644 --- a/tools/perf/trace/beauty/include/uapi/sound/asound.h +++ b/tools/perf/trace/beauty/include/uapi/sound/asound.h @@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 18) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; @@ -334,7 +334,7 @@ union snd_pcm_sync_id { unsigned char id[16]; unsigned short id16[8]; unsigned int id32[4]; -}; +} __attribute__((deprecated)); struct snd_pcm_info { unsigned int device; /* RO/WR (control): device number */ @@ -348,7 +348,7 @@ struct snd_pcm_info { int dev_subclass; /* SNDRV_PCM_SUBCLASS_* */ unsigned int subdevices_count; unsigned int subdevices_avail; - union snd_pcm_sync_id sync; /* hardware synchronization ID */ + unsigned char pad1[16]; /* was: hardware synchronization ID */ unsigned char reserved[64]; /* reserved for future... 
*/ }; @@ -420,7 +420,8 @@ struct snd_pcm_hw_params { unsigned int rate_num; /* R: rate numerator */ unsigned int rate_den; /* R: rate denominator */ snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ - unsigned char reserved[64]; /* reserved for future */ + unsigned char sync[16]; /* R: synchronization ID (perfect sync - one clock source) */ + unsigned char reserved[48]; /* reserved for future */ }; enum { From 379d9af3f3da2da1bbfa67baf1820c72a080d1f1 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 6 Aug 2024 14:51:13 +0800 Subject: [PATCH 0548/1052] selinux: fix potential counting error in avc_add_xperms_decision() The count increases only when a node is successfully added to the linked list. Cc: stable@vger.kernel.org Fixes: fa1aa143ac4a ("selinux: extended permissions for ioctls") Signed-off-by: Zhen Lei Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/avc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 32eb67fb3e42..7087cd2b802d 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -330,12 +330,12 @@ static int avc_add_xperms_decision(struct avc_node *node, { struct avc_xperms_decision_node *dest_xpd; - node->ae.xp_node->xp.len++; dest_xpd = avc_xperms_decision_alloc(src->used); if (!dest_xpd) return -ENOMEM; avc_copy_xperms_decision(&dest_xpd->xpd, src); list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head); + node->ae.xp_node->xp.len++; return 0; } From fe992163575b187405899c5abaad8ef6fb828ff6 Mon Sep 17 00:00:00 2001 From: Sarthak Singh Date: Wed, 24 Jul 2024 22:57:06 +0530 Subject: [PATCH 0549/1052] rust: Support latest version of `rust-analyzer` Sets the `sysroot` field in rust-project.json which is now needed in newer versions of rust-analyzer instead of the `sysroot_src` field. Till [1] `rust-analyzer` used to guess the `sysroot` based on the `sysroot_src` at [2]. Now `sysroot` is a required parameter for a `rust-project.json` file. It is required because `rust-analyzer` need it to find the proc-macro server [3]. In the current version of `rust-analyzer` the `sysroot_src` is only used to include the inbuilt library crates (std, core, alloc, etc) [4]. Since we already specify the core library to be included in the `rust-project.json` we don't need to define the `sysroot_src`. Code editors like VS Code try to use the latest version of rust-analyzer (which is updated every week) instead of the version of rust-analyzer that comes with the rustup toolchain (which is updated every six weeks along with the rust version). Without this change `rust-analyzer` is breaking for anyone using VS Code. As they are getting the latest version of `rust-analyzer` with the changes made in [1]. `rust-analyzer` will also start breaking for other developers as they update their rust version (assuming that also updates the rust-analyzer version on their system). This patch should work with every setup as there is no more guess work being done by `rust-analyzer`. [ Lukas, who leads the rust-analyzer team, says: `sysroot_src` is required now if you want to have the sysroot source libraries be loaded. I think we used to infer it as `{sysroot}/lib/rustlib/src/rust/library` before when only the `sysroot` field was given but that was since changed to make it possible in having a sysroot without the standard library sources (that is only have the binaries available). So if you want the library sources to be loaded by rust-analyzer you will have to set that field as well now. 
- Miguel ] Link: https://github.com/rust-lang/rust-analyzer/pull/17287 [1] Link: https://github.com/rust-lang/rust-analyzer/blob/f372a8a1176ff8dd5f45ab2ddd45f3530db0374f/crates/project-model/src/workspace.rs#L367-L374 [2] Link: https://github.com/rust-lang/rust-analyzer/blob/eeb192b79aeac47b40add66347022af17a74fbaf/crates/project-model/src/sysroot.rs#L180-L192 [3] Link: https://github.com/search?q=repo%3AVeykril%2Frust-analyzer%20src_root()&type=code [4] Tested-by: Dirk Behme Signed-off-by: Sarthak Singh Link: https://rust-for-linux.zulipchat.com/#narrow/stream/291565-Help/topic/How.20to.20rust-analyzer.20correctly.20working Link: https://lore.kernel.org/r/20240724172713.899399-1-sarthak.singh99@gmail.com [ Formatted comment, fixed typo and removed spurious empty line. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 +- scripts/generate_rust_analyzer.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/rust/Makefile b/rust/Makefile index 1f10f92737f2..6c0644b6090c 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -350,7 +350,7 @@ rust-analyzer: $(Q)$(srctree)/scripts/generate_rust_analyzer.py \ --cfgs='core=$(core-cfgs)' --cfgs='alloc=$(alloc-cfgs)' \ $(realpath $(srctree)) $(realpath $(objtree)) \ - $(RUST_LIB_SRC) $(KBUILD_EXTMOD) > \ + $(rustc_sysroot) $(RUST_LIB_SRC) $(KBUILD_EXTMOD) > \ $(if $(KBUILD_EXTMOD),$(extmod_prefix),$(objtree))/rust-project.json redirect-intrinsics = \ diff --git a/scripts/generate_rust_analyzer.py b/scripts/generate_rust_analyzer.py index f270c7b0cf34..d2bc63cde8c6 100755 --- a/scripts/generate_rust_analyzer.py +++ b/scripts/generate_rust_analyzer.py @@ -145,6 +145,7 @@ def main(): parser.add_argument('--cfgs', action='append', default=[]) parser.add_argument("srctree", type=pathlib.Path) parser.add_argument("objtree", type=pathlib.Path) + parser.add_argument("sysroot", type=pathlib.Path) parser.add_argument("sysroot_src", type=pathlib.Path) parser.add_argument("exttree", type=pathlib.Path, nargs="?") args = parser.parse_args() @@ -154,9 +155,12 @@ def main(): level=logging.INFO if args.verbose else logging.WARNING ) + # Making sure that the `sysroot` and `sysroot_src` belong to the same toolchain. + assert args.sysroot in args.sysroot_src.parents + rust_project = { "crates": generate_crates(args.srctree, args.objtree, args.sysroot_src, args.exttree, args.cfgs), - "sysroot_src": str(args.sysroot_src), + "sysroot": str(args.sysroot), } json.dump(rust_project, sys.stdout, sort_keys=True, indent=4) From 9ba48db9f77ce0001dbb882476fa46e092feb695 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 6 Aug 2024 20:53:31 +0800 Subject: [PATCH 0550/1052] i2c: qcom-geni: Add missing geni_icc_disable in geni_i2c_runtime_resume Add the missing geni_icc_disable() before return in geni_i2c_runtime_resume(). 
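The underlying rule is the usual one for runtime-resume error paths: everything enabled so far has to be undone again in reverse order before returning. Schematically (a self-contained sketch with stand-in helpers, not the geni driver's actual code):

#include <stdio.h>

/* Stand-ins for the resources a resume path enables; the real driver
 * talks to the interconnect, the core clock and the serial engine. */
static int icc_vote(void)     { return 0; }
static void icc_unvote(void)  { }
static int clk_on(void)       { return 0; }
static void clk_off(void)     { }
static int engine_on(void)    { return -1; }	/* pretend this step fails */

static int example_runtime_resume(void)
{
	int ret;

	ret = icc_vote();
	if (ret)
		return ret;

	ret = clk_on();
	if (ret)
		goto err_icc;

	ret = engine_on();
	if (ret)
		goto err_clk;	/* before the fix only the clock was undone here */

	return 0;

err_clk:
	clk_off();
err_icc:
	icc_unvote();		/* what the added geni_icc_disable() corresponds to */
	return ret;
}

int main(void)
{
	printf("resume: %d\n", example_runtime_resume());
	return 0;
}
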
Fixes: bf225ed357c6 ("i2c: i2c-qcom-geni: Add interconnect support") Signed-off-by: Gaosheng Cui Reviewed-by: Vladimir Zapolskiy Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-qcom-geni.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index 78f43648e9f3..365e37bba0f3 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -992,6 +992,7 @@ static int __maybe_unused geni_i2c_runtime_resume(struct device *dev) ret = geni_se_resources_on(&gi2c->se); if (ret) { clk_disable_unprepare(gi2c->core_clk); + geni_icc_disable(&gi2c->se); return ret; } From c7a19018bd557c24072b59088ad2684fd83ea3f4 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Mon, 5 Aug 2024 16:52:00 -0700 Subject: [PATCH 0551/1052] net: dsa: microchip: Fix Wake-on-LAN check to not return an error The wol variable in ksz_port_set_mac_address() is declared with random data, but the code in ksz_get_wol call may not be executed so the WAKE_MAGIC check may be invalid resulting in an error message when setting a MAC address after starting the DSA driver. Fixes: 3b454b6390c3 ("net: dsa: microchip: ksz9477: Add Wake on Magic Packet support") Signed-off-by: Tristram Ha Reviewed-by: Oleksij Rempel Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20240805235200.24982-1-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index b074b4bb0629..b120e66d5669 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -3764,6 +3764,11 @@ static int ksz_port_set_mac_address(struct dsa_switch *ds, int port, return -EBUSY; } + /* Need to initialize variable as the code to fill in settings may + * not be executed. + */ + wol.wolopts = 0; + ksz_get_wol(ds, dp->index, &wol); if (wol.wolopts & WAKE_MAGIC) { dev_err(ds->dev, From 1ca645a2f74a4290527ae27130c8611391b07dbf Mon Sep 17 00:00:00 2001 From: ZHANG Yuntian Date: Sat, 3 Aug 2024 15:46:51 +0800 Subject: [PATCH 0552/1052] net: usb: qmi_wwan: add MeiG Smart SRM825L Add support for MeiG Smart SRM825L which is based on Qualcomm 315 chip. T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2dee ProdID=4d22 Rev= 4.14 S: Manufacturer=MEIG S: Product=LTE-A Module S: SerialNumber=6f345e48 C:* #Ifs= 6 Cfg#= 1 Atr=80 MxPwr=896mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=88(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) 
Sub=ff Prot=50 Driver=qmi_wwan E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: ZHANG Yuntian Link: https://patch.msgid.link/D1EB81385E405DFE+20240803074656.567061-1-yt@radxa.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index cfda32047cff..4823dbdf5465 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1432,6 +1432,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x1546, 0x1312, 4)}, /* u-blox LARA-R6 01B */ {QMI_QUIRK_SET_DTR(0x1546, 0x1342, 4)}, /* u-blox LARA-L6 */ {QMI_QUIRK_SET_DTR(0x33f8, 0x0104, 4)}, /* Rolling RW101 RMNET */ + {QMI_FIXED_INTF(0x2dee, 0x4d22, 5)}, /* MeiG Smart SRM825L */ /* 4. Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ From eb91c456f3714c336f0812dccab422ec0e72bde4 Mon Sep 17 00:00:00 2001 From: "Dustin L. Howett" Date: Tue, 6 Aug 2024 21:33:51 -0500 Subject: [PATCH 0553/1052] ALSA: hda/realtek: Add Framework Laptop 13 (Intel Core Ultra) to quirks The Framework Laptop 13 (Intel Core Ultra) has an ALC285 that ships in a similar configuration to the ALC295 in previous models. It requires the same quirk for headset detection. Signed-off-by: Dustin L. Howett Cc: Link: https://patch.msgid.link/20240806-alsa-hda-realtek-add-framework-laptop-13-intel-core-ultra-to-quirks-v1-1-42d6ce2dbf14@howett.net Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 1645d21d422f..480e82df7a4c 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10678,6 +10678,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x8086, 0x3038, "Intel NUC 13", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0xf111, 0x0001, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0006, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0xf111, 0x0009, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), #if 0 /* Below is a quirk table taken from the old code. From 264b5b5980061d8c6a6a30c031cdec1179fe2bae Mon Sep 17 00:00:00 2001 From: David Gow Date: Sun, 4 Aug 2024 17:18:47 +0800 Subject: [PATCH 0554/1052] drm/i915: Allow evicting to use the requested placement In commit a78a8da51b36 ("drm/ttm: replace busy placement with flags v6"), the old system of having a separate placement list (for placements which should be used without eviction) and a 'busy' placement list (for placements which should be attempted if eviction is required) was replaced with a new one where placements could be marked 'FALLBACK' (to be attempted if eviction is required) or 'DESIRED' (to be attempted first, but not if eviction is required). i915 had always included the requested placement in the list of 'busy' placements: i.e., the placement could be used either if eviction is required or not. But when the new system was put in place, the requested (first) placement was marked 'DESIRED', so would never be used if eviction became necessary. While a bug in the original commit prevented this flag from working, when this was fixed in 4a0e7b3c ("drm/i915: fix applying placement flag"), it caused long hangs on DG2 systems with small BAR. 
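Under the new scheme the three possibilities map onto the ttm_place flags roughly as in the schematic below (a sketch against the TTM flag definitions, not the i915 code itself):

#include <drm/ttm/ttm_placement.h>

/* Schematic only: how the flags correspond to the old two-list model. */
static void example_mark_placements(struct ttm_place *requested,
				    struct ttm_place *fast_only,
				    struct ttm_place *evict_only)
{
	/* Usable both with and without eviction - what the requested
	 * placement needs to be, and what this fix restores: no flag. */
	requested->flags &= ~(TTM_PL_FLAG_DESIRED | TTM_PL_FLAG_FALLBACK);

	/* Attempted first, but skipped once eviction becomes necessary. */
	fast_only->flags |= TTM_PL_FLAG_DESIRED;

	/* Only attempted when eviction is already required (old "busy" list). */
	evict_only->flags |= TTM_PL_FLAG_FALLBACK;
}
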
Don't mark the requested placement DESIRED (or FALLBACK), allowing it to be used in both situations. This matches the old behaviour, and resolves the hangs. Thanks to Justin Brewer for bisecting the issue. Fixes: a78a8da51b36 ("drm/ttm: replace busy placement with flags v6") Fixes: 4a0e7b3c3753 ("drm/i915: fix applying placement flag") Link: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11255 Signed-off-by: David Gow Reviewed-by: Jonathan Cavitt Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240804091851.122186-2-david@davidgow.net (cherry picked from commit 54bf0af90844fbf18f5be3272eda69198dfdb622) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index e6f177183c0f..fb848fd8ba15 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -165,7 +165,6 @@ i915_ttm_placement_from_obj(const struct drm_i915_gem_object *obj, i915_ttm_place_from_region(num_allowed ? obj->mm.placements[0] : obj->mm.region, &places[0], obj->bo_offset, obj->base.size, flags); - places[0].flags |= TTM_PL_FLAG_DESIRED; /* Cache this on object? */ for (i = 0; i < num_allowed; ++i) { From 787db3bb6ed5cee56fc97fecdd61517d89763f0a Mon Sep 17 00:00:00 2001 From: David Gow Date: Sun, 4 Aug 2024 17:18:48 +0800 Subject: [PATCH 0555/1052] drm/i915: Attempt to get pages without eviction first In commit a78a8da51b36 ("drm/ttm: replace busy placement with flags v6"), __i915_ttm_get_pages was updated to use flags instead of the separate 'busy' placement list. However, the behaviour was subtly changed. Originally, the function would attempt to use the preferred placement without eviction, and give an opportunity to restart the operation before falling back to allowing eviction. This was unintentionally changed, as the preferred placement was not given the TTM_PL_FLAG_DESIRED flag, and so eviction could be triggered in that first pass. This caused thrashing, and a significant performance regression on DG2 systems with small BAR. For example, Minecraft and Team Fortress 2 would drop to single-digit framerates. Restore the original behaviour by marking the initial placement as desired on that first attempt. Also, rework this to use a separate struct ttm_palcement, as the individual placements are marked 'const', so hot-patching the flags is even more dodgy than before. Thanks to Justin Brewer for bisecting this. 
Fixes: a78a8da51b36 ("drm/ttm: replace busy placement with flags v6") Link: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11255 Signed-off-by: David Gow Reviewed-by: Jonathan Cavitt Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240804091851.122186-3-david@davidgow.net (cherry picked from commit 92653f2a572505adaf7f13f695c1907e71a1dc84) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index fb848fd8ba15..5c72462d1f57 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -778,13 +778,16 @@ static int __i915_ttm_get_pages(struct drm_i915_gem_object *obj, .interruptible = true, .no_wait_gpu = false, }; - int real_num_busy; + struct ttm_placement initial_placement; + struct ttm_place initial_place; int ret; /* First try only the requested placement. No eviction. */ - real_num_busy = placement->num_placement; - placement->num_placement = 1; - ret = ttm_bo_validate(bo, placement, &ctx); + initial_placement.num_placement = 1; + memcpy(&initial_place, placement->placement, sizeof(struct ttm_place)); + initial_place.flags |= TTM_PL_FLAG_DESIRED; + initial_placement.placement = &initial_place; + ret = ttm_bo_validate(bo, &initial_placement, &ctx); if (ret) { ret = i915_ttm_err_to_gem(ret); /* @@ -799,7 +802,6 @@ static int __i915_ttm_get_pages(struct drm_i915_gem_object *obj, * If the initial attempt fails, allow all accepted placements, * evicting if necessary. */ - placement->num_placement = real_num_busy; ret = ttm_bo_validate(bo, placement, &ctx); if (ret) return i915_ttm_err_to_gem(ret); From e688c220732e518c2eb1639e9ef77d4a9311713c Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Wed, 7 Aug 2024 17:37:11 +0800 Subject: [PATCH 0556/1052] LoongArch: Enable general EFI poweroff method efi_shutdown_init() can register a general sys_off handler named efi_power_off(). Enable this by providing efi_poweroff_required(), like arm and x86. Since EFI poweroff is also supported on LoongArch, and the enablement makes the poweroff function usable for hardwares which lack ACPI S5. We prefer ACPI poweroff rather than EFI poweroff (like x86), so we only require EFI poweroff if acpi_gbl_reduced_hardware or acpi_no_s5 is true. 
Cc: stable@vger.kernel.org Acked-by: Ard Biesheuvel Signed-off-by: Miao Wang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/efi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/loongarch/kernel/efi.c b/arch/loongarch/kernel/efi.c index 000825406c1f..2bf86aeda874 100644 --- a/arch/loongarch/kernel/efi.c +++ b/arch/loongarch/kernel/efi.c @@ -66,6 +66,12 @@ void __init efi_runtime_init(void) set_bit(EFI_RUNTIME_SERVICES, &efi.flags); } +bool efi_poweroff_required(void) +{ + return efi_enabled(EFI_RUNTIME_SERVICES) && + (acpi_gbl_reduced_hardware || acpi_no_s5); +} + unsigned long __initdata screen_info_table = EFI_INVALID_TABLE_ADDR; #if defined(CONFIG_SYSFB) || defined(CONFIG_EFI_EARLYCON) From 4574815abf43e2bf05643e1b3f7a2e5d6df894f0 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 7 Aug 2024 17:37:11 +0800 Subject: [PATCH 0557/1052] LoongArch: Use accessors to page table entries instead of direct dereference As very well explained in commit 20a004e7b017cce282 ("arm64: mm: Use READ_ONCE/WRITE_ONCE when accessing page tables"), an architecture whose page table walker can modify the PTE in parallel must use READ_ONCE()/ WRITE_ONCE() macro to avoid any compiler transformation. So apply that to LoongArch which is such an architecture, in order to avoid potential problems. Similar to commit edf955647269422e ("riscv: Use accessors to page table entries instead of direct dereference"). Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/hugetlb.h | 4 +-- arch/loongarch/include/asm/kfence.h | 6 ++-- arch/loongarch/include/asm/pgtable.h | 50 +++++++++++++++++----------- arch/loongarch/kvm/mmu.c | 8 ++--- arch/loongarch/mm/hugetlbpage.c | 6 ++-- arch/loongarch/mm/init.c | 10 +++--- arch/loongarch/mm/kasan_init.c | 10 +++--- arch/loongarch/mm/pgtable.c | 2 +- 8 files changed, 53 insertions(+), 43 deletions(-) diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h index aa44b3fe43dd..5da32c00d483 100644 --- a/arch/loongarch/include/asm/hugetlb.h +++ b/arch/loongarch/include/asm/hugetlb.h @@ -34,7 +34,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t clear; - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); pte_val(clear) = (unsigned long)invalid_pte_table; set_pte_at(mm, addr, ptep, clear); @@ -65,7 +65,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t pte, int dirty) { - int changed = !pte_same(*ptep, pte); + int changed = !pte_same(ptep_get(ptep), pte); if (changed) { set_pte_at(vma->vm_mm, addr, ptep, pte); diff --git a/arch/loongarch/include/asm/kfence.h b/arch/loongarch/include/asm/kfence.h index 92636e82957c..da9e93024626 100644 --- a/arch/loongarch/include/asm/kfence.h +++ b/arch/loongarch/include/asm/kfence.h @@ -53,13 +53,13 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect) { pte_t *pte = virt_to_kpte(addr); - if (WARN_ON(!pte) || pte_none(*pte)) + if (WARN_ON(!pte) || pte_none(ptep_get(pte))) return false; if (protect) - set_pte(pte, __pte(pte_val(*pte) & ~(_PAGE_VALID | _PAGE_PRESENT))); + set_pte(pte, __pte(pte_val(ptep_get(pte)) & ~(_PAGE_VALID | _PAGE_PRESENT))); else - set_pte(pte, __pte(pte_val(*pte) | (_PAGE_VALID | _PAGE_PRESENT))); + set_pte(pte, __pte(pte_val(ptep_get(pte)) | (_PAGE_VALID | _PAGE_PRESENT))); preempt_disable(); local_flush_tlb_one(addr); diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 3fbf1f37c58e..85431f20a14d 100644 --- 
a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -106,6 +106,9 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define KFENCE_AREA_START (VMEMMAP_END + 1) #define KFENCE_AREA_END (KFENCE_AREA_START + KFENCE_AREA_SIZE - 1) +#define ptep_get(ptep) READ_ONCE(*(ptep)) +#define pmdp_get(pmdp) READ_ONCE(*(pmdp)) + #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #ifndef __PAGETABLE_PMD_FOLDED @@ -147,11 +150,6 @@ static inline int p4d_present(p4d_t p4d) return p4d_val(p4d) != (unsigned long)invalid_pud_table; } -static inline void p4d_clear(p4d_t *p4dp) -{ - p4d_val(*p4dp) = (unsigned long)invalid_pud_table; -} - static inline pud_t *p4d_pgtable(p4d_t p4d) { return (pud_t *)p4d_val(p4d); @@ -159,7 +157,12 @@ static inline pud_t *p4d_pgtable(p4d_t p4d) static inline void set_p4d(p4d_t *p4d, p4d_t p4dval) { - *p4d = p4dval; + WRITE_ONCE(*p4d, p4dval); +} + +static inline void p4d_clear(p4d_t *p4dp) +{ + set_p4d(p4dp, __p4d((unsigned long)invalid_pud_table)); } #define p4d_phys(p4d) PHYSADDR(p4d_val(p4d)) @@ -193,17 +196,20 @@ static inline int pud_present(pud_t pud) return pud_val(pud) != (unsigned long)invalid_pmd_table; } -static inline void pud_clear(pud_t *pudp) -{ - pud_val(*pudp) = ((unsigned long)invalid_pmd_table); -} - static inline pmd_t *pud_pgtable(pud_t pud) { return (pmd_t *)pud_val(pud); } -#define set_pud(pudptr, pudval) do { *(pudptr) = (pudval); } while (0) +static inline void set_pud(pud_t *pud, pud_t pudval) +{ + WRITE_ONCE(*pud, pudval); +} + +static inline void pud_clear(pud_t *pudp) +{ + set_pud(pudp, __pud((unsigned long)invalid_pmd_table)); +} #define pud_phys(pud) PHYSADDR(pud_val(pud)) #define pud_page(pud) (pfn_to_page(pud_phys(pud) >> PAGE_SHIFT)) @@ -231,12 +237,15 @@ static inline int pmd_present(pmd_t pmd) return pmd_val(pmd) != (unsigned long)invalid_pte_table; } -static inline void pmd_clear(pmd_t *pmdp) +static inline void set_pmd(pmd_t *pmd, pmd_t pmdval) { - pmd_val(*pmdp) = ((unsigned long)invalid_pte_table); + WRITE_ONCE(*pmd, pmdval); } -#define set_pmd(pmdptr, pmdval) do { *(pmdptr) = (pmdval); } while (0) +static inline void pmd_clear(pmd_t *pmdp) +{ + set_pmd(pmdp, __pmd((unsigned long)invalid_pte_table)); +} #define pmd_phys(pmd) PHYSADDR(pmd_val(pmd)) @@ -314,7 +323,8 @@ extern void paging_init(void); static inline void set_pte(pte_t *ptep, pte_t pteval) { - *ptep = pteval; + WRITE_ONCE(*ptep, pteval); + if (pte_val(pteval) & _PAGE_GLOBAL) { pte_t *buddy = ptep_buddy(ptep); /* @@ -341,8 +351,8 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) : [buddy] "+m" (buddy->pte), [tmp] "=&r" (tmp) : [global] "r" (page_global)); #else /* !CONFIG_SMP */ - if (pte_none(*buddy)) - pte_val(*buddy) = pte_val(*buddy) | _PAGE_GLOBAL; + if (pte_none(ptep_get(buddy))) + WRITE_ONCE(*buddy, __pte(pte_val(ptep_get(buddy)) | _PAGE_GLOBAL)); #endif /* CONFIG_SMP */ } } @@ -350,7 +360,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { /* Preserve global status for the pair */ - if (pte_val(*ptep_buddy(ptep)) & _PAGE_GLOBAL) + if (pte_val(ptep_get(ptep_buddy(ptep))) & _PAGE_GLOBAL) set_pte(ptep, __pte(_PAGE_GLOBAL)); else set_pte(ptep, __pte(0)); @@ -603,7 +613,7 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd) static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - pmd_t old = *pmdp; + pmd_t old = pmdp_get(pmdp); 
pmd_clear(pmdp); diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 2634a9e8d82c..28681dfb4b85 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -714,19 +714,19 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, * value) and then p*d_offset() walks into the target huge page instead * of the old page table (sees the new value). */ - pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); + pgd = pgdp_get(pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; - p4d = READ_ONCE(*p4d_offset(&pgd, hva)); + p4d = p4dp_get(p4d_offset(&pgd, hva)); if (p4d_none(p4d) || !p4d_present(p4d)) goto out; - pud = READ_ONCE(*pud_offset(&p4d, hva)); + pud = pudp_get(pud_offset(&p4d, hva)); if (pud_none(pud) || !pud_present(pud)) goto out; - pmd = READ_ONCE(*pmd_offset(&pud, hva)); + pmd = pmdp_get(pmd_offset(&pud, hva)); if (pmd_none(pmd) || !pmd_present(pmd)) goto out; diff --git a/arch/loongarch/mm/hugetlbpage.c b/arch/loongarch/mm/hugetlbpage.c index 12222c56cb59..e4068906143b 100644 --- a/arch/loongarch/mm/hugetlbpage.c +++ b/arch/loongarch/mm/hugetlbpage.c @@ -39,11 +39,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - if (pgd_present(*pgd)) { + if (pgd_present(pgdp_get(pgd))) { p4d = p4d_offset(pgd, addr); - if (p4d_present(*p4d)) { + if (p4d_present(p4dp_get(p4d))) { pud = pud_offset(p4d, addr); - if (pud_present(*pud)) + if (pud_present(pudp_get(pud))) pmd = pmd_offset(pud, addr); } } diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index bf789d114c2d..8a87a482c8f4 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -141,7 +141,7 @@ void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next) { - int huge = pmd_val(*pmd) & _PAGE_HUGE; + int huge = pmd_val(pmdp_get(pmd)) & _PAGE_HUGE; if (huge) vmemmap_verify((pte_t *)pmd, node, addr, next); @@ -173,7 +173,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) pud_t *pud; pmd_t *pmd; - if (p4d_none(*p4d)) { + if (p4d_none(p4dp_get(p4d))) { pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!pud) panic("%s: Failed to allocate memory\n", __func__); @@ -184,7 +184,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) } pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { + if (pud_none(pudp_get(pud))) { pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!pmd) panic("%s: Failed to allocate memory\n", __func__); @@ -195,7 +195,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) } pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { + if (!pmd_present(pmdp_get(pmd))) { pte_t *pte; pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); @@ -216,7 +216,7 @@ void __init __set_fixmap(enum fixed_addresses idx, BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); ptep = populate_kernel_pte(addr); - if (!pte_none(*ptep)) { + if (!pte_none(ptep_get(ptep))) { pte_ERROR(*ptep); return; } diff --git a/arch/loongarch/mm/kasan_init.c b/arch/loongarch/mm/kasan_init.c index c608adc99845..427d6b1aec09 100644 --- a/arch/loongarch/mm/kasan_init.c +++ b/arch/loongarch/mm/kasan_init.c @@ -105,7 +105,7 @@ static phys_addr_t __init kasan_alloc_zeroed_page(int node) static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, bool early) { - if (__pmd_none(early, READ_ONCE(*pmdp))) { + if (__pmd_none(early, pmdp_get(pmdp))) { phys_addr_t pte_phys = early ? 
__pa_symbol(kasan_early_shadow_pte) : kasan_alloc_zeroed_page(node); if (!early) @@ -118,7 +118,7 @@ static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, bool early) { - if (__pud_none(early, READ_ONCE(*pudp))) { + if (__pud_none(early, pudp_get(pudp))) { phys_addr_t pmd_phys = early ? __pa_symbol(kasan_early_shadow_pmd) : kasan_alloc_zeroed_page(node); if (!early) @@ -131,7 +131,7 @@ static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, bool early) { - if (__p4d_none(early, READ_ONCE(*p4dp))) { + if (__p4d_none(early, p4dp_get(p4dp))) { phys_addr_t pud_phys = early ? __pa_symbol(kasan_early_shadow_pud) : kasan_alloc_zeroed_page(node); if (!early) @@ -154,7 +154,7 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, : kasan_alloc_zeroed_page(node); next = addr + PAGE_SIZE; set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && __pte_none(early, READ_ONCE(*ptep))); + } while (ptep++, addr = next, addr != end && __pte_none(early, ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, @@ -166,7 +166,7 @@ static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, do { next = pmd_addr_end(addr, end); kasan_pte_populate(pmdp, addr, next, node, early); - } while (pmdp++, addr = next, addr != end && __pmd_none(early, READ_ONCE(*pmdp))); + } while (pmdp++, addr = next, addr != end && __pmd_none(early, pmdp_get(pmdp))); } static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr, diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index bda018150000..eb6a29b491a7 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -128,7 +128,7 @@ pmd_t mk_pmd(struct page *page, pgprot_t prot) void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - *pmdp = pmd; + WRITE_ONCE(*pmdp, pmd); flush_tlb_all(); } From 296b03ce389b4f7b3d7ea5664e53d432fb17e745 Mon Sep 17 00:00:00 2001 From: Yuli Wang Date: Wed, 7 Aug 2024 17:37:14 +0800 Subject: [PATCH 0558/1052] LoongArch: KVM: Remove unnecessary definition of KVM_PRIVATE_MEM_SLOTS 1. "KVM_PRIVATE_MEM_SLOTS" is renamed as "KVM_INTERNAL_MEM_SLOTS". 2. "KVM_INTERNAL_MEM_SLOTS" defaults to zero, so it is not necessary to define it in LoongArch's asm/kvm_host.h. 
Link: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bdd1c37a315bc50ab14066c4852bc8dcf070451e Link: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b075450868dbc0950f0942617f222eeb989cad10 Reviewed-by: Bibo Mao Signed-off-by: Wentao Guan Signed-off-by: Yuli Wang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 44b54965f5b4..5f0677e03817 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -26,8 +26,6 @@ #define KVM_MAX_VCPUS 256 #define KVM_MAX_CPUCFG_REGS 21 -/* memory slots that does not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 0 #define KVM_HALT_POLL_NS_DEFAULT 500000 #define KVM_REQ_TLB_FLUSH_GPA KVM_ARCH_REQ(0) From 494b0792d962e8efac72b3a5b6d9bcd4e6fa8cf0 Mon Sep 17 00:00:00 2001 From: Dandan Zhang Date: Wed, 7 Aug 2024 17:37:14 +0800 Subject: [PATCH 0559/1052] LoongArch: KVM: Remove undefined a6 argument comment for kvm_hypercall() The kvm_hypercall() set for LoongArch is limited to a1-a5. So the mention of a6 in the comment is undefined that needs to be rectified. Reviewed-by: Bibo Mao Signed-off-by: Wentao Guan Signed-off-by: Dandan Zhang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_para.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_para.h b/arch/loongarch/include/asm/kvm_para.h index 335fb86778e2..43ec61589e6c 100644 --- a/arch/loongarch/include/asm/kvm_para.h +++ b/arch/loongarch/include/asm/kvm_para.h @@ -39,9 +39,9 @@ struct kvm_steal_time { * Hypercall interface for KVM hypervisor * * a0: function identifier - * a1-a6: args + * a1-a5: args * Return value will be placed in a0. - * Up to 6 arguments are passed in a1, a2, a3, a4, a5, a6. + * Up to 5 arguments are passed in a1, a2, a3, a4, a5. */ static __always_inline long kvm_hypercall0(u64 fid) { From 382b6eabb0316b7334d97afbdcf33a4e20b0ecd8 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 2 Aug 2024 14:04:27 +0000 Subject: [PATCH 0560/1052] usb: gadget: f_fs: restore ffs_func_disable() functionality The blamed commit made ffs_func_disable() always return -EINVAL as the method calls ffs_func_set_alt() with the ``alt`` argument being ``(unsigned)-1``, which is always greater than MAX_ALT_SETTINGS. Use the MAX_ALT_SETTINGS check just in the f->set_alt() code path, f->disable() doesn't care about the ``alt`` parameter. Make a surgical fix, but really the f->disable() code shall be pulled out from ffs_func_set_alt(), the code will become clearer. A patch will follow. Note that ffs_func_disable() always returning -EINVAL made pixel6 crash on USB disconnect. 
Fixes: 2f550553e23c ("usb: gadget: f_fs: Add the missing get_alt callback") Cc: stable Reported-by: William McVicker Signed-off-by: Tudor Ambarus Link: https://lore.kernel.org/r/20240802140428.2000312-2-tudor.ambarus@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index d8b096859337..0bfed1741b3e 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3731,10 +3731,10 @@ static int ffs_func_set_alt(struct usb_function *f, struct ffs_data *ffs = func->ffs; int ret = 0, intf; - if (alt > MAX_ALT_SETTINGS) - return -EINVAL; - if (alt != (unsigned)-1) { + if (alt > MAX_ALT_SETTINGS) + return -EINVAL; + intf = ffs_func_revmap_intf(func, interface); if (intf < 0) return intf; From a59d8cc9292c58bccec7d8fa27eb59d0a3a6aa0d Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 2 Aug 2024 14:04:28 +0000 Subject: [PATCH 0561/1052] usb: gadget: f_fs: pull out f->disable() from ffs_func_set_alt() The ``alt`` parameter was used as a way to differentiate between f->disable() and f->set_alt(). As the code paths diverge quite a bit, pull out the f->disable() code from ffs_func_set_alt(), everything will become clearer and less error prone. No change in functionality intended. Signed-off-by: Tudor Ambarus Link: https://lore.kernel.org/r/20240802140428.2000312-3-tudor.ambarus@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 36 ++++++++++++++++++------------ 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 0bfed1741b3e..e0ceaa721949 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3731,14 +3731,12 @@ static int ffs_func_set_alt(struct usb_function *f, struct ffs_data *ffs = func->ffs; int ret = 0, intf; - if (alt != (unsigned)-1) { - if (alt > MAX_ALT_SETTINGS) - return -EINVAL; + if (alt > MAX_ALT_SETTINGS) + return -EINVAL; - intf = ffs_func_revmap_intf(func, interface); - if (intf < 0) - return intf; - } + intf = ffs_func_revmap_intf(func, interface); + if (intf < 0) + return intf; if (ffs->func) ffs_func_eps_disable(ffs->func); @@ -3753,12 +3751,6 @@ static int ffs_func_set_alt(struct usb_function *f, if (ffs->state != FFS_ACTIVE) return -ENODEV; - if (alt == (unsigned)-1) { - ffs->func = NULL; - ffs_event_add(ffs, FUNCTIONFS_DISABLE); - return 0; - } - ffs->func = func; ret = ffs_func_eps_enable(func); if (ret >= 0) { @@ -3770,7 +3762,23 @@ static int ffs_func_set_alt(struct usb_function *f, static void ffs_func_disable(struct usb_function *f) { - ffs_func_set_alt(f, 0, (unsigned)-1); + struct ffs_function *func = ffs_func_from_usb(f); + struct ffs_data *ffs = func->ffs; + + if (ffs->func) + ffs_func_eps_disable(ffs->func); + + if (ffs->state == FFS_DEACTIVATED) { + ffs->state = FFS_CLOSING; + INIT_WORK(&ffs->reset_work, ffs_reset_work); + schedule_work(&ffs->reset_work); + return; + } + + if (ffs->state == FFS_ACTIVE) { + ffs->func = NULL; + ffs_event_add(ffs, FUNCTIONFS_DISABLE); + } } static int ffs_func_setup(struct usb_function *f, From becac61a771a4a127e0c38c28110a55cb84d9f41 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 2 Aug 2024 14:41:56 +0800 Subject: [PATCH 0562/1052] usb: typec: tcpm: avoid sink goto SNK_UNATTACHED state if not received source capability message Since commit (122968f8dda8 usb: typec: 
tcpm: avoid resets for missing source capability messages), state will change from SNK_WAIT_CAPABILITIES to SNK_WAIT_CAPABILITIES_TIMEOUT. We need to change SNK_WAIT_CAPABILITIES -> SNK_READY path to SNK_WAIT_CAPABILITIES_TIMEOUT -> SNK_READY accordingly. Otherwise, the sink port will never change to SNK_READY state if the source does't have PD capability. [ 503.547183] pending state change SNK_WAIT_CAPABILITIES -> SNK_WAIT_CAPABILITIES_TIMEOUT @ 310 ms [rev3 NONE_AMS] [ 503.857239] state change SNK_WAIT_CAPABILITIES -> SNK_WAIT_CAPABILITIES_TIMEOUT [delayed 310 ms] [ 503.857254] PD TX, header: 0x87 [ 503.862440] PD TX complete, status: 2 [ 503.862484] state change SNK_WAIT_CAPABILITIES_TIMEOUT -> SNK_UNATTACHED [rev3 NONE_AMS] Fixes: 122968f8dda8 ("usb: typec: tcpm: avoid resets for missing source capability messages") Cc: stable@vger.kernel.org Signed-off-by: Xu Yang Reviewed-by: Sebastian Reichel Reviewed-by: Heikki Krogerus Reviewed-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20240802064156.1846768-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 26f9006e95e1..cce39818e99a 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -4515,7 +4515,7 @@ static inline enum tcpm_state hard_reset_state(struct tcpm_port *port) return ERROR_RECOVERY; if (port->pwr_role == TYPEC_SOURCE) return SRC_UNATTACHED; - if (port->state == SNK_WAIT_CAPABILITIES) + if (port->state == SNK_WAIT_CAPABILITIES_TIMEOUT) return SNK_READY; return SNK_UNATTACHED; } From 65ba8cef0416816b912c04850fc2468329994353 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 6 Aug 2024 14:20:29 +0300 Subject: [PATCH 0563/1052] usb: typec: ucsi: Fix a deadlock in ucsi_send_command_common() The function returns with the ppm_lock held if the PPM is busy or there's an error. Reported-and-tested-by: Luciano Coelho Fixes: 5e9c1662a89b ("usb: typec: ucsi: rework command execution functions") Signed-off-by: Heikki Krogerus Reported-by: Luciano Coelho Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20240806112029.2984319-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index dcd3765cc1f5..432a2d6266d7 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -238,13 +238,10 @@ static int ucsi_send_command_common(struct ucsi *ucsi, u64 cmd, mutex_lock(&ucsi->ppm_lock); ret = ucsi_run_command(ucsi, cmd, &cci, data, size, conn_ack); - if (cci & UCSI_CCI_BUSY) { - ret = ucsi_run_command(ucsi, UCSI_CANCEL, &cci, NULL, 0, false); - return ret ? ret : -EBUSY; - } - - if (cci & UCSI_CCI_ERROR) - return ucsi_read_error(ucsi, connector_num); + if (cci & UCSI_CCI_BUSY) + ret = ucsi_run_command(ucsi, UCSI_CANCEL, &cci, NULL, 0, false) ?: -EBUSY; + else if (cci & UCSI_CCI_ERROR) + ret = ucsi_read_error(ucsi, connector_num); mutex_unlock(&ucsi->ppm_lock); return ret; From cff59d8631e1409ffdd22d9d717e15810181b32c Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 1 Aug 2024 13:25:48 +0200 Subject: [PATCH 0564/1052] s390/uv: Panic for set and remove shared access UVC errors The return value uv_set_shared() and uv_remove_shared() (which are wrappers around the share() function) is not always checked. 
The system integrity of a protected guest depends on the Share and Unshare UVCs being successful. This means that any caller that fails to check the return value will compromise the security of the protected guest. No code path that would lead to such violation of the security guarantees is currently exercised, since all the areas that are shared never get unshared during the lifetime of the system. This might change and become an issue in the future. The Share and Unshare UVCs can only fail in case of hypervisor misbehaviour (either a bug or malicious behaviour). In such cases there is no reasonable way forward, and the system needs to panic. This patch replaces the return at the end of the share() function with a panic, to guarantee system integrity. Fixes: 5abb9351dfd9 ("s390/uv: introduce guest side ultravisor code") Signed-off-by: Claudio Imbrenda Reviewed-by: Christian Borntraeger Reviewed-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240801112548.85303-1-imbrenda@linux.ibm.com Message-ID: <20240801112548.85303-1-imbrenda@linux.ibm.com> [frankja@linux.ibm.com: Fixed up patch subject] Signed-off-by: Janosch Frank --- arch/s390/include/asm/uv.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 0b5f8f3e84f1..153d93468b77 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -441,7 +441,10 @@ static inline int share(unsigned long addr, u16 cmd) if (!uv_call(0, (u64)&uvcb)) return 0; - return -EINVAL; + pr_err("%s UVC failed (rc: 0x%x, rrc: 0x%x), possible hypervisor bug.\n", + uvcb.header.cmd == UVC_CMD_SET_SHARED_ACCESS ? "Share" : "Unshare", + uvcb.header.rc, uvcb.header.rrc); + panic("System security cannot be guaranteed unless the system panics now.\n"); } /* From ff9bf4b34104955017822e9bc42aeeb526ee2a80 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2024 21:14:08 -0400 Subject: [PATCH 0565/1052] lockdep: Fix lockdep_set_notrack_class() for CONFIG_LOCK_STAT We won't find a contended lock if it's not being tracked. Signed-off-by: Kent Overstreet --- kernel/locking/lockdep.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 58c88220a478..0349f957e672 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5936,6 +5936,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, ip); @@ -5978,6 +5981,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, _RET_IP_); From 7442b5cdf259e2fb112560904b7002ce48d15578 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2024 20:33:25 -0400 Subject: [PATCH 0566/1052] bcachefs: Don't rely on implicit unsigned -> signed integer conversion implicit integer conversion is a fertile source of bugs, and we really would rather not have the min()/max() macros doing it implicitly. bcachefs appears to be the only place in the kernel where this happens, so let's fix it. 
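For anyone unfamiliar with why this class of conversion bites, here is a minimal standalone illustration (plain C, not bcachefs code) of the comparison a mixed-signedness min() boils down to:

	#include <stdio.h>

	int main(void)
	{
		long long dirty = -1;			/* e.g. a corrupted, negative count */
		unsigned long long bucket_size = 512;	/* unsigned operand of the same rank */

		/*
		 * The usual arithmetic conversions turn dirty into ULLONG_MAX for
		 * the comparison, so the "minimum" silently becomes bucket_size.
		 */
		printf("dirty < bucket_size? %d\n", dirty < bucket_size);	/* prints 0 */
		return 0;
	}

Spelling the type out with min_t(s64, ...) keeps the comparison signed, which is what the hunk below does.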
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 8d2b62c9588e..f13e619b4b21 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -166,8 +166,8 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, * avoid overflowing LRU_TIME_BITS on a corrupted fs, when * bucket_sectors_dirty is (much) bigger than bucket_size */ - u64 d = min(bch2_bucket_sectors_dirty(a), - ca->mi.bucket_size); + u64 d = min_t(s64, bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } From 90b211fa2da3f36939e84b6426988832a62caf4b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2024 20:35:59 -0400 Subject: [PATCH 0567/1052] bcachefs: Add a comment for bucket helper types We've had bugs in the past with incorrect integer conversions in disk accounting code, which is why bucket helpers now always return s64s; add a comment explaining this. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index f13e619b4b21..96a0444ea78f 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -82,6 +82,14 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, bucket_data_type(bucket) != bucket_data_type(ptr); } +/* + * It is my general preference to use unsigned types for unsigned quantities - + * however, these helpers are used in disk accounting calculations run by + * triggers where the output will be negated and added to an s64. unsigned is + * right out even though all these quantities will fit in 32 bits, since it + * won't be sign extended correctly; u64 will negate "correctly", but s64 is the + * simpler option here. 
+ */ static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) { return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; From 02026e8931366158d7395f87afeb0b535210dbee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Aug 2024 22:49:31 -0400 Subject: [PATCH 0568/1052] bcachefs: Add missing bch2_trans_begin() call Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 4531c9ab3e12..7ee3b75480df 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -406,6 +406,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->read_pos, BTREE_ITER_slots); retry: + bch2_trans_begin(trans); rbio->bio.bi_status = 0; k = bch2_btree_iter_peek_slot(&iter); From c1e4446247b2a8919649fb9aae2d86f53bf3d1e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Aug 2024 21:02:34 -0400 Subject: [PATCH 0569/1052] bcachefs: Improved allocator debugging for ec chasing down a device removal deadlock with erasure coding Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 ++++-- fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/ec.c | 31 ++++++++++++++++++++----------- fs/bcachefs/sysfs.c | 6 +++++- 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 618d2ff0292e..8683fe4fae5b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1603,7 +1603,8 @@ void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct ope prt_newline(out); } -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca) { struct open_bucket *ob; @@ -1613,7 +1614,8 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) + if (ob->valid && !ob->on_partial_list && + (!ca || ob->dev == ca->dev_idx)) bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 6da9e7e29026..c78a64ec0553 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -223,7 +223,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *); void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 9b5b5c9a6c63..8d0cca2f14ed 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2235,6 +2235,23 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->ec_stripes_heap_lock); } +static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, + struct ec_stripe_new *s) +{ + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs", + s->idx, s->nr_data, s->nr_parity, + bitmap_weight(s->blocks_allocated, s->nr_data), + 
atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_watermarks[s->h->watermark]); + + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i; + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) + prt_printf(out, " %u", s->blocks[i]); + prt_newline(out); +} + void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) { struct ec_stripe_head *h; @@ -2247,23 +2264,15 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) bch2_watermarks[h->watermark]); if (h->s) - prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", - h->s->idx, h->s->nr_data, h->s->nr_parity, - bitmap_weight(h->s->blocks_allocated, - h->s->nr_data)); + bch2_new_stripe_to_text(out, c, h->s); } mutex_unlock(&c->ec_stripe_head_lock); prt_printf(out, "in flight:\n"); mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(s, &c->ec_stripe_new_list, list) { - prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", - s->idx, s->nr_data, s->nr_parity, - atomic_read(&s->ref[STRIPE_REF_io]), - atomic_read(&s->ref[STRIPE_REF_stripe]), - bch2_watermarks[s->h->watermark]); - } + list_for_each_entry(s, &c->ec_stripe_new_list, list) + bch2_new_stripe_to_text(out, c, s); mutex_unlock(&c->ec_stripe_new_lock); } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1c0d1fb20276..f393023a3ae2 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -367,7 +367,7 @@ SHOW(bch2_fs) bch2_stripes_heap_to_text(out, c); if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c); + bch2_open_buckets_to_text(out, c, NULL); if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); @@ -811,6 +811,9 @@ SHOW(bch2_dev) if (attr == &sysfs_alloc_debug) bch2_dev_alloc_debug_to_text(out, ca); + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c, ca); + return 0; } @@ -892,6 +895,7 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, + &sysfs_open_buckets, NULL }; From 2caca9fb166f82937110368768002511628e6e1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Aug 2024 23:30:48 -0400 Subject: [PATCH 0570/1052] bcachefs: ec should not allocate from ro devs This fixes a device removal deadlock when using erasure coding. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8d0cca2f14ed..84f1cbf6497f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1809,6 +1809,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); BUG_ON(v->nr_redundant != h->s->nr_parity); + /* * We bypass the sector allocator which normally does this: */ + bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { __clear_bit(v->ptrs[i].dev, devs.d); if (i < h->s->nr_data) From 019f87f1ef967c5a5b263f21ad100f46c874505a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Fri, 19 Jul 2024 10:57:53 +0200 Subject: [PATCH 0571/1052] platform: cznic: turris-omnia-mcu: Make watchdog code optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the watchdog part of the driver optional, under a boolean config option. Move the dependency to WATCHDOG to this new option, and change the WATCHDOG_CORE dependency to selection, as is done in most watchdog drivers. 
This makes the turris-omnia-mcu driver available for compilation even if WATCHDOG is disabled. Fixes: ed46f1f7731d ("platform: cznic: turris-omnia-mcu: fix Kconfig dependencies") Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20240719085756.30598-2-kabel@kernel.org Signed-off-by: Arnd Bergmann --- drivers/platform/cznic/Kconfig | 17 ++++++++++++++--- drivers/platform/cznic/Makefile | 2 +- drivers/platform/cznic/turris-omnia-mcu.h | 10 ++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/platform/cznic/Kconfig b/drivers/platform/cznic/Kconfig index cb0d4d686d8a..b56c343e21d6 100644 --- a/drivers/platform/cznic/Kconfig +++ b/drivers/platform/cznic/Kconfig @@ -17,11 +17,9 @@ config TURRIS_OMNIA_MCU depends on MACH_ARMADA_38X || COMPILE_TEST depends on I2C depends on OF - depends on WATCHDOG depends on GPIOLIB depends on HW_RANDOM depends on RTC_CLASS - depends on WATCHDOG_CORE select GPIOLIB_IRQCHIP help Say Y here to add support for the features implemented by the @@ -31,7 +29,6 @@ config TURRIS_OMNIA_MCU disabled) and the ability to configure wake up from this mode (via rtcwake) - true random number generator (if available on the MCU) - - MCU watchdog - GPIO pins - to get front button press events (the front button can be configured either to generate press events to the CPU or to change @@ -44,7 +41,21 @@ config TURRIS_OMNIA_MCU to be able to program SOC's OTP on board revisions 32+ - to get input from the LED output pins of the WAN ethernet PHY, LAN switch and MiniPCIe ports + Other features can be enabled by subsequent config options. To compile this driver as a module, choose M here; the module will be called turris-omnia-mcu. +if TURRIS_OMNIA_MCU + +config TURRIS_OMNIA_MCU_WATCHDOG + bool "Turris Omnia MCU watchdog" + default y + depends on WATCHDOG + select WATCHDOG_CORE + help + Say Y here to add support for watchdog provided by CZ.NIC's Turris + Omnia MCU. 
+ +endif # TURRIS_OMNIA_MCU + endif # CZNIC_PLATFORMS diff --git a/drivers/platform/cznic/Makefile b/drivers/platform/cznic/Makefile index eae4c6b341ff..7599b4835056 100644 --- a/drivers/platform/cznic/Makefile +++ b/drivers/platform/cznic/Makefile @@ -5,4 +5,4 @@ turris-omnia-mcu-y := turris-omnia-mcu-base.o turris-omnia-mcu-y += turris-omnia-mcu-gpio.o turris-omnia-mcu-y += turris-omnia-mcu-sys-off-wakeup.o turris-omnia-mcu-y += turris-omnia-mcu-trng.o -turris-omnia-mcu-y += turris-omnia-mcu-watchdog.o +turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_WATCHDOG) += turris-omnia-mcu-watchdog.o diff --git a/drivers/platform/cznic/turris-omnia-mcu.h b/drivers/platform/cznic/turris-omnia-mcu.h index 2ca56ae13aa9..85bf9ab39356 100644 --- a/drivers/platform/cznic/turris-omnia-mcu.h +++ b/drivers/platform/cznic/turris-omnia-mcu.h @@ -47,8 +47,10 @@ struct omnia_mcu { u32 rtc_alarm; bool front_button_poweron; +#ifdef CONFIG_TURRIS_OMNIA_MCU_WATCHDOG /* MCU watchdog */ struct watchdog_device wdt; +#endif /* true random number generator */ struct hwrng trng; @@ -189,6 +191,14 @@ extern const struct attribute_group omnia_mcu_poweroff_group; int omnia_mcu_register_gpiochip(struct omnia_mcu *mcu); int omnia_mcu_register_sys_off_and_wakeup(struct omnia_mcu *mcu); int omnia_mcu_register_trng(struct omnia_mcu *mcu); + +#ifdef CONFIG_TURRIS_OMNIA_MCU_WATCHDOG int omnia_mcu_register_watchdog(struct omnia_mcu *mcu); +#else +static inline int omnia_mcu_register_watchdog(struct omnia_mcu *mcu) +{ + return 0; +} +#endif #endif /* __TURRIS_OMNIA_MCU_H */ From c7da0d4e33ce262dbed7b9ae4cf013aad0f541f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Fri, 19 Jul 2024 10:57:54 +0200 Subject: [PATCH 0572/1052] platform: cznic: turris-omnia-mcu: Make TRNG code optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the TRNG part of the driver optional, under a boolean config option. This makes the driver turris-omnia-mcu available for compilation even if HW_RANDOM is disabled. Fixes: ed46f1f7731d ("platform: cznic: turris-omnia-mcu: fix Kconfig dependencies") Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20240719085756.30598-3-kabel@kernel.org Signed-off-by: Arnd Bergmann --- drivers/platform/cznic/Kconfig | 10 ++++++++-- drivers/platform/cznic/Makefile | 2 +- drivers/platform/cznic/turris-omnia-mcu.h | 10 ++++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/platform/cznic/Kconfig b/drivers/platform/cznic/Kconfig index b56c343e21d6..98f17562646e 100644 --- a/drivers/platform/cznic/Kconfig +++ b/drivers/platform/cznic/Kconfig @@ -18,7 +18,6 @@ config TURRIS_OMNIA_MCU depends on I2C depends on OF depends on GPIOLIB - depends on HW_RANDOM depends on RTC_CLASS select GPIOLIB_IRQCHIP help @@ -28,7 +27,6 @@ config TURRIS_OMNIA_MCU - board poweroff into true low power mode (with voltage regulators disabled) and the ability to configure wake up from this mode (via rtcwake) - - true random number generator (if available on the MCU) - GPIO pins - to get front button press events (the front button can be configured either to generate press events to the CPU or to change @@ -56,6 +54,14 @@ config TURRIS_OMNIA_MCU_WATCHDOG Say Y here to add support for watchdog provided by CZ.NIC's Turris Omnia MCU. +config TURRIS_OMNIA_MCU_TRNG + bool "Turris Omnia MCU true random number generator" + default y + depends on HW_RANDOM + help + Say Y here to add support for the true random number generator + provided by CZ.NIC's Turris Omnia MCU. 
+ endif # TURRIS_OMNIA_MCU endif # CZNIC_PLATFORMS diff --git a/drivers/platform/cznic/Makefile b/drivers/platform/cznic/Makefile index 7599b4835056..0c28fa859391 100644 --- a/drivers/platform/cznic/Makefile +++ b/drivers/platform/cznic/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_TURRIS_OMNIA_MCU) += turris-omnia-mcu.o turris-omnia-mcu-y := turris-omnia-mcu-base.o turris-omnia-mcu-y += turris-omnia-mcu-gpio.o turris-omnia-mcu-y += turris-omnia-mcu-sys-off-wakeup.o -turris-omnia-mcu-y += turris-omnia-mcu-trng.o +turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_TRNG) += turris-omnia-mcu-trng.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_WATCHDOG) += turris-omnia-mcu-watchdog.o diff --git a/drivers/platform/cznic/turris-omnia-mcu.h b/drivers/platform/cznic/turris-omnia-mcu.h index 85bf9ab39356..d07a32cfe238 100644 --- a/drivers/platform/cznic/turris-omnia-mcu.h +++ b/drivers/platform/cznic/turris-omnia-mcu.h @@ -52,9 +52,11 @@ struct omnia_mcu { struct watchdog_device wdt; #endif +#ifdef CONFIG_TURRIS_OMNIA_MCU_TRNG /* true random number generator */ struct hwrng trng; struct completion trng_entropy_ready; +#endif }; int omnia_cmd_write_read(const struct i2c_client *client, @@ -190,7 +192,15 @@ extern const struct attribute_group omnia_mcu_poweroff_group; int omnia_mcu_register_gpiochip(struct omnia_mcu *mcu); int omnia_mcu_register_sys_off_and_wakeup(struct omnia_mcu *mcu); + +#ifdef CONFIG_TURRIS_OMNIA_MCU_TRNG int omnia_mcu_register_trng(struct omnia_mcu *mcu); +#else +static inline int omnia_mcu_register_trng(struct omnia_mcu *mcu) +{ + return 0; +} +#endif #ifdef CONFIG_TURRIS_OMNIA_MCU_WATCHDOG int omnia_mcu_register_watchdog(struct omnia_mcu *mcu); From 74a22fced5a012c57f56d1cf7ea926cc366a2a3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Fri, 19 Jul 2024 10:57:55 +0200 Subject: [PATCH 0573/1052] platform: cznic: turris-omnia-mcu: Make poweroff and wakeup code optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the system poweroff and RTC wakeup part of the driver optional, under a boolean config option. Move the dependency to RTC_CLASS to this new option. This makes the turris-omnia-mcu driver available for compilation even if RTC_CLASS is disabled. Fixes: ed46f1f7731d ("platform: cznic: turris-omnia-mcu: fix Kconfig dependencies") Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20240719085756.30598-4-kabel@kernel.org Signed-off-by: Arnd Bergmann --- drivers/platform/cznic/Kconfig | 13 +++++++++---- drivers/platform/cznic/Makefile | 2 +- drivers/platform/cznic/turris-omnia-mcu-base.c | 2 ++ drivers/platform/cznic/turris-omnia-mcu.h | 13 +++++++++++-- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/platform/cznic/Kconfig b/drivers/platform/cznic/Kconfig index 98f17562646e..f02856226dd7 100644 --- a/drivers/platform/cznic/Kconfig +++ b/drivers/platform/cznic/Kconfig @@ -18,15 +18,11 @@ config TURRIS_OMNIA_MCU depends on I2C depends on OF depends on GPIOLIB - depends on RTC_CLASS select GPIOLIB_IRQCHIP help Say Y here to add support for the features implemented by the microcontroller on the CZ.NIC's Turris Omnia SOHO router. 
The features include: - - board poweroff into true low power mode (with voltage regulators - disabled) and the ability to configure wake up from this mode (via - rtcwake) - GPIO pins - to get front button press events (the front button can be configured either to generate press events to the CPU or to change @@ -45,6 +41,15 @@ config TURRIS_OMNIA_MCU if TURRIS_OMNIA_MCU +config TURRIS_OMNIA_MCU_SYSOFF_WAKEUP + bool "Turris Omnia MCU system off and RTC wakeup" + default y + depends on RTC_CLASS + help + Say Y here to add support for CZ.NIC's Turris Omnia board poweroff + into true low power mode (with voltage regulators disabled) and the + ability to configure wake up from this mode (via rtcwake). + config TURRIS_OMNIA_MCU_WATCHDOG bool "Turris Omnia MCU watchdog" default y diff --git a/drivers/platform/cznic/Makefile b/drivers/platform/cznic/Makefile index 0c28fa859391..380530ba74f7 100644 --- a/drivers/platform/cznic/Makefile +++ b/drivers/platform/cznic/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_TURRIS_OMNIA_MCU) += turris-omnia-mcu.o turris-omnia-mcu-y := turris-omnia-mcu-base.o turris-omnia-mcu-y += turris-omnia-mcu-gpio.o -turris-omnia-mcu-y += turris-omnia-mcu-sys-off-wakeup.o +turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP) += turris-omnia-mcu-sys-off-wakeup.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_TRNG) += turris-omnia-mcu-trng.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_WATCHDOG) += turris-omnia-mcu-watchdog.o diff --git a/drivers/platform/cznic/turris-omnia-mcu-base.c b/drivers/platform/cznic/turris-omnia-mcu-base.c index c68a7a84a951..7b514e60273d 100644 --- a/drivers/platform/cznic/turris-omnia-mcu-base.c +++ b/drivers/platform/cznic/turris-omnia-mcu-base.c @@ -198,7 +198,9 @@ static const struct attribute_group omnia_mcu_base_group = { static const struct attribute_group *omnia_mcu_groups[] = { &omnia_mcu_base_group, &omnia_mcu_gpio_group, +#ifdef CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP &omnia_mcu_poweroff_group, +#endif NULL }; diff --git a/drivers/platform/cznic/turris-omnia-mcu.h b/drivers/platform/cznic/turris-omnia-mcu.h index d07a32cfe238..75fa2111546f 100644 --- a/drivers/platform/cznic/turris-omnia-mcu.h +++ b/drivers/platform/cznic/turris-omnia-mcu.h @@ -42,10 +42,12 @@ struct omnia_mcu { unsigned long last_status; bool button_pressed_emul; +#ifdef CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP /* RTC device for configuring wake-up */ struct rtc_device *rtcdev; u32 rtc_alarm; bool front_button_poweron; +#endif #ifdef CONFIG_TURRIS_OMNIA_MCU_WATCHDOG /* MCU watchdog */ @@ -188,10 +190,17 @@ static inline int omnia_cmd_read_u8(const struct i2c_client *client, u8 cmd, extern const u8 omnia_int_to_gpio_idx[32]; extern const struct attribute_group omnia_mcu_gpio_group; -extern const struct attribute_group omnia_mcu_poweroff_group; - int omnia_mcu_register_gpiochip(struct omnia_mcu *mcu); + +#ifdef CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP +extern const struct attribute_group omnia_mcu_poweroff_group; int omnia_mcu_register_sys_off_and_wakeup(struct omnia_mcu *mcu); +#else +static inline int omnia_mcu_register_sys_off_and_wakeup(struct omnia_mcu *mcu) +{ + return 0; +} +#endif #ifdef CONFIG_TURRIS_OMNIA_MCU_TRNG int omnia_mcu_register_trng(struct omnia_mcu *mcu); From af340b7aa21c351ba08950f664af601888633614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Fri, 19 Jul 2024 10:57:56 +0200 Subject: [PATCH 0574/1052] platform: cznic: turris-omnia-mcu: Make GPIO code optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Make the GPIO part of the driver optional, under a boolean config option. Move the dependency to GPIOLIB and OF and the selection of GPIOLIB_IRQCHIP to this new option. This makes the turris-omnia-mcu driver available for compilation even if GPIOLIB or OF are disabled. Fixes: ed46f1f7731d ("platform: cznic: turris-omnia-mcu: fix Kconfig dependencies") Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20240719085756.30598-5-kabel@kernel.org Signed-off-by: Arnd Bergmann --- drivers/platform/cznic/Kconfig | 42 +++++++++++-------- drivers/platform/cznic/Makefile | 2 +- .../platform/cznic/turris-omnia-mcu-base.c | 2 + drivers/platform/cznic/turris-omnia-mcu.h | 9 ++++ 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/drivers/platform/cznic/Kconfig b/drivers/platform/cznic/Kconfig index f02856226dd7..a111eca8ff57 100644 --- a/drivers/platform/cznic/Kconfig +++ b/drivers/platform/cznic/Kconfig @@ -16,31 +16,38 @@ config TURRIS_OMNIA_MCU tristate "Turris Omnia MCU driver" depends on MACH_ARMADA_38X || COMPILE_TEST depends on I2C - depends on OF - depends on GPIOLIB - select GPIOLIB_IRQCHIP help Say Y here to add support for the features implemented by the microcontroller on the CZ.NIC's Turris Omnia SOHO router. - The features include: - - GPIO pins - - to get front button press events (the front button can be - configured either to generate press events to the CPU or to change - front LEDs panel brightness) - - to enable / disable USB port voltage regulators and to detect - USB overcurrent - - to detect MiniPCIe / mSATA card presence in MiniPCIe port 0 - - to configure resets of various peripherals on board revisions 32+ - - to enable / disable the VHV voltage regulator to the SOC in order - to be able to program SOC's OTP on board revisions 32+ - - to get input from the LED output pins of the WAN ethernet PHY, LAN - switch and MiniPCIe ports - Other features can be enabled by subsequent config options. + This option only enables the core part of the driver. Specific + features can be enabled by subsequent config options. To compile this driver as a module, choose M here; the module will be called turris-omnia-mcu. if TURRIS_OMNIA_MCU +config TURRIS_OMNIA_MCU_GPIO + bool "Turris Omnia MCU GPIOs" + default y + depends on GPIOLIB + depends on OF + select GPIOLIB_IRQCHIP + help + Say Y here to add support for controlling MCU GPIO pins and receiving + MCU interrupts on CZ.NIC's Turris Omnia. + This enables you to + - get front button press events (the front button can be configured + either to generate press events to the CPU or to change front LEDs + panel brightness), + - enable / disable USB port voltage regulators and to detect USB + overcurrent, + - detect MiniPCIe / mSATA card presence in MiniPCIe port 0, + - configure resets of various peripherals on board revisions 32+, + - enable / disable the VHV voltage regulator to the SOC in order to be + able to program SOC's OTP on board revisions 32+, + - get input from the LED output pins of the WAN ethernet PHY, LAN + switch and MiniPCIe ports. 
+ config TURRIS_OMNIA_MCU_SYSOFF_WAKEUP bool "Turris Omnia MCU system off and RTC wakeup" default y @@ -62,6 +69,7 @@ config TURRIS_OMNIA_MCU_WATCHDOG config TURRIS_OMNIA_MCU_TRNG bool "Turris Omnia MCU true random number generator" default y + depends on TURRIS_OMNIA_MCU_GPIO depends on HW_RANDOM help Say Y here to add support for the true random number generator diff --git a/drivers/platform/cznic/Makefile b/drivers/platform/cznic/Makefile index 380530ba74f7..ce6d997f34d6 100644 --- a/drivers/platform/cznic/Makefile +++ b/drivers/platform/cznic/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_TURRIS_OMNIA_MCU) += turris-omnia-mcu.o turris-omnia-mcu-y := turris-omnia-mcu-base.o -turris-omnia-mcu-y += turris-omnia-mcu-gpio.o +turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_GPIO) += turris-omnia-mcu-gpio.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP) += turris-omnia-mcu-sys-off-wakeup.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_TRNG) += turris-omnia-mcu-trng.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_WATCHDOG) += turris-omnia-mcu-watchdog.o diff --git a/drivers/platform/cznic/turris-omnia-mcu-base.c b/drivers/platform/cznic/turris-omnia-mcu-base.c index 7b514e60273d..58f9afae2867 100644 --- a/drivers/platform/cznic/turris-omnia-mcu-base.c +++ b/drivers/platform/cznic/turris-omnia-mcu-base.c @@ -197,7 +197,9 @@ static const struct attribute_group omnia_mcu_base_group = { static const struct attribute_group *omnia_mcu_groups[] = { &omnia_mcu_base_group, +#ifdef CONFIG_TURRIS_OMNIA_MCU_GPIO &omnia_mcu_gpio_group, +#endif #ifdef CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP &omnia_mcu_poweroff_group, #endif diff --git a/drivers/platform/cznic/turris-omnia-mcu.h b/drivers/platform/cznic/turris-omnia-mcu.h index 75fa2111546f..fed0d357fea3 100644 --- a/drivers/platform/cznic/turris-omnia-mcu.h +++ b/drivers/platform/cznic/turris-omnia-mcu.h @@ -33,6 +33,7 @@ struct omnia_mcu { u8 board_first_mac[ETH_ALEN]; u8 board_revision; +#ifdef CONFIG_TURRIS_OMNIA_MCU_GPIO /* GPIO chip */ struct gpio_chip gc; struct mutex lock; @@ -41,6 +42,7 @@ struct omnia_mcu { struct delayed_work button_release_emul_work; unsigned long last_status; bool button_pressed_emul; +#endif #ifdef CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP /* RTC device for configuring wake-up */ @@ -188,9 +190,16 @@ static inline int omnia_cmd_read_u8(const struct i2c_client *client, u8 cmd, return omnia_cmd_read(client, cmd, reply, sizeof(*reply)); } +#ifdef CONFIG_TURRIS_OMNIA_MCU_GPIO extern const u8 omnia_int_to_gpio_idx[32]; extern const struct attribute_group omnia_mcu_gpio_group; int omnia_mcu_register_gpiochip(struct omnia_mcu *mcu); +#else +static inline int omnia_mcu_register_gpiochip(struct omnia_mcu *mcu) +{ + return 0; +} +#endif #ifdef CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP extern const struct attribute_group omnia_mcu_poweroff_group; From a626ada4184b1888c1c5a4566071643f6e8081a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 30 Jul 2024 16:49:23 +0200 Subject: [PATCH 0575/1052] doc: platform: cznic: turris-omnia-mcu: Fix sphinx-build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix sphinx-build warnings ERROR: Unexpected indentation. WARNING: Block quote ends without a blank line; unexpected unindent in Documentation/ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu. 
Reported-by: Stephen Rothwell Link: https://lore.kernel.org/all/20240702174938.04c12aab@canb.auug.org.au/ Fixes: dfa556e45ae9 ("platform: cznic: turris-omnia-mcu: Add support for MCU connected GPIOs") Tested-by: Stephen Rothwell Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20240730144924.25552-2-kabel@kernel.org Signed-off-by: Arnd Bergmann --- .../ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu b/Documentation/ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu index 307a55f599cb..210a39043dc1 100644 --- a/Documentation/ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu +++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu @@ -32,9 +32,9 @@ Description: (RW) The front button on the Turris Omnia router can be interrupt. This file switches between these two modes: - - "mcu" makes the button press event be handled by the MCU to - change the LEDs panel intensity. - - "cpu" makes the button press event be handled by the CPU. + - "mcu" makes the button press event be handled by the MCU to + change the LEDs panel intensity. + - "cpu" makes the button press event be handled by the CPU. Format: %s. From e1793fea0350330a6a50721ecb2ad66846e0c51e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 30 Jul 2024 16:49:24 +0200 Subject: [PATCH 0576/1052] doc: platform: cznic: turris-omnia-mcu: Use double backticks for attribute value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use double backticks instead of quotes for sysfs attribute value. This makes sphinx generate the "mcu" and "cpu" values in monospace when rendering to HTML. Fixes: dfa556e45ae9 ("platform: cznic: turris-omnia-mcu: Add support for MCU connected GPIOs") Tested-by: Stephen Rothwell Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20240730144924.25552-3-kabel@kernel.org Signed-off-by: Arnd Bergmann --- .../ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu b/Documentation/ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu index 210a39043dc1..35a8f6dae5bf 100644 --- a/Documentation/ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu +++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-turris-omnia-mcu @@ -32,9 +32,9 @@ Description: (RW) The front button on the Turris Omnia router can be interrupt. This file switches between these two modes: - - "mcu" makes the button press event be handled by the MCU to + - ``mcu`` makes the button press event be handled by the MCU to change the LEDs panel intensity. - - "cpu" makes the button press event be handled by the CPU. + - ``cpu`` makes the button press event be handled by the CPU. Format: %s. From cddaac0459c004c439510bd109929466b0d5908e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 6 Aug 2024 09:12:58 -0700 Subject: [PATCH 0577/1052] ARM: pxa/gumstix: fix attaching properties to vbus gpio device Commit f1d6588af93b tried to convert GPIO lookup tables to software properties for the vbus gpio device, bit forgot the most important step: actually attaching the new properties to the device. Also fix up the name of the property array to reflect the board name, and add missing gpio/property.h and devices.h includes absence of which causes compile failures on some configurations. 
Switch "#ifdef CONFIG_USB_PXA25X" to "#if IS_ENABLED(CONFIG_USB_PXA25X)" because it should not matter if the driver is buolt in or a module, it still need vbus controls. Reported-by: Arnd Bergmann Fixes: f1d6588af93b ("ARM: pxa/gumstix: convert vbus gpio to use software nodes") Signed-off-by: Dmitry Torokhov Signed-off-by: Arnd Bergmann --- arch/arm/mach-pxa/gumstix.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-pxa/gumstix.c b/arch/arm/mach-pxa/gumstix.c index efa6faa62a2c..1713bdf3b71e 100644 --- a/arch/arm/mach-pxa/gumstix.c +++ b/arch/arm/mach-pxa/gumstix.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include "udc.h" #include "gumstix.h" +#include "devices.h" #include "generic.h" @@ -99,8 +101,8 @@ static void __init gumstix_mmc_init(void) } #endif -#ifdef CONFIG_USB_PXA25X -static const struct property_entry spitz_mci_props[] __initconst = { +#if IS_ENABLED(CONFIG_USB_PXA25X) +static const struct property_entry gumstix_vbus_props[] __initconst = { PROPERTY_ENTRY_GPIO("vbus-gpios", &pxa2xx_gpiochip_node, GPIO_GUMSTIX_USB_GPIOn, GPIO_ACTIVE_HIGH), PROPERTY_ENTRY_GPIO("pullup-gpios", &pxa2xx_gpiochip_node, @@ -109,8 +111,9 @@ static const struct property_entry spitz_mci_props[] __initconst = { }; static const struct platform_device_info gumstix_gpio_vbus_info __initconst = { - .name = "gpio-vbus", - .id = PLATFORM_DEVID_NONE, + .name = "gpio-vbus", + .id = PLATFORM_DEVID_NONE, + .properties = gumstix_vbus_props, }; static void __init gumstix_udc_init(void) From c48b5a4cf3125adb679e28ef093f66ff81368d05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 6 Aug 2024 20:48:43 +0200 Subject: [PATCH 0578/1052] x86/mm: Fix PTI for i386 some more So it turns out that we have to do two passes of pti_clone_entry_text(), once before initcalls, such that device and late initcalls can use user-mode-helper / modprobe and once after free_initmem() / mark_readonly(). Now obviously mark_readonly() can cause PMD splits, and pti_clone_pgtable() doesn't like that much. Allow the late clone to split PMDs so that pagetables stay in sync. [peterz: Changelog and comments] Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Guenter Roeck Link: https://lkml.kernel.org/r/20240806184843.GX37996@noisy.programming.kicks-ass.net --- arch/x86/mm/pti.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index bfdf5f45b137..851ec8f1363a 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -241,7 +241,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) * * Returns a pointer to a PTE on success, or NULL on failure. */ -static pte_t *pti_user_pagetable_walk_pte(unsigned long address) +static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); pmd_t *pmd; @@ -251,10 +251,15 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address) if (!pmd) return NULL; - /* We can't do anything sensible if we hit a large mapping. 
*/ + /* Large PMD mapping found */ if (pmd_leaf(*pmd)) { - WARN_ON(1); - return NULL; + /* Clear the PMD if we hit a large mapping from the first round */ + if (late_text) { + set_pmd(pmd, __pmd(0)); + } else { + WARN_ON_ONCE(1); + return NULL; + } } if (pmd_none(*pmd)) { @@ -283,7 +288,7 @@ static void __init pti_setup_vsyscall(void) if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) return; - target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false); if (WARN_ON(!target_pte)) return; @@ -301,7 +306,7 @@ enum pti_clone_level { static void pti_clone_pgtable(unsigned long start, unsigned long end, - enum pti_clone_level level) + enum pti_clone_level level, bool late_text) { unsigned long addr; @@ -390,7 +395,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end, return; /* Allocate PTE in the user page-table */ - target_pte = pti_user_pagetable_walk_pte(addr); + target_pte = pti_user_pagetable_walk_pte(addr, late_text); if (WARN_ON(!target_pte)) return; @@ -452,7 +457,7 @@ static void __init pti_clone_user_shared(void) phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); pte_t *target_pte; - target_pte = pti_user_pagetable_walk_pte(va); + target_pte = pti_user_pagetable_walk_pte(va, false); if (WARN_ON(!target_pte)) return; @@ -475,7 +480,7 @@ static void __init pti_clone_user_shared(void) start = CPU_ENTRY_AREA_BASE; end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); - pti_clone_pgtable(start, end, PTI_CLONE_PMD); + pti_clone_pgtable(start, end, PTI_CLONE_PMD, false); } #endif /* CONFIG_X86_64 */ @@ -492,11 +497,11 @@ static void __init pti_setup_espfix64(void) /* * Clone the populated PMDs of the entry text and force it RO. */ -static void pti_clone_entry_text(void) +static void pti_clone_entry_text(bool late) { pti_clone_pgtable((unsigned long) __entry_text_start, (unsigned long) __entry_text_end, - PTI_LEVEL_KERNEL_IMAGE); + PTI_LEVEL_KERNEL_IMAGE, late); } /* @@ -571,7 +576,7 @@ static void pti_clone_kernel_text(void) * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ - pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE); + pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false); /* * pti_clone_pgtable() will set the global bit in any PMDs @@ -638,8 +643,15 @@ void __init pti_init(void) /* Undo all global bits from the init pagetables in head_64.S: */ pti_set_kernel_image_nonglobal(); + /* Replace some of the global bits just for shared entry text: */ - pti_clone_entry_text(); + /* + * This is very early in boot. Device and Late initcalls can do + * modprobe before free_initmem() and mark_readonly(). This + * pti_clone_entry_text() allows those user-mode-helpers to function, + * but notably the text is still RW. + */ + pti_clone_entry_text(false); pti_setup_espfix64(); pti_setup_vsyscall(); } @@ -656,10 +668,11 @@ void pti_finalize(void) if (!boot_cpu_has(X86_FEATURE_PTI)) return; /* - * We need to clone everything (again) that maps parts of the - * kernel image. + * This is after free_initmem() (all initcalls are done) and we've done + * mark_readonly(). Text is now NX which might've split some PMDs + * relative to the early clone. 
*/ - pti_clone_entry_text(); + pti_clone_entry_text(true); pti_clone_kernel_text(); debug_checkwx_user(); From edbbaae42a56f9a2b39c52ef2504dfb3fb0a7858 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Aug 2024 10:20:44 +0300 Subject: [PATCH 0579/1052] genirq/irqdesc: Honor caller provided affinity in alloc_desc() Currently, whenever a caller is providing an affinity hint for an interrupt, the allocation code uses it to calculate the node and copies the cpumask into irq_desc::affinity. If the affinity for the interrupt is not marked 'managed' then the startup of the interrupt ignores irq_desc::affinity and uses the system default affinity mask. Prevent this by setting the IRQD_AFFINITY_SET flag for the interrupt in the allocator, which causes irq_setup_affinity() to use irq_desc::affinity on interrupt startup if the mask contains an online CPU. [ tglx: Massaged changelog ] Fixes: 45ddcecbfa94 ("genirq: Use affinity hint in irqdesc allocation") Signed-off-by: Shay Drory Signed-off-by: Thomas Gleixner Cc: Link: https://lore.kernel.org/all/20240806072044.837827-1-shayd@nvidia.com --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 07e99c936ba5..1dee88ba0ae4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -530,6 +530,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN; } + flags |= IRQD_AFFINITY_SET; mask = &affinity->mask; node = cpu_to_node(cpumask_first(mask)); affinity++; From f1cb9d5aefba07fc52b06b7bd5fdcd9ef91157b4 Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Tue, 6 Aug 2024 19:20:11 +0300 Subject: [PATCH 0580/1052] wifi: rtlwifi: rtl8192du: Initialise value32 in _rtl92du_init_queue_reserved_page GCC complains: In file included from include/linux/ieee80211.h:21, from include/net/mac80211.h:20, from drivers/net/wireless/realtek/rtlwifi/rtl8192du/../wifi.h:14, from drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c:4: In function 'u32p_replace_bits', inlined from '_rtl92du_init_queue_reserved_page.isra' at drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c:225:2: >> include/linux/bitfield.h:189:18: warning: 'value32' is used uninitialized [-Wuninitialized] Part of the variable is indeed left uninitialised. 
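To make the warning concrete, here is a stripped-down sketch of the pattern involved (this is not the driver function; the QUEUE_* masks and the helper name build_queue_reg are invented for illustration). u32p_replace_bits() only writes the bits selected by the mask, so whatever garbage the untouched bits of an uninitialized stack variable happen to hold would end up in the final register value unless the variable starts from zero:

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

#define QUEUE_HI        GENMASK(7, 0)   /* made-up field masks for the sketch */
#define QUEUE_LO        GENMASK(15, 8)

static u32 build_queue_reg(u32 numhq, u32 numlq)
{
        u32 value32 = 0;        /* the fix: start from a known value */

        /* read-modify-write: bits outside each mask are preserved as-is */
        u32p_replace_bits(&value32, numhq, QUEUE_HI);
        u32p_replace_bits(&value32, numlq, QUEUE_LO);
        return value32;
}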
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408062100.DWhN0CYH-lkp@intel.com/ Fixes: e769c67105d3 ("wifi: rtlwifi: Add rtl8192du/hw.{c,h}") Signed-off-by: Bitterblue Smith Acked-by: Ping-Ke Shih Signed-off-by: Kalle Valo Link: https://patch.msgid.link/2a808244-93d0-492c-b304-ae1974df5df9@gmail.com --- drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c index 700c6e2bcad1..ff458fb8514d 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c @@ -181,11 +181,11 @@ static void _rtl92du_init_queue_reserved_page(struct ieee80211_hw *hw, struct rtl_hal *rtlhal = rtl_hal(rtlpriv); u32 txqpagenum, txqpageunit; u32 txqremainingpage; + u32 value32 = 0; u32 numhq = 0; u32 numlq = 0; u32 numnq = 0; u32 numpubq; - u32 value32; if (rtlhal->macphymode != SINGLEMAC_SINGLEPHY) { numpubq = NORMAL_PAGE_NUM_PUBQ_92D_DUAL_MAC; From 25a7123579ecac9a89a7e5b8d8a580bee4b68acd Mon Sep 17 00:00:00 2001 From: Grzegorz Nitka Date: Mon, 15 Jul 2024 17:39:10 +0200 Subject: [PATCH 0581/1052] ice: Fix reset handler Synchronize OICR IRQ when preparing for reset to avoid potential race conditions between the reset procedure and OICR Fixes: 4aad5335969f ("ice: add individual interrupt allocation") Signed-off-by: Grzegorz Nitka Signed-off-by: Sergey Temerkhanov Reviewed-by: Przemek Kitszel Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 3de020020bc4..6f97ed471fe9 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -559,6 +559,8 @@ ice_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type) if (test_bit(ICE_PREPARED_FOR_RESET, pf->state)) return; + synchronize_irq(pf->oicr_irq.virq); + ice_unplug_aux_dev(pf); /* Notify VFs of impending reset */ From bca515d58367494d8699ab53c645b57b71fb4785 Mon Sep 17 00:00:00 2001 From: Grzegorz Nitka Date: Mon, 15 Jul 2024 17:39:11 +0200 Subject: [PATCH 0582/1052] ice: Skip PTP HW writes during PTP reset procedure Block HW write access for the driver while the device is in reset to avoid potential race condition and access to the PTP HW in non-nominal state which could lead to undesired effects Fixes: 4aad5335969f ("ice: add individual interrupt allocation") Signed-off-by: Grzegorz Nitka Co-developed-by: Karol Kolacinski Signed-off-by: Karol Kolacinski Signed-off-by: Sergey Temerkhanov Reviewed-by: Przemek Kitszel Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ptp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index e2786cc13286..ef2e858f49bb 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1477,6 +1477,10 @@ void ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup) /* Update cached link status for this port immediately */ ptp_port->link_up = linkup; + /* Skip HW writes if reset is in progress */ + if (pf->hw.reset_ongoing) + return; + switch (hw->ptp.phy_model) { case ICE_PHY_E810: /* Do not 
reconfigure E810 PHY */ From c181da18a7302c5de510fe975a3a333299c6e4b7 Mon Sep 17 00:00:00 2001 From: Mateusz Polchlopek Date: Fri, 26 Jul 2024 06:19:28 -0400 Subject: [PATCH 0583/1052] ice: Fix incorrect assigns of FEC counts Commit ac21add2540e ("ice: Implement driver functionality to dump fec statistics") introduces obtaining FEC correctable and uncorrectable stats per netdev in ICE driver. Unfortunately the assignment of values to fec_stats structure has been done incorrectly. This commit fixes the assignments. Fixes: ac21add2540e ("ice: Implement driver functionality to dump fec statistics") Reviewed-by: Wojciech Drewek Signed-off-by: Mateusz Polchlopek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 8c990c976132..bc79ba974e49 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -4673,10 +4673,10 @@ static int ice_get_port_fec_stats(struct ice_hw *hw, u16 pcs_quad, u16 pcs_port, if (err) return err; - fec_stats->uncorrectable_blocks.total = (fec_corr_high_val << 16) + - fec_corr_low_val; - fec_stats->corrected_blocks.total = (fec_uncorr_high_val << 16) + - fec_uncorr_low_val; + fec_stats->corrected_blocks.total = (fec_corr_high_val << 16) + + fec_corr_low_val; + fec_stats->uncorrectable_blocks.total = (fec_uncorr_high_val << 16) + + fec_uncorr_low_val; return 0; } From 568901e709d7fa564dfdc75816ea59fec65d20a0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 0584/1052] tools/include: Sync uapi/asm-generic/unistd.h with the kernel sources And arch syscall tables to pick up changes from: b1e31c134a8a powerpc: restore some missing spu syscalls d3882564a77c syscalls: fix compat_sys_io_pgetevents_time64 usage 54233a425403 uretprobe: change syscall number, again 63ded110979b uprobe: Change uretprobe syscall scope and number 9142be9e6443 x86/syscall: Mark exit[_group] syscall handlers __noreturn 9aae1baa1c5d x86, arm: Add missing license tag to syscall tables files 5c28424e9a34 syscalls: Fix to add sys_uretprobe to syscall.tbl 190fec72df4a uprobe: Wire up uretprobe system call This should be used to beautify syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl Please see tools/include/uapi/README for details (it's in the first patch of this series). 
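For context on the io_pgetevents_time64 hunk below: asm-generic/unistd.h generates both the native and the compat syscall tables from the same list, and only an __SC_COMP() entry can route compat tasks to a compat handler. Roughly, paraphrased from memory of the header (the authoritative definitions live in include/uapi/asm-generic/unistd.h):

/* the compat table build defines __SYSCALL_COMPAT before including the header */
#ifdef __SYSCALL_COMPAT
#define __SC_COMP(_nr, _sys, _comp)     __SYSCALL(_nr, _comp)
#else
#define __SC_COMP(_nr, _sys, _comp)     __SYSCALL(_nr, _sys)
#endif

so switching __NR_io_pgetevents_time64 from __SYSCALL() to __SC_COMP() is what lets 32-bit (compat) callers reach compat_sys_io_pgetevents_time64 instead of the native handler, as done by upstream commit d3882564a77c listed above.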
Cc: Arnd Bergmann Cc: linux-arch@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/asm-generic/unistd.h | 2 +- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 6 +++++- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 8 +++++--- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index a00d53d02723..5bf6148cac2b 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -737,7 +737,7 @@ __SC_COMP(__NR_pselect6_time64, sys_pselect6, compat_sys_pselect6_time64) #define __NR_ppoll_time64 414 __SC_COMP(__NR_ppoll_time64, sys_ppoll, compat_sys_ppoll_time64) #define __NR_io_pgetevents_time64 416 -__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents) +__SC_COMP(__NR_io_pgetevents_time64, sys_io_pgetevents, compat_sys_io_pgetevents_time64) #define __NR_recvmmsg_time64 417 __SC_COMP(__NR_recvmmsg_time64, sys_recvmmsg, compat_sys_recvmmsg_time64) #define __NR_mq_timedsend_time64 418 diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 3656f1ca7a21..ebae8415dfbb 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -230,8 +230,10 @@ 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 179 32 pread64 sys_ppc_pread64 compat_sys_ppc_pread64 179 64 pread64 sys_pread64 +179 spu pread64 sys_pread64 180 32 pwrite64 sys_ppc_pwrite64 compat_sys_ppc_pwrite64 180 64 pwrite64 sys_pwrite64 +180 spu pwrite64 sys_pwrite64 181 common chown sys_chown 182 common getcwd sys_getcwd 183 common capget sys_capget @@ -246,6 +248,7 @@ 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit 191 32 readahead sys_ppc_readahead compat_sys_ppc_readahead 191 64 readahead sys_readahead +191 spu readahead sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 193 32 truncate64 sys_ppc_truncate64 compat_sys_ppc_truncate64 194 32 ftruncate64 sys_ppc_ftruncate64 compat_sys_ppc_ftruncate64 @@ -293,6 +296,7 @@ 232 nospu set_tid_address sys_set_tid_address 233 32 fadvise64 sys_ppc32_fadvise64 compat_sys_ppc32_fadvise64 233 64 fadvise64 sys_fadvise64 +233 spu fadvise64 sys_fadvise64 234 nospu exit_group sys_exit_group 235 nospu lookup_dcookie sys_ni_syscall 236 common epoll_create sys_epoll_create @@ -502,7 +506,7 @@ 412 32 utimensat_time64 sys_utimensat sys_utimensat 413 32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 414 32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +416 32 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend 419 32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index bd0fee24ad10..01071182763e 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -418,7 +418,7 @@ 412 32 utimensat_time64 - sys_utimensat 413 32 pselect6_time64 - compat_sys_pselect6_time64 414 32 ppoll_time64 - compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 - sys_io_pgetevents +416 32 io_pgetevents_time64 - compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 - 
compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 - sys_mq_timedsend 419 32 mq_timedreceive_time64 - sys_mq_timedreceive diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index a396f6e6ab5b..7093ee21c0d1 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # # 64-bit system call numbers and entry vectors # # The format is: -# +# [ [noreturn]] # # The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls # @@ -68,7 +69,7 @@ 57 common fork sys_fork 58 common vfork sys_vfork 59 64 execve sys_execve -60 common exit sys_exit +60 common exit sys_exit - noreturn 61 common wait4 sys_wait4 62 common kill sys_kill 63 common uname sys_newuname @@ -239,7 +240,7 @@ 228 common clock_gettime sys_clock_gettime 229 common clock_getres sys_clock_getres 230 common clock_nanosleep sys_clock_nanosleep -231 common exit_group sys_exit_group +231 common exit_group sys_exit_group - noreturn 232 common epoll_wait sys_epoll_wait 233 common epoll_ctl sys_epoll_ctl 234 common tgkill sys_tgkill @@ -343,6 +344,7 @@ 332 common statx sys_statx 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq +335 common uretprobe sys_uretprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal From ed86525f1f4b738bae75c73e89f25430bd0af1b0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 0585/1052] tools/include: Sync network socket headers with the kernel sources To pick up changes from: d25a92ccae6b net/smc: Introduce IPPROTO_SMC 060f4ba6e403 io_uring/net: move charging socket out of zc io_uring bb6aaf736680 net: Split a __sys_listen helper for io_uring dc2e77979412 net: Split a __sys_bind helper for io_uring This should be used to beautify socket syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/in.h include/uapi/linux/in.h diff -u tools/perf/trace/beauty/include/linux/socket.h include/linux/socket.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: "David S. 
Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/in.h | 2 ++ tools/perf/trace/beauty/include/linux/socket.h | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index e682ab628dfa..d358add1611c 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -81,6 +81,8 @@ enum { #define IPPROTO_ETHERNET IPPROTO_ETHERNET IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW + IPPROTO_SMC = 256, /* Shared Memory Communications */ +#define IPPROTO_SMC IPPROTO_SMC IPPROTO_MPTCP = 262, /* Multipath TCP connection */ #define IPPROTO_MPTCP IPPROTO_MPTCP IPPROTO_MAX diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 89d16b90370b..df9cdb8bbfb8 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -76,7 +76,7 @@ struct msghdr { __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ struct ubuf_info *msg_ubuf; - int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, + int (*sg_from_iter)(struct sk_buff *skb, struct iov_iter *from, size_t length); }; @@ -442,11 +442,14 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); +extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, + int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); +extern int __sys_listen_socket(struct socket *sock, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, From 845295f4004c7e1591bab4bad01b51f37d32272f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 0586/1052] tools/include: Sync filesystem headers with the kernel sources To pick up changes from: 0f9ca80fa4f9 fs: Add initial atomic write support info to statx f9af549d1fd3 fs: export mount options via statmount() 0a3deb11858a fs: Allow listmount() in foreign mount namespace 09b31295f833 fs: export the mount ns id via statmount d04bccd8c19d listmount: allow listing in reverse order bfc69fd05ef9 fs/procfs: add build ID fetching to PROCMAP_QUERY API ed5d583a88a9 fs/procfs: implement efficient VMA querying API for /proc//maps This should be used to beautify FS syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/stat.h include/uapi/linux/stat.h diff -u tools/perf/trace/beauty/include/uapi/linux/fs.h include/uapi/linux/fs.h diff -u tools/perf/trace/beauty/include/uapi/linux/mount.h include/uapi/linux/mount.h diff -u tools/perf/trace/beauty/include/uapi/linux/stat.h include/uapi/linux/stat.h Please see tools/include/uapi/README for details (it's in the first patch of this series). 
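As a reference for how the new statx fields synced below are meant to be consumed, a small userspace sketch (not part of this patch): it assumes a libc new enough to provide the statx() wrapper and headers that already define STATX_WRITE_ATOMIC and the stx_atomic_write_* fields, otherwise the constants would have to come from the updated uapi header directly; "somefile" is just a placeholder path.

#define _GNU_SOURCE
#include <fcntl.h>              /* AT_FDCWD */
#include <stdio.h>
#include <sys/stat.h>           /* statx(), struct statx, STATX_* */

int main(void)
{
        struct statx stx;

        /* request only the atomic-write information */
        if (statx(AT_FDCWD, "somefile", 0, STATX_WRITE_ATOMIC, &stx) != 0) {
                perror("statx");
                return 1;
        }

        if (stx.stx_mask & STATX_WRITE_ATOMIC)
                printf("atomic write unit: min %u, max %u, max segments %u\n",
                       stx.stx_atomic_write_unit_min,
                       stx.stx_atomic_write_unit_max,
                       stx.stx_atomic_write_segments_max);
        else
                printf("no atomic write limits reported\n");
        return 0;
}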
Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/stat.h | 12 +- .../perf/trace/beauty/include/uapi/linux/fs.h | 163 +++++++++++++++++- .../trace/beauty/include/uapi/linux/mount.h | 10 +- .../trace/beauty/include/uapi/linux/stat.h | 12 +- 4 files changed, 189 insertions(+), 8 deletions(-) diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index 67626d535316..887a25286441 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -126,9 +126,15 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ - __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ - __u64 __spare3[11]; /* Spare space for future expansion */ + __u64 stx_subvol; /* Subvolume identifier */ + __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* 0xb0 */ + __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */ + __u32 __spare1[1]; + /* 0xb8 */ + __u64 __spare3[9]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -157,6 +163,7 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ +#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -192,6 +199,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h index 45e4e64fd664..753971770733 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fs.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -329,12 +329,17 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO negation of O_APPEND */ #define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) +/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + +#define PROCFS_IOCTL_MAGIC 'f' /* Pagemap ioctl */ -#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) +#define PAGEMAP_SCAN _IOWR(PROCFS_IOCTL_MAGIC, 16, struct pm_scan_arg) /* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ #define PAGE_IS_WPALLOWED (1 << 0) @@ -393,4 +398,158 @@ struct pm_scan_arg { __u64 return_mask; }; +/* /proc//maps ioctl */ +#define PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 17, struct procmap_query) + +enum procmap_query_flags { + /* + * VMA permission flags. + * + * Can be used as part of procmap_query.query_flags field to look up + * only VMAs satisfying specified subset of permissions. 
E.g., specifying + * PROCMAP_QUERY_VMA_READABLE only will return both readable and read/write VMAs, + * while having PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_WRITABLE will only + * return read/write VMAs, though both executable/non-executable and + * private/shared will be ignored. + * + * PROCMAP_QUERY_VMA_* flags are also returned in procmap_query.vma_flags + * field to specify actual VMA permissions. + */ + PROCMAP_QUERY_VMA_READABLE = 0x01, + PROCMAP_QUERY_VMA_WRITABLE = 0x02, + PROCMAP_QUERY_VMA_EXECUTABLE = 0x04, + PROCMAP_QUERY_VMA_SHARED = 0x08, + /* + * Query modifier flags. + * + * By default VMA that covers provided address is returned, or -ENOENT + * is returned. With PROCMAP_QUERY_COVERING_OR_NEXT_VMA flag set, closest + * VMA with vma_start > addr will be returned if no covering VMA is + * found. + * + * PROCMAP_QUERY_FILE_BACKED_VMA instructs query to consider only VMAs that + * have file backing. Can be combined with PROCMAP_QUERY_COVERING_OR_NEXT_VMA + * to iterate all VMAs with file backing. + */ + PROCMAP_QUERY_COVERING_OR_NEXT_VMA = 0x10, + PROCMAP_QUERY_FILE_BACKED_VMA = 0x20, +}; + +/* + * Input/output argument structured passed into ioctl() call. It can be used + * to query a set of VMAs (Virtual Memory Areas) of a process. + * + * Each field can be one of three kinds, marked in a short comment to the + * right of the field: + * - "in", input argument, user has to provide this value, kernel doesn't modify it; + * - "out", output argument, kernel sets this field with VMA data; + * - "in/out", input and output argument; user provides initial value (used + * to specify maximum allowable buffer size), and kernel sets it to actual + * amount of data written (or zero, if there is no data). + * + * If matching VMA is found (according to criterias specified by + * query_addr/query_flags, all the out fields are filled out, and ioctl() + * returns 0. If there is no matching VMA, -ENOENT will be returned. + * In case of any other error, negative error code other than -ENOENT is + * returned. + * + * Most of the data is similar to the one returned as text in /proc//maps + * file, but procmap_query provides more querying flexibility. There are no + * consistency guarantees between subsequent ioctl() calls, but data returned + * for matched VMA is self-consistent. + */ +struct procmap_query { + /* Query struct size, for backwards/forward compatibility */ + __u64 size; + /* + * Query flags, a combination of enum procmap_query_flags values. + * Defines query filtering and behavior, see enum procmap_query_flags. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_flags; /* in */ + /* + * Query address. By default, VMA that covers this address will + * be looked up. PROCMAP_QUERY_* flags above modify this default + * behavior further. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_addr; /* in */ + /* VMA starting (inclusive) and ending (exclusive) address, if VMA is found. */ + __u64 vma_start; /* out */ + __u64 vma_end; /* out */ + /* VMA permissions flags. A combination of PROCMAP_QUERY_VMA_* flags. */ + __u64 vma_flags; /* out */ + /* VMA backing page size granularity. */ + __u64 vma_page_size; /* out */ + /* + * VMA file offset. If VMA has file backing, this specifies offset + * within the file that VMA's start address corresponds to. + * Is set to zero if VMA has no backing file. + */ + __u64 vma_offset; /* out */ + /* Backing file's inode number, or zero, if VMA has no backing file. 
*/ + __u64 inode; /* out */ + /* Backing file's device major/minor number, or zero, if VMA has no backing file. */ + __u32 dev_major; /* out */ + __u32 dev_minor; /* out */ + /* + * If set to non-zero value, signals the request to return VMA name + * (i.e., VMA's backing file's absolute path, with " (deleted)" suffix + * appended, if file was unlinked from FS) for matched VMA. VMA name + * can also be some special name (e.g., "[heap]", "[stack]") or could + * be even user-supplied with prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME). + * + * Kernel will set this field to zero, if VMA has no associated name. + * Otherwise kernel will return actual amount of bytes filled in + * user-supplied buffer (see vma_name_addr field below), including the + * terminating zero. + * + * If VMA name is longer that user-supplied maximum buffer size, + * -E2BIG error is returned. + * + * If this field is set to non-zero value, vma_name_addr should point + * to valid user space memory buffer of at least vma_name_size bytes. + * If set to zero, vma_name_addr should be set to zero as well + */ + __u32 vma_name_size; /* in/out */ + /* + * If set to non-zero value, signals the request to extract and return + * VMA's backing file's build ID, if the backing file is an ELF file + * and it contains embedded build ID. + * + * Kernel will set this field to zero, if VMA has no backing file, + * backing file is not an ELF file, or ELF file has no build ID + * embedded. + * + * Build ID is a binary value (not a string). Kernel will set + * build_id_size field to exact number of bytes used for build ID. + * If build ID is requested and present, but needs more bytes than + * user-supplied maximum buffer size (see build_id_addr field below), + * -E2BIG error will be returned. + * + * If this field is set to non-zero value, build_id_addr should point + * to valid user space memory buffer of at least build_id_size bytes. + * If set to zero, build_id_addr should be set to zero as well + */ + __u32 build_id_size; /* in/out */ + /* + * User-supplied address of a buffer of at least vma_name_size bytes + * for kernel to fill with matched VMA's name (see vma_name_size field + * description above for details). + * + * Should be set to zero if VMA name should not be returned. + */ + __u64 vma_name_addr; /* in */ + /* + * User-supplied address of a buffer of at least build_id_size bytes + * for kernel to fill with matched VMA's ELF build ID, if available + * (see build_id_size field description above for details). + * + * Should be set to zero if build ID should not be returned. 
+ */ + __u64 build_id_addr; /* in */ +}; + #endif /* _UAPI_LINUX_FS_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/mount.h b/tools/perf/trace/beauty/include/uapi/linux/mount.h index ad5478dbad00..225bc366ffcb 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/mount.h +++ b/tools/perf/trace/beauty/include/uapi/linux/mount.h @@ -154,7 +154,7 @@ struct mount_attr { */ struct statmount { __u32 size; /* Total size, including strings */ - __u32 __spare1; + __u32 mnt_opts; /* [str] Mount options of the mount */ __u64 mask; /* What results were written */ __u32 sb_dev_major; /* Device ID */ __u32 sb_dev_minor; @@ -172,7 +172,8 @@ struct statmount { __u64 propagate_from; /* Propagation from in current namespace */ __u32 mnt_root; /* [str] Root of mount relative to root of fs */ __u32 mnt_point; /* [str] Mountpoint relative to current root */ - __u64 __spare2[50]; + __u64 mnt_ns_id; /* ID of the mount namespace */ + __u64 __spare2[49]; char str[]; /* Variable size part containing strings */ }; @@ -188,10 +189,12 @@ struct mnt_id_req { __u32 spare; __u64 mnt_id; __u64 param; + __u64 mnt_ns_id; }; /* List of all mnt_id_req versions. */ #define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ +#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ /* * @mask bits for statmount(2) @@ -202,10 +205,13 @@ struct mnt_id_req { #define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ #define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ +#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ +#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ /* * Special @mnt_id values that can be passed to listmount */ #define LSMT_ROOT 0xffffffffffffffff /* root mount */ +#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h index 67626d535316..887a25286441 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/stat.h +++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h @@ -126,9 +126,15 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ - __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ - __u64 __spare3[11]; /* Spare space for future expansion */ + __u64 stx_subvol; /* Subvolume identifier */ + __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* 0xb0 */ + __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */ + __u32 __spare1[1]; + /* 0xb8 */ + __u64 __spare3[9]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -157,6 +163,7 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ +#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -192,6 +199,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define 
STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ From f6d9883f8e680460be4714d4d35c7acac1dffeaf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 0587/1052] tools/include: Sync x86 headers with the kernel sources To pick up changes from: 149fd4712bcd perf/x86/intel: Support Perfmon MSRs aliasing 21b362cc762a x86/resctrl: Enable shared RMID mode on Sub-NUMA Cluster (SNC) systems 4f460bff7b6a cpufreq: acpi: move MSR_K7_HWCR_CPB_DIS_BIT into msr-index.h 7ea81936b853 x86/cpufeatures: Add HWP highest perf change feature flag 78ce84b9e0a5 x86/cpufeatures: Flip the /proc/cpuinfo appearance logic 1beb348d5c7f x86/sev: Provide SVSM discovery support This should be used to beautify x86 syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim --- tools/arch/x86/include/asm/cpufeatures.h | 803 ++++++++++++----------- tools/arch/x86/include/asm/msr-index.h | 11 + 2 files changed, 414 insertions(+), 400 deletions(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 3c7434329661..dd4682857c12 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -18,170 +18,170 @@ /* * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. + * in /proc/cpuinfo instead of the macro name. Otherwise, this feature + * bit is not displayed in /proc/cpuinfo at all. * * When adding new features here that depend on other features, * please update the table in kernel/cpu/cpuid-deps.c as well. 
*/ /* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* "fpu" Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* "vme" Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* "de" Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* "pse" Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* "tsc" Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* "msr" Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* "pae" Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* "mce" Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* "cx8" CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* "apic" Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* "sep" SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* "mtrr" Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* "pge" Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* "mca" Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* "cmov" CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* "pat" Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* "pse36" 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* "pn" Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* "clflush" CLFLUSH instruction */ #define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_ACPI ( 0*32+22) /* "acpi" ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* "mmx" Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* "fxsr" FXSAVE/FXRSTOR, CR4.OSFXSR */ #define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ #define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ #define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_HT ( 0*32+28) /* "ht" Hyper-Threading */ #define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 
0*32+31) /* Pending Break Enable */ +#define X86_FEATURE_IA64 ( 0*32+30) /* "ia64" IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* "pbe" Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* "syscall" SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* "mp" MP Capable */ +#define X86_FEATURE_NX ( 1*32+20) /* "nx" Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* "mmxext" AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* "fxsr_opt" FXSAVE/FXRSTOR optimizations */ #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* "rdtscp" RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* "lm" Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* "3dnowext" AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* "3dnow" 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* "recovery" CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* "longrun" Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* "lrti" LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ -#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ -#define 
X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ -#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* "cxmmx" Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* "k6_mtrr" AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* "cyrix_arr" Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */ +#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* "arch_perfmon" Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* "pebs" Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* "bts" Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* "rep_good" REP microcode works well */ +#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* "amd_lbr_v2" AMD Last Branch Record Extension Version 2 */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* Clear CPU buffers using VERW */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* "acc_power" AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL ( 3*32+20) /* "nopl" The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* "xtopology" CPU topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* "tsc_reliable" TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* "nonstop_tsc" TSC does not stop in C states */ +#define X86_FEATURE_CPUID ( 3*32+25) /* "cpuid" CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* "extd_apicid" Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* "amd_dcm" AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* "aperfmperf" P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +#define X86_FEATURE_RAPL ( 3*32+29) /* "rapl" AMD/Hygon RAPL interface */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* "nonstop_tsc_s3" TSC doesn't stop in S3 state */ +#define 
X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* "tsc_known_freq" TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* "pclmulqdq" PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* "dtes64" 64-bit Debug Store */ #define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ #define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* "vmx" Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* "smx" Safer Mode eXtensions */ +#define X86_FEATURE_EST ( 4*32+ 7) /* "est" Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* "tm2" Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* "ssse3" Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* "cid" Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* "sdbg" Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* "fma" Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* "cx16" CMPXCHG16B instruction */ +#define X86_FEATURE_XTPR ( 4*32+14) /* "xtpr" Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* "pdcm" Perf/Debug Capabilities MSR */ +#define X86_FEATURE_PCID ( 4*32+17) /* "pcid" Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* "dca" Direct Cache Access */ #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* "x2apic" X2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* "movbe" MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* "popcnt" POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* "tsc_deadline_timer" 
TSC deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* "aes" AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* "xsave" XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* XSAVE instruction enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* "avx" Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* "f16c" 16-bit FP conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* "rdrand" RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* "hypervisor" Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ #define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ #define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ #define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* "ace2" Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* "ace2_en" ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* "phe" PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* "phe_en" PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* "pmm" PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* "pmm_en" PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ -#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache 
performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* "lahf_lm" LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* "cmp_legacy" If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* "svm" Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* "extapic" Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* "cr8_legacy" CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* "abm" Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* "sse4a" SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* "misalignsse" Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* "3dnowprefetch" 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* "osvw" OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* "ibs" Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* "xop" Extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* "skinit" SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* "wdt" Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* "lwp" Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* "fma4" 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* "tce" Translation Cache Extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* "nodeid_msr" NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* "tbm" Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* "topoext" Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* "perfctr_core" Core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* "perfctr_nb" NB performance counter extensions */ +#define X86_FEATURE_BPEXT ( 6*32+26) /* "bpext" Data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* "ptsc" Performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* "perfctr_llc" Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* "mwaitx" MWAIT extension (MONITORX/MWAITX instructions) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -189,93 +189,93 @@ * * Reuse free bits when adding new feature flags! 
*/ -#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ -#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ -#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* Platform supports being a TDX host */ -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ -#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ -#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ -#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ -#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ -#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ -#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ -#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ -#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. 
*/ -#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */ -#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ -#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ -#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ -#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ -#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* "ring3mwait" Ring 3 MONITOR/MWAIT instructions */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* "cpuid_fault" Intel CPUID faulting */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* "cpb" AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* "epb" IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* "cat_l3" Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* "cat_l2" Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* "cdp_l3" Code and Data Prioritization L3 */ +#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* "tdx_host_platform" Platform supports being a TDX host */ +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* "hw_pstate" AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* "proc_feedback" AMD ProcFeedbackInterface */ +#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* Use compacted XSTATE (XSAVES or XSAVEC) */ +#define X86_FEATURE_PTI ( 7*32+11) /* "pti" Kernel Page Table Isolation enabled */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* Fill RSB on VM-Exit */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* "intel_ppin" Intel Processor Inventory Number */ +#define X86_FEATURE_CDP_L2 ( 7*32+15) /* "cdp_l2" Code and Data Prioritization L2 */ +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* MSR SPEC_CTRL is implemented */ +#define X86_FEATURE_SSBD ( 7*32+17) /* "ssbd" Speculative Store Bypass Disable */ +#define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */ +#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. 
*/ +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ +#define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */ +#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier */ +#define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */ +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* "ibrs_enhanced" Enhanced IBRS */ +#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* MSR IA32_FEAT_CTL configured */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* "tpr_shadow" Intel TPR Shadow */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */ -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ -#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ -#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ -#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ -#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ -#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */ +#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */ +#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */ +#define X86_FEATURE_EPT_AD ( 8*32+17) /* "ept_ad" Intel Extended Page Table access-dirty bit */ +#define X86_FEATURE_VMCALL ( 8*32+18) /* Hypervisor supports the VMCALL instruction */ +#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* PV vcpu_is_preempted function */ +#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* "tdx_guest" Intel Trust Domain Extensions Guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ -#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* 
Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ -#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* "fsgsbase" RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* "tsc_adjust" TSC adjustment MSR 0x3B */ +#define X86_FEATURE_SGX ( 9*32+ 2) /* "sgx" Software Guard Extensions */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* "bmi1" 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* "hle" Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* "avx2" AVX2 instructions */ +#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* FPU data pointer updated only on x87 exceptions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* "smep" Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* "bmi2" 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* "erms" Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* "invpcid" Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* "rtm" Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* "cqm" Cache QoS Monitoring */ +#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* Zero out FPU CS and FPU DS */ +#define X86_FEATURE_MPX ( 9*32+14) /* "mpx" Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* "rdt_a" Resource Director Technology Allocation */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* "avx512f" AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* "avx512dq" AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* "rdseed" RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* "adx" ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* "smap" Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* "avx512ifma" AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* "clflushopt" CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* 
"clwb" CLWB instruction */ +#define X86_FEATURE_INTEL_PT ( 9*32+25) /* "intel_pt" Intel Processor Trace */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* "avx512pf" AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* "avx512er" AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* "avx512cd" AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* "sha_ni" SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* "avx512bw" AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* "avx512vl" AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* "xsaveopt" XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* "xsavec" XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* "xgetbv1" XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* "xsaves" XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -283,181 +283,183 @@ * * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ -#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ -#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ -#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ -#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ -#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ -#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ -#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ -#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ -#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ -#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ -#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ -#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ -#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ -#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ -#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ -#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ -#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ -#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ -#define 
X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */ -#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ -#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ -#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ -#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ -#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ -#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ -#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ -#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* "cqm_llc" LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* "cqm_occup_llc" LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* "cqm_mbm_total" LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* "cqm_mbm_local" LLC Local MBM monitoring */ +#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* LFENCE in user entry SWAPGS path */ +#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* "split_lock_detect" #AC for split lock */ +#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* Issue an IBPB on kernel entry */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RETHUNK (11*32+14) /* Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* AMD BTB untrain return */ +#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* Use IBPB during runtime firmware calls */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* SGX EDECCSSA user leaf function */ +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* Call depth tracking for RSB stuffing */ +#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* MSR IA32_TSX_CTRL (Intel) implemented */ +#define X86_FEATURE_SMBA (11*32+21) /* Slow Memory Bandwidth Allocation */ +#define X86_FEATURE_BMEC (11*32+22) /* Bandwidth Monitoring Event Configuration */ +#define X86_FEATURE_USER_SHSTK (11*32+23) /* "user_shstk" Shadow stack support for user mode applications */ +#define X86_FEATURE_SRSO (11*32+24) /* AMD BTB untrain RETs */ +#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* AMD BTB untrain RETs through aliasing */ +#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* Issue an IBPB only on VMEXIT */ +#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ +#define X86_FEATURE_ZEN2 (11*32+28) /* CPU based on Zen2 microarchitecture */ +#define X86_FEATURE_ZEN3 (11*32+29) /* CPU based on Zen3 microarchitecture */ +#define X86_FEATURE_ZEN4 (11*32+30) /* CPU based on Zen4 microarchitecture */ +#define X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ -#define X86_FEATURE_AVX_VNNI 
(12*32+ 4) /* AVX VNNI instructions */ -#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ -#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ -#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */ -#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ -#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ -#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ -#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ -#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ -#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ -#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ -#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ -#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */ +#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */ +#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */ +#define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */ +#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */ +#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */ +#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */ +#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ +#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ +#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ -#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -#define X86_FEATURE_RDPRU (13*32+ 4) /* Read processor register at user level */ -#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ -#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ -#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ -#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */ -#define X86_FEATURE_AMD_PPIN (13*32+23) /* Protected Processor Inventory Number */ -#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ -#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ -#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. 
*/ -#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ -#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */ -#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ -#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ +#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ +#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ +#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ +#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ +#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ +#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* Speculative Store Bypass is fixed in hardware. */ +#define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */ +#define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ +#define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ -#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* "dtherm" Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* "ida" Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* "arat" Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* "pln" Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* "pts" Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* "hwp" Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* "hwp_notify" HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* "hwp_act_window" HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* "hwp_epp" HWP Energy Perf. 
Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* "hwp_pkg_req" HWP Package Level Request */ +#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */ +#define X86_FEATURE_HFI (14*32+19) /* "hfi" Hardware Feedback Interface */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_NPT (15*32+ 0) /* "npt" Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* "lbrv" LBR Virtualization support */ #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ #define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ #define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ #define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ -#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ -#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ -#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */ -#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* "flushbyasid" Flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* "decodeassists" Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* "pausefilter" Filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* "pfthreshold" Pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */ +#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ +#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ +#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ -#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ -#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ -#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */ -#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ -#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ -#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ -#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for 
VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ -#define X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */ -#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ -#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ -#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ -#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ -#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ -#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ -#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* "avx512vbmi" AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* "umip" User Mode Instruction Protection */ +#define X86_FEATURE_PKU (16*32+ 3) /* "pku" Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* "ospke" OS Protection Keys Enable */ +#define X86_FEATURE_WAITPKG (16*32+ 5) /* "waitpkg" UMONITOR/UMWAIT/TPAUSE Instructions */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* "avx512_vbmi2" Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow stack */ +#define X86_FEATURE_GFNI (16*32+ 8) /* "gfni" Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* "vaes" Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* "vpclmulqdq" Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+11) /* "avx512_vnni" Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */ +#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */ +#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */ +#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */ +#define X86_FEATURE_MOVDIRI (16*32+27) /* "movdiri" MOVDIRI instruction */ +#define X86_FEATURE_MOVDIR64B (16*32+28) /* "movdir64b" MOVDIR64B instruction */ +#define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ +#define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ -#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ -#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ -#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ -#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov 
*/ -#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ -#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ -#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ -#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */ -#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ -#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ -#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ -#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ -#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ -#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ -#define X86_FEATURE_IBT (18*32+20) /* Indirect Branch Tracking */ -#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ -#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ -#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ -#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ -#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ -#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ -#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ -#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ -#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* "avx512_4vnniw" AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* "avx512_4fmaps" AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_FSRM (18*32+ 4) /* "fsrm" Fast Short Rep Mov */ +#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* "avx512_vp2intersect" AVX-512 Intersect for D/Q */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* SRBDS mitigation MSR available */ +#define X86_FEATURE_MD_CLEAR (18*32+10) /* "md_clear" VERW clears CPU buffers */ +#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* RTM transaction always aborts */ +#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* TSX_FORCE_ABORT */ +#define X86_FEATURE_SERIALIZE (18*32+14) /* "serialize" SERIALIZE instruction */ +#define X86_FEATURE_HYBRID_CPU (18*32+15) /* This part has CPUs of more than one type */ +#define X86_FEATURE_TSXLDTRK (18*32+16) /* "tsxldtrk" TSX Suspend Load Address Tracking */ +#define X86_FEATURE_PCONFIG (18*32+18) /* "pconfig" Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* "arch_lbr" Intel ARCH LBR */ +#define X86_FEATURE_IBT (18*32+20) /* "ibt" Indirect Branch Tracking */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* "amx_bf16" AMX bf16 Support */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* "avx512_fp16" AVX512 FP16 */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* "amx_tile" AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* "amx_int8" AMX int8 Support */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* "flush_l1d" Flush L1D cache */ +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* "arch_capabilities" IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* IA32_CORE_CAPABILITIES MSR */ +#define 
X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */ /* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ -#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ -#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ -#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ -#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */ -#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ -#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ -#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_SME (19*32+ 0) /* "sme" AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* "sev" AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" AMD Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ -#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ -#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ -#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */ -#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ -#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ -#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */ +#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ +#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ +#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */ +#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */ +#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ +#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ -#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */ -#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ -#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ +#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ +#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ +#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -465,59 +467,60 @@ * * Reuse free bits when adding new feature flags! 
*/ -#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ -#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ -#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ -#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ -#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* "amd_lbr_pmc_freeze" AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ /* * BUG word(s) */ #define X86_BUG(x) (NCAPINTS*32 + (x)) -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_F00F X86_BUG(0) /* "f00f" Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* "fdiv" FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* "coma" Cyrix 6x86 coma */ #define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ #define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_11AP X86_BUG(5) /* "11ap" Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* "fxsave_leak" FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* "clflush_monitor" AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* "sysret_ss_attrs" SYSRET doesn't fix up SS attrs */ #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional * to avoid confusion. 
*/ -#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#define X86_BUG_ESPFIX X86_BUG(9) /* IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif -#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ -#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ -#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ -#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ -#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ -#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ -#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ -#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ -#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ -#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ -#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ -#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ -#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ -#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ -#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ -#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ -#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ -#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ -#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */ -#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* CPU may incur #MC if non-TD software does partial write to TDX private memory */ +#define X86_BUG_NULL_SEG X86_BUG(10) /* "null_seg" Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* "swapgs_fence" SWAPGS without input dep on GS */ +#define X86_BUG_MONITOR X86_BUG(12) /* "monitor" IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* "amd_e400" CPU is among the affected by Erratum 400 */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* "cpu_meltdown" CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* "spectre_v1" CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* "spectre_v2" CPU is affected by Spectre variant 2 attack with indirect branches */ +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* "spec_store_bypass" CPU is affected by speculative store bypass attack */ +#define X86_BUG_L1TF X86_BUG(18) /* "l1tf" CPU is affected by L1 Terminal Fault */ +#define X86_BUG_MDS X86_BUG(19) /* "mds" CPU is affected by Microarchitectural data sampling */ +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* "msbds_only" CPU is only affected by the MSDBS variant of BUG_MDS */ +#define X86_BUG_SWAPGS X86_BUG(21) /* "swapgs" CPU is affected by speculation through SWAPGS */ 
+#define X86_BUG_TAA X86_BUG(22) /* "taa" CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* "itlb_multihit" CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* "srbds" CPU may leak RNG bits if not mitigated */ +#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* "mmio_stale_data" CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */ +#define X86_BUG_RETBLEED X86_BUG(27) /* "retbleed" CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* "eibrs_pbrsb" EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_SMT_RSB X86_BUG(29) /* "smt_rsb" CPU is vulnerable to Cross-Thread Return Address Predictions */ +#define X86_BUG_GDS X86_BUG(30) /* "gds" CPU is affected by Gather Data Sampling */ +#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* "tdx_pw_mce" CPU may incur #MC if non-TD software does partial write to TDX private memory */ /* BUG word 2 */ -#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ -#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ -#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ -#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ +#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* "srso" AMD SRSO bug */ +#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* "div0" AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index e022e6eb766c..82c6a4d350e0 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -566,6 +566,12 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* V6 PMON MSR range */ +#define MSR_IA32_PMC_V6_GP0_CTR 0x1900 +#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_STEP 4 + /* KeyID partitioning between MKTME and TDX */ #define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087 @@ -660,6 +666,8 @@ #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 +#define MSR_SVSM_CAA 0xc001f000 + /* AMD Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 @@ -781,6 +789,8 @@ #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 +#define MSR_K7_HWCR_CPB_DIS_BIT 25 +#define MSR_K7_HWCR_CPB_DIS BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT) /* K6 MSRs */ #define MSR_K6_WHCR 0xc0000082 @@ -1164,6 +1174,7 @@ #define MSR_IA32_QM_CTR 0xc8e #define MSR_IA32_PQR_ASSOC 0xc8f #define MSR_IA32_L3_CBM_BASE 0xc90 +#define MSR_RMID_SNC_CONFIG 0xca0 #define MSR_IA32_L2_CBM_BASE 0xd10 #define MSR_IA32_MBA_THRTL_BASE 0xd50 From d5b854893d27b4030943a10cf28a07189aab0c36 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 0588/1052] tools/include: Sync arm64 headers with the kernel sources To pick up changes from: 9ef54a384526 arm64: cputype: Add Cortex-A725 definitions 58d245e03c32 arm64: cputype: Add Cortex-X1C definitions fd2ff5f0b320 arm64: 
cputype: Add Cortex-X925 definitions add332c40328 arm64: cputype: Add Cortex-A720 definitions be5a6f238700 arm64: cputype: Add Cortex-X3 definitions This should be used to beautify x86 syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Namhyung Kim --- tools/arch/arm64/include/asm/cputype.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 7b32b99023a2..5fd7caea4419 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -86,9 +86,14 @@ #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 #define ARM_CPU_PART_CORTEX_A78C 0xD4B +#define ARM_CPU_PART_CORTEX_X1C 0xD4C +#define ARM_CPU_PART_CORTEX_X3 0xD4E #define ARM_CPU_PART_NEOVERSE_V2 0xD4F +#define ARM_CPU_PART_CORTEX_A720 0xD81 #define ARM_CPU_PART_CORTEX_X4 0xD82 #define ARM_CPU_PART_NEOVERSE_V3 0xD84 +#define ARM_CPU_PART_CORTEX_X925 0xD85 +#define ARM_CPU_PART_CORTEX_A725 0xD87 #define APM_CPU_PART_XGENE 0x000 #define APM_CPU_VAR_POTENZA 0x00 @@ -162,9 +167,14 @@ #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) +#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) +#define MIDR_CORTEX_X3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X3) #define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) +#define MIDR_CORTEX_A720 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720) #define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4) #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) +#define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925) +#define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) From ab84ba647f2c94ac4d0c3fc6951c49f08aa1fcf7 Mon Sep 17 00:00:00 2001 From: Zhiquan Li Date: Mon, 5 Aug 2024 18:35:31 +0800 Subject: [PATCH 0589/1052] x86/acpi: Remove __ro_after_init from acpi_mp_wake_mailbox On a platform using the "Multiprocessor Wakeup Structure"[1] to startup secondary CPUs the control processor needs to memremap() the physical address of the MP Wakeup Structure mailbox to the variable acpi_mp_wake_mailbox, which holds the virtual address of mailbox. To wake up the AP the control processor writes the APIC ID of AP, the wakeup vector and the ACPI_MP_WAKE_COMMAND_WAKEUP command into the mailbox. Current implementation doesn't consider the case which restricts boot time CPU bringup to 1 with the kernel parameter "maxcpus=1" and brings other CPUs online later from user space as it sets acpi_mp_wake_mailbox to read-only after init. 
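For reference, the wakeup sequence described above looks roughly like this
(a simplified sketch for illustration only, not the exact kernel code; the
field names follow the ACPI MADT multiprocessor wakeup mailbox layout, see
arch/x86/kernel/acpi/madt_wakeup.c for the real implementation):

	/* Map the mailbox on first use; this assignment is the write that
	 * faults once acpi_mp_wake_mailbox is marked __ro_after_init.
	 */
	if (!acpi_mp_wake_mailbox)
		acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
						sizeof(*acpi_mp_wake_mailbox),
						MEMREMAP_WB);

	/* Tell the firmware which AP to kick and where it should start. */
	acpi_mp_wake_mailbox->apic_id = apicid;
	acpi_mp_wake_mailbox->wakeup_vector = start_ip;
	WRITE_ONCE(acpi_mp_wake_mailbox->command, ACPI_MP_WAKE_COMMAND_WAKEUP);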
So when the first AP is brought online after init, the attempt to update
the variable results in a kernel panic.

The memremap() call that initializes the variable cannot be moved into
acpi_parse_mp_wake() because memremap() is not functional at that point in
the boot process. Also, as the APs might never be brought up, keep the
memremap() call in acpi_wakeup_cpu() so that the operation only takes place
when needed.

Fixes: 24dd05da8c79 ("x86/apic: Mark acpi_mp_wake_* variables as __ro_after_init")
Signed-off-by: Zhiquan Li
Signed-off-by: Thomas Gleixner
Reviewed-by: Kirill A. Shutemov
Link: https://lore.kernel.org/all/20240805103531.1230635-1-zhiquan1.li@intel.com
---
 arch/x86/kernel/acpi/madt_wakeup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index 6cfe762be28b..d5ef6215583b 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -19,7 +19,7 @@ static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
 
 /* Virtual address of the Multiprocessor Wakeup Structure mailbox */
-static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
 
 static u64 acpi_mp_pgd __ro_after_init;
 static u64 acpi_mp_reset_vector_paddr __ro_after_init;

From e639222a51196c69c70b49b67098ce2f9919ed08 Mon Sep 17 00:00:00 2001
From: Chen Yu
Date: Tue, 6 Aug 2024 19:22:07 +0800
Subject: [PATCH 0590/1052] x86/paravirt: Fix incorrect virt spinlock setting
 on bare metal

The kernel can change spinlock behavior when running as a guest. But this
guest-friendly behavior causes performance problems on bare metal. The
kernel uses a static key to switch between the two modes.

In theory, the static key is enabled by default (run in guest mode) and
should be disabled for bare metal (and in some guests that want native
behavior or paravirt spinlock).

A performance drop is reported when running an encode/decode workload and
the BenchSEE cache sub-workload. A bisect points to commit ce0a1b608bfc
("x86/paravirt: Silence unused native_pv_lock_init() function warning").
When CONFIG_PARAVIRT_SPINLOCKS is disabled, virt_spin_lock_key is
incorrectly set to true on bare metal, so the qspinlock degenerates to a
test-and-set spinlock, which decreases performance on bare metal.

Set the default value of virt_spin_lock_key to false. If booting in a VM,
enable the key. Later during VM initialization, if another, more efficient
spinlock is preferred (e.g. paravirt spinlocks), or the user wants the
native qspinlock (via the "nopvspin" boot command line), virt_spin_lock_key
is disabled accordingly.
This results in the following decision matrix: X86_FEATURE_HYPERVISOR Y Y Y N CONFIG_PARAVIRT_SPINLOCKS Y Y N Y/N PV spinlock Y N N Y/N virt_spin_lock_key N Y/N Y N Fixes: ce0a1b608bfc ("x86/paravirt: Silence unused native_pv_lock_init() function warning") Reported-by: Prem Nath Dey Reported-by: Xiaoping Zhou Suggested-by: Dave Hansen Suggested-by: Qiuxu Zhuo Suggested-by: Nikolay Borisov Signed-off-by: Chen Yu Signed-off-by: Thomas Gleixner Reviewed-by: Nikolay Borisov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240806112207.29792-1-yu.c.chen@intel.com --- arch/x86/include/asm/qspinlock.h | 12 +++++++----- arch/x86/kernel/paravirt.c | 7 +++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index a053c1293975..68da67df304d 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -66,13 +66,15 @@ static inline bool vcpu_is_preempted(long cpu) #ifdef CONFIG_PARAVIRT /* - * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack. + * virt_spin_lock_key - disables by default the virt_spin_lock() hijack. * - * Native (and PV wanting native due to vCPU pinning) should disable this key. - * It is done in this backwards fashion to only have a single direction change, - * which removes ordering between native_pv_spin_init() and HV setup. + * Native (and PV wanting native due to vCPU pinning) should keep this key + * disabled. Native does not touch the key. + * + * When in a guest then native_pv_lock_init() enables the key first and + * KVM/XEN might conditionally disable it later in the boot process again. */ -DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); +DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); /* * Shortcut for the queued_spin_lock_slowpath() function that allows diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 5358d43886ad..fec381533555 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -51,13 +51,12 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif -DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); +DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && - !boot_cpu_has(X86_FEATURE_HYPERVISOR)) - static_branch_disable(&virt_spin_lock_key); + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_enable(&virt_spin_lock_key); } static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) From fa4e5afa9758e6567a6a54ea7bf53d570ec94881 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 6 Aug 2024 16:09:02 -0700 Subject: [PATCH 0591/1052] bpf: Move bpf_get_file_xattr to fs/bpf_fs_kfuncs.c We are putting all fs kfuncs in fs/bpf_fs_kfuncs.c. Move existing bpf_get_file_xattr to it. 
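For reference, a minimal sketch of how a sleepable LSM program can call
this kfunc (modeled on the existing selftest; the hook, map name, buffer
size and xattr name below are illustrative assumptions, not part of this
patch):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	/* Ring buffer used only as backing storage for the output dynptr. */
	struct {
		__uint(type, BPF_MAP_TYPE_RINGBUF);
		__uint(max_entries, 4096);
	} ringbuf SEC(".maps");

	extern int bpf_get_file_xattr(struct file *file, const char *name__str,
				      struct bpf_dynptr *value_p) __ksym;

	SEC("lsm.s/file_open")
	int BPF_PROG(read_user_xattr, struct file *file)
	{
		struct bpf_dynptr value;
		int ret;

		/* Reserve scratch space and hand it to the kfunc as a dynptr. */
		bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &value);
		ret = bpf_get_file_xattr(file, "user.example", &value);
		if (ret >= 0) {
			/* ... inspect the xattr value via the dynptr ... */
		}
		bpf_ringbuf_discard_dynptr(&value, 0);
		return 0;
	}

The kfunc interface is unchanged by the move, so programs like the above
should keep working as-is.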
Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240806230904.71194-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- fs/bpf_fs_kfuncs.c | 38 ++++++++++++++++++++++ kernel/trace/bpf_trace.c | 68 ---------------------------------------- 2 files changed, 38 insertions(+), 68 deletions(-) diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 1e6e08667758..b13d00f7ad2b 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -8,6 +8,7 @@ #include #include #include +#include __bpf_kfunc_start_defs(); @@ -92,6 +93,42 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) return len; } +/** + * bpf_get_file_xattr - get xattr of a file + * @file: file to get xattr from + * @name__str: name of the xattr + * @value_p: output buffer of the xattr value + * + * Get xattr *name__str* of *file* and store the output in *value_ptr*. + * + * For security reasons, only *name__str* with prefix "user." is allowed. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, + struct bpf_dynptr *value_p) +{ + struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; + struct dentry *dentry; + u32 value_len; + void *value; + int ret; + + if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EPERM; + + value_len = __bpf_dynptr_size(value_ptr); + value = __bpf_dynptr_data_rw(value_ptr, value_len); + if (!value) + return -EINVAL; + + dentry = file_dentry(file); + ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ); + if (ret) + return ret; + return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) @@ -99,6 +136,7 @@ BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cd098846e251..d557bb11e0ff 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -24,7 +24,6 @@ #include #include #include -#include #include @@ -1439,73 +1438,6 @@ static int __init bpf_key_sig_kfuncs_init(void) late_initcall(bpf_key_sig_kfuncs_init); #endif /* CONFIG_KEYS */ -/* filesystem kfuncs */ -__bpf_kfunc_start_defs(); - -/** - * bpf_get_file_xattr - get xattr of a file - * @file: file to get xattr from - * @name__str: name of the xattr - * @value_p: output buffer of the xattr value - * - * Get xattr *name__str* of *file* and store the output in *value_ptr*. - * - * For security reasons, only *name__str* with prefix "user." is allowed. - * - * Return: 0 on success, a negative value on error. 
- */ -__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, - struct bpf_dynptr *value_p) -{ - struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; - struct dentry *dentry; - u32 value_len; - void *value; - int ret; - - if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EPERM; - - value_len = __bpf_dynptr_size(value_ptr); - value = __bpf_dynptr_data_rw(value_ptr, value_len); - if (!value) - return -EINVAL; - - dentry = file_dentry(file); - ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ); - if (ret) - return ret; - return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len); -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(fs_kfunc_set_ids) -BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_KFUNCS_END(fs_kfunc_set_ids) - -static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id) -{ - if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id)) - return 0; - - /* Only allow to attach from LSM hooks, to avoid recursion */ - return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0; -} - -static const struct btf_kfunc_id_set bpf_fs_kfunc_set = { - .owner = THIS_MODULE, - .set = &fs_kfunc_set_ids, - .filter = bpf_get_file_xattr_filter, -}; - -static int __init bpf_fs_kfuncs_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set); -} - -late_initcall(bpf_fs_kfuncs_init); - static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { From ac13a4261afe81ca423eddd8e6571078fe2a7cea Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 6 Aug 2024 16:09:03 -0700 Subject: [PATCH 0592/1052] bpf: Add kfunc bpf_get_dentry_xattr() to read xattr from dentry This kfunc can be used in LSM hooks with dentry, such as: security_inode_listxattr security_inode_permission and many more. Acked-by: Christian Brauner Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240806230904.71194-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- fs/bpf_fs_kfuncs.c | 56 +++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index b13d00f7ad2b..3fe9f59ef867 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -93,6 +93,44 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) return len; } +/** + * bpf_get_dentry_xattr - get xattr of a dentry + * @dentry: dentry to get xattr from + * @name__str: name of the xattr + * @value_p: output buffer of the xattr value + * + * Get xattr *name__str* of *dentry* and store the output in *value_ptr*. + * + * For security reasons, only *name__str* with prefix "user." is allowed. + * + * Return: 0 on success, a negative value on error. 
+ */ +__bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__str, + struct bpf_dynptr *value_p) +{ + struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; + struct inode *inode = d_inode(dentry); + u32 value_len; + void *value; + int ret; + + if (WARN_ON(!inode)) + return -EINVAL; + + if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EPERM; + + value_len = __bpf_dynptr_size(value_ptr); + value = __bpf_dynptr_data_rw(value_ptr, value_len); + if (!value) + return -EINVAL; + + ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ); + if (ret) + return ret; + return __vfs_getxattr(dentry, inode, name__str, value, value_len); +} + /** * bpf_get_file_xattr - get xattr of a file * @file: file to get xattr from @@ -108,25 +146,10 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, struct bpf_dynptr *value_p) { - struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; struct dentry *dentry; - u32 value_len; - void *value; - int ret; - - if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EPERM; - - value_len = __bpf_dynptr_size(value_ptr); - value = __bpf_dynptr_data_rw(value_ptr, value_len); - if (!value) - return -EINVAL; dentry = file_dentry(file); - ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ); - if (ret) - return ret; - return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len); + return bpf_get_dentry_xattr(dentry, name__str, value_p); } __bpf_kfunc_end_defs(); @@ -136,6 +159,7 @@ BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) From 8681156c0939a7511c47b9cc462390a83f0e846a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 6 Aug 2024 16:09:04 -0700 Subject: [PATCH 0593/1052] selftests/bpf: Add tests for bpf_get_dentry_xattr Add test for bpf_get_dentry_xattr on hook security_inode_getxattr. Verify that the kfunc can read the xattr. Also test failing getxattr from user space by returning non-zero from the LSM bpf program. Acked-by: Christian Brauner Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240806230904.71194-4-song@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_kfuncs.h | 11 +++++- .../selftests/bpf/prog_tests/fs_kfuncs.c | 9 ++++- .../selftests/bpf/progs/test_get_xattr.c | 37 ++++++++++++++++--- 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 3b6675ab4086..2eb3483f2fb0 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -45,7 +45,7 @@ extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clo /* Description * Modify the address of a AF_UNIX sockaddr. - * Returns__bpf_kfunc + * Returns * -EINVAL if the address size is too big or, 0 if the sockaddr was successfully modified. 
*/ extern int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, @@ -78,4 +78,13 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, extern bool bpf_session_is_return(void) __ksym __weak; extern __u64 *bpf_session_cookie(void) __ksym __weak; + +struct dentry; +/* Description + * Returns xattr of a dentry + * Returns + * Error code + */ +extern int bpf_get_dentry_xattr(struct dentry *dentry, const char *name, + struct bpf_dynptr *value_ptr) __ksym __weak; #endif diff --git a/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c index 37056ba73847..5a0b51157451 100644 --- a/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c @@ -16,6 +16,7 @@ static void test_xattr(void) { struct test_get_xattr *skel = NULL; int fd = -1, err; + int v[32]; fd = open(testfile, O_CREAT | O_RDONLY, 0644); if (!ASSERT_GE(fd, 0, "create_file")) @@ -50,7 +51,13 @@ static void test_xattr(void) if (!ASSERT_GE(fd, 0, "open_file")) goto out; - ASSERT_EQ(skel->bss->found_xattr, 1, "found_xattr"); + ASSERT_EQ(skel->bss->found_xattr_from_file, 1, "found_xattr_from_file"); + + /* Trigger security_inode_getxattr */ + err = getxattr(testfile, "user.kfuncs", v, sizeof(v)); + ASSERT_EQ(err, -1, "getxattr_return"); + ASSERT_EQ(errno, EINVAL, "getxattr_errno"); + ASSERT_EQ(skel->bss->found_xattr_from_dentry, 1, "found_xattr_from_dentry"); out: close(fd); diff --git a/tools/testing/selftests/bpf/progs/test_get_xattr.c b/tools/testing/selftests/bpf/progs/test_get_xattr.c index 7eb2a4e5a3e5..66e737720f7c 100644 --- a/tools/testing/selftests/bpf/progs/test_get_xattr.c +++ b/tools/testing/selftests/bpf/progs/test_get_xattr.c @@ -2,6 +2,7 @@ /* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
*/ #include "vmlinux.h" +#include #include #include #include "bpf_kfuncs.h" @@ -9,10 +10,12 @@ char _license[] SEC("license") = "GPL"; __u32 monitored_pid; -__u32 found_xattr; +__u32 found_xattr_from_file; +__u32 found_xattr_from_dentry; static const char expected_value[] = "hello"; -char value[32]; +char value1[32]; +char value2[32]; SEC("lsm.s/file_open") int BPF_PROG(test_file_open, struct file *f) @@ -25,13 +28,37 @@ int BPF_PROG(test_file_open, struct file *f) if (pid != monitored_pid) return 0; - bpf_dynptr_from_mem(value, sizeof(value), 0, &value_ptr); + bpf_dynptr_from_mem(value1, sizeof(value1), 0, &value_ptr); ret = bpf_get_file_xattr(f, "user.kfuncs", &value_ptr); if (ret != sizeof(expected_value)) return 0; - if (bpf_strncmp(value, ret, expected_value)) + if (bpf_strncmp(value1, ret, expected_value)) return 0; - found_xattr = 1; + found_xattr_from_file = 1; return 0; } + +SEC("lsm.s/inode_getxattr") +int BPF_PROG(test_inode_getxattr, struct dentry *dentry, char *name) +{ + struct bpf_dynptr value_ptr; + __u32 pid; + int ret; + + pid = bpf_get_current_pid_tgid() >> 32; + if (pid != monitored_pid) + return 0; + + bpf_dynptr_from_mem(value2, sizeof(value2), 0, &value_ptr); + + ret = bpf_get_dentry_xattr(dentry, "user.kfuncs", &value_ptr); + if (ret != sizeof(expected_value)) + return 0; + if (bpf_strncmp(value2, ret, expected_value)) + return 0; + found_xattr_from_dentry = 1; + + /* return non-zero to fail getxattr from user space */ + return -EINVAL; +} From 10f2ad032defe906240d0c3b62dcbceace96b230 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 7 Aug 2024 12:51:44 +0100 Subject: [PATCH 0594/1052] KVM: arm64: Enforce dependency on an ARMv8.4-aware toolchain With the NV support of TLBI-range operations, KVM makes use of instructions that are only supported by binutils versions >= 2.30. This breaks the build for very old toolchains. Make KVM support conditional on having ARMv8.4 support in the assembler, side-stepping the issue. Fixes: 5d476ca57d7d ("KVM: arm64: nv: Add handling of range-based TLBI operations") Reported-by: Viresh Kumar Suggested-by: Arnd Bergmann Signed-off-by: Marc Zyngier Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240807115144.3237260-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 58f09370d17e..8304eb342be9 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -19,6 +19,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" + depends on AS_HAS_ARMV8_4 select KVM_COMMON select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_MMU_NOTIFIER From 01ab08cafeced7ae1d6c01a08218742c8182f8da Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 7 Aug 2024 13:20:24 +0800 Subject: [PATCH 0595/1052] KVM: arm64: vgic-debug: Exit the iterator properly w/o LPI In case the guest doesn't have any LPI, we previously relied on the iterator setting 'intid = nr_spis + VGIC_NR_PRIVATE_IRQS' && 'lpi_idx = 1' to exit the iterator. But it was broken with commit 85d3ccc8b75b ("KVM: arm64: vgic-debug: Use an xarray mark for debug iterator") -- the intid remains at 'nr_spis + VGIC_NR_PRIVATE_IRQS - 1', and we end up endlessly printing the last SPI's state. Consider that it's meaningless to search the LPI xarray and populate lpi_idx when there is no LPI, let's just skip the process for that case. 
The result is that * If there's no LPI, we focus on the intid and exit the iterator when it runs out of the valid SPI range. * Otherwise we keep the current logic and let the xarray drive the iterator. Fixes: 85d3ccc8b75b ("KVM: arm64: vgic-debug: Use an xarray mark for debug iterator") Signed-off-by: Zenghui Yu Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240807052024.2084-1-yuzenghui@huawei.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index bcbc8c986b1d..bc74d06398ef 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -45,7 +45,8 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) * Let the xarray drive the iterator after the last SPI, as the iterator * has exhausted the sequentially-allocated INTID space. */ - if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1)) { + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) && + iter->nr_lpis) { if (iter->lpi_idx < iter->nr_lpis) xa_find_after(&dist->lpi_xa, &iter->intid, VGIC_LPI_MAX_INTID, @@ -112,7 +113,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter) return iter->dist_id > 0 && iter->vcpu_id == iter->nr_cpus && iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && - iter->lpi_idx > iter->nr_lpis; + (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis); } static void *vgic_debug_start(struct seq_file *s, loff_t *pos) From 7e814a20f6da2bd2044b1a4682dd92a6f0df5a92 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 22 Jul 2024 17:33:11 +0100 Subject: [PATCH 0596/1052] KVM: arm64: Tidying up PAuth code in KVM Tidy up some of the PAuth trapping code to clear up some comments and avoid clang/checkpatch warnings. Also, don't bother setting PAuth HCR_EL2 bits in pKVM, since it's handled by the hypervisor. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20240722163311.1493879-1-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_ptrauth.h | 2 +- arch/arm64/kvm/arm.c | 14 ++++---------- arch/arm64/kvm/hyp/include/hyp/switch.h | 1 - arch/arm64/kvm/hyp/nvhe/switch.c | 5 ++--- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h index d81bac256abc..6199c9f7ec6e 100644 --- a/arch/arm64/include/asm/kvm_ptrauth.h +++ b/arch/arm64/include/asm/kvm_ptrauth.h @@ -104,7 +104,7 @@ alternative_else_nop_endif #define __ptrauth_save_key(ctxt, key) \ do { \ - u64 __val; \ + u64 __val; \ __val = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \ ctxt_sys_reg(ctxt, key ## KEYLO_EL1) = __val; \ __val = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 23e1fa56c02d..9bef7638342e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -522,10 +522,10 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) { - if (vcpu_has_ptrauth(vcpu)) { + if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) { /* - * Either we're running running an L2 guest, and the API/APK - * bits come from L1's HCR_EL2, or API/APK are both set. + * Either we're running an L2 guest, and the API/APK bits come + * from L1's HCR_EL2, or API/APK are both set. 
*/ if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) { u64 val; @@ -542,16 +542,10 @@ static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) * Save the host keys if there is any chance for the guest * to use pauth, as the entry code will reload the guest * keys in that case. - * Protected mode is the exception to that rule, as the - * entry into the EL2 code eagerly switch back and forth - * between host and hyp keys (and kvm_hyp_ctxt is out of - * reach anyway). */ - if (is_protected_kvm_enabled()) - return; - if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) { struct kvm_cpu_context *ctxt; + ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt); ptrauth_save_keys(ctxt); } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f59ccfe11ab9..37ff87d782b6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6af179c6356d..8f5c56d5b1cd 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -173,9 +173,8 @@ static void __pmu_switch_to_host(struct kvm_vcpu *vcpu) static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) { /* - * Make sure we handle the exit for workarounds and ptrauth - * before the pKVM handling, as the latter could decide to - * UNDEF. + * Make sure we handle the exit for workarounds before the pKVM + * handling, as the latter could decide to UNDEF. */ return (kvm_hyp_handle_sysreg(vcpu, exit_code) || kvm_handle_pvm_sysreg(vcpu, exit_code)); From ad518452fd263766946346324810f14bd8bb8b34 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 31 Jul 2024 17:21:13 +0100 Subject: [PATCH 0597/1052] KVM: selftests: arm64: Correct feature test for S1PIE in get-reg-list The ID register for S1PIE is ID_AA64MMFR3_EL1.S1PIE which is bits 11:8 but get-reg-list uses a shift of 4, checking SCTLRX instead. Use a shift of 8 instead. Fixes: 5f0419a0083b ("KVM: selftests: get-reg-list: add Permission Indirection registers") Signed-off-by: Mark Brown Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20240731-kvm-arm64-fix-s1pie-test-v1-1-a9253f3b7db4@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 709d7d721760..4abebde78187 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -32,13 +32,13 @@ static struct feature_id_reg feat_id_regs[] = { { ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 4, + 8, 1 }, { ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 4, + 8, 1 } }; From 6dd1e4c045afa6a4ba5d46f044c83bd357c593c2 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 7 Aug 2024 17:00:56 +0800 Subject: [PATCH 0598/1052] selinux: add the processing of the failure of avc_add_xperms_decision() When avc_add_xperms_decision() fails, the information recorded by the new avc node is incomplete. In this case, the new avc node should be released instead of replacing the old avc node. 
Cc: stable@vger.kernel.org Fixes: fa1aa143ac4a ("selinux: extended permissions for ioctls") Suggested-by: Stephen Smalley Signed-off-by: Zhen Lei Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/avc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 7087cd2b802d..b49c44869dc4 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -907,7 +907,11 @@ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, node->ae.avd.auditdeny &= ~perms; break; case AVC_CALLBACK_ADD_XPERMS: - avc_add_xperms_decision(node, xpd); + rc = avc_add_xperms_decision(node, xpd); + if (rc) { + avc_node_kill(node); + goto out_unlock; + } break; } avc_node_replace(node, orig); From e037a26ead187901f83cad9c503ccece5ff6817a Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Sat, 6 Jul 2024 11:38:07 -0400 Subject: [PATCH 0599/1052] igc: Fix packet still tx after gate close by reducing i226 MAC retry buffer Testing uncovered that even when the taprio gate is closed, some packets still transmit. According to i225/6 hardware errata [1], traffic might overflow the planned QBV window. This happens because MAC maintains an internal buffer, primarily for supporting half duplex retries. Therefore, even when the gate closes, residual MAC data in the buffer may still transmit. To mitigate this for i226, reduce the MAC's internal buffer from 192 bytes to the recommended 88 bytes by modifying the RETX_CTL register value. This follows guidelines from: [1] Ethernet Controller I225/I22 Spec Update Rev 2.1 Errata Item 9: TSN: Packet Transmission Might Cross Qbv Window [2] I225/6 SW User Manual Rev 1.2.4: Section 8.11.5 Retry Buffer Control Note that the RETX_CTL register can't be used in TSN mode because half duplex feature cannot coexist with TSN. Test Steps: 1. Send taprio cmd to board A: tc qdisc replace dev enp1s0 parent root handle 100 taprio \ num_tc 4 \ map 3 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \ queues 1@0 1@1 1@2 1@3 \ base-time 0 \ sched-entry S 0x07 500000 \ sched-entry S 0x0f 500000 \ flags 0x2 \ txtime-delay 0 Note that for TC3, gate should open for 500us and close for another 500us. 3. Take tcpdump log on Board B. 4. Send udp packets via UDP tai app from Board A to Board B. 5. Analyze tcpdump log via wireshark log on Board B. Ensure that the total time from the first to the last packet received during one cycle for TC3 does not exceed 500us. 
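To make the RETX_CTL programming in this change concrete: the driver re-uses the retry-buffer watermark that the hardware already reports in bits 3:0 of RETX_CTL. Purely as an illustration, if that field reads 0xA, the value written back is (0xA << 8) | 0x1000 | 0xA, i.e. QBVFULLTH is set equal to the watermark and QBVFULLEN arms the threshold; restoring the default later writes just the masked watermark back on its own.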
Fixes: 43546211738e ("igc: Add new device ID's") Signed-off-by: Faizal Rahim Acked-by: Vinicius Costa Gomes Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_defines.h | 6 ++++ drivers/net/ethernet/intel/igc/igc_tsn.c | 34 ++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 5f92b3c7c3d4..511384f3ec5c 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -404,6 +404,12 @@ #define IGC_DTXMXPKTSZ_TSN 0x19 /* 1600 bytes of max TX DMA packet size */ #define IGC_DTXMXPKTSZ_DEFAULT 0x98 /* 9728-byte Jumbo frames */ +/* Retry Buffer Control */ +#define IGC_RETX_CTL 0x041C +#define IGC_RETX_CTL_WATERMARK_MASK 0xF +#define IGC_RETX_CTL_QBVFULLTH_SHIFT 8 /* QBV Retry Buffer Full Threshold */ +#define IGC_RETX_CTL_QBVFULLEN 0x1000 /* Enable QBV Retry Buffer Full Threshold */ + /* Transmit Scheduling Latency */ /* Latency between transmission scheduling (LaunchTime) and the time * the packet is transmitted to the network in nanosecond. diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 22cefb1eeedf..46d4c3275bbb 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -78,6 +78,15 @@ void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter) wr32(IGC_GTXOFFSET, txoffset); } +static void igc_tsn_restore_retx_default(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + u32 retxctl; + + retxctl = rd32(IGC_RETX_CTL) & IGC_RETX_CTL_WATERMARK_MASK; + wr32(IGC_RETX_CTL, retxctl); +} + /* Returns the TSN specific registers to their default values after * the adapter is reset. 
*/ @@ -91,6 +100,9 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) wr32(IGC_TXPBS, I225_TXPBSIZE_DEFAULT); wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_DEFAULT); + if (igc_is_device_id_i226(hw)) + igc_tsn_restore_retx_default(adapter); + tqavctrl = rd32(IGC_TQAVCTRL); tqavctrl &= ~(IGC_TQAVCTRL_TRANSMIT_MODE_TSN | IGC_TQAVCTRL_ENHANCED_QAV | IGC_TQAVCTRL_FUTSCDDIS); @@ -111,6 +123,25 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) return 0; } +/* To partially fix i226 HW errata, reduce MAC internal buffering from 192 Bytes + * to 88 Bytes by setting RETX_CTL register using the recommendation from: + * a) Ethernet Controller I225/I226 Specification Update Rev 2.1 + * Item 9: TSN: Packet Transmission Might Cross the Qbv Window + * b) I225/6 SW User Manual Rev 1.2.4: Section 8.11.5 Retry Buffer Control + */ +static void igc_tsn_set_retx_qbvfullthreshold(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + u32 retxctl, watermark; + + retxctl = rd32(IGC_RETX_CTL); + watermark = retxctl & IGC_RETX_CTL_WATERMARK_MASK; + /* Set QBVFULLTH value using watermark and set QBVFULLEN */ + retxctl |= (watermark << IGC_RETX_CTL_QBVFULLTH_SHIFT) | + IGC_RETX_CTL_QBVFULLEN; + wr32(IGC_RETX_CTL, retxctl); +} + static int igc_tsn_enable_offload(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; @@ -123,6 +154,9 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_TSN); wr32(IGC_TXPBS, IGC_TXPBSIZE_TSN); + if (igc_is_device_id_i226(hw)) + igc_tsn_set_retx_qbvfullthreshold(adapter); + for (i = 0; i < adapter->num_tx_queues; i++) { struct igc_ring *ring = adapter->tx_ring[i]; u32 txqctl = 0; From f8d6acaee9d35cbff3c3cfad94641666c596f8da Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Sun, 7 Jul 2024 08:53:16 -0400 Subject: [PATCH 0600/1052] igc: Fix qbv_config_change_errors logics When user issues these cmds: 1. Either a) or b) a) mqprio with hardware offload disabled b) taprio with txtime-assist feature enabled 2. etf 3. tc qdisc delete 4. taprio with base time in the past At step 4, qbv_config_change_errors wrongly increased by 1. Excerpt from IEEE 802.1Q-2018 8.6.9.3.1: "If AdminBaseTime specifies a time in the past, and the current schedule is running, then: Increment ConfigChangeError counter" qbv_config_change_errors should only increase if base time is in the past and no taprio is active. In user perspective, taprio was not active when first triggered at step 4. However, i225/6 reuses qbv for etf, so qbv is enabled with a dummy schedule at step 2 where it enters igc_tsn_enable_offload() and qbv_count got incremented to 1. At step 4, it enters igc_tsn_enable_offload() again, qbv_count is incremented to 2. Because taprio is running, tc_setup_type is TC_SETUP_QDISC_ETF and qbv_count > 1, qbv_config_change_errors value got incremented. This issue happens due to reliance on qbv_count field where a non-zero value indicates that taprio is running. But qbv_count increases regardless if taprio is triggered by user or by other tsn feature. It does not align with qbv_config_change_errors expectation where it is only concerned with taprio triggered by user. Fixing this by relocating the qbv_config_change_errors logic to igc_save_qbv_schedule(), eliminating reliance on qbv_count and its inaccuracies from i225/6's multiple uses of qbv feature for other TSN features. 
The new function created: igc_tsn_is_taprio_activated_by_user() uses taprio_offload_enable field to indicate that the current running taprio was triggered by user, instead of triggered by non-qbv feature like etf. Fixes: ae4fe4698300 ("igc: Add qbv_config_change_errors counter") Signed-off-by: Faizal Rahim Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 8 ++++++-- drivers/net/ethernet/intel/igc/igc_tsn.c | 16 ++++++++-------- drivers/net/ethernet/intel/igc/igc_tsn.h | 1 + 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 8daf938afc36..dfd6c00b4205 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6315,12 +6315,16 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, if (!validate_schedule(adapter, qopt)) return -EINVAL; + igc_ptp_read(adapter, &now); + + if (igc_tsn_is_taprio_activated_by_user(adapter) && + is_base_time_past(qopt->base_time, &now)) + adapter->qbv_config_change_errors++; + adapter->cycle_time = qopt->cycle_time; adapter->base_time = qopt->base_time; adapter->taprio_offload_enable = true; - igc_ptp_read(adapter, &now); - for (n = 0; n < qopt->num_entries; n++) { struct tc_taprio_sched_entry *e = &qopt->entries[n]; diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 46d4c3275bbb..8ed7b965484d 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -87,6 +87,14 @@ static void igc_tsn_restore_retx_default(struct igc_adapter *adapter) wr32(IGC_RETX_CTL, retxctl); } +bool igc_tsn_is_taprio_activated_by_user(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + + return (rd32(IGC_BASET_H) || rd32(IGC_BASET_L)) && + adapter->taprio_offload_enable; +} + /* Returns the TSN specific registers to their default values after * the adapter is reset. */ @@ -296,14 +304,6 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) s64 n = div64_s64(ktime_sub_ns(systim, base_time), cycle); base_time = ktime_add_ns(base_time, (n + 1) * cycle); - - /* Increase the counter if scheduling into the past while - * Gate Control List (GCL) is running. - */ - if ((rd32(IGC_BASET_H) || rd32(IGC_BASET_L)) && - (adapter->tc_setup_type == TC_SETUP_QDISC_TAPRIO) && - (adapter->qbv_count > 1)) - adapter->qbv_config_change_errors++; } else { if (igc_is_device_id_i226(hw)) { ktime_t adjust_time, expires_time; diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.h b/drivers/net/ethernet/intel/igc/igc_tsn.h index b53e6af560b7..98ec845a86bf 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.h +++ b/drivers/net/ethernet/intel/igc/igc_tsn.h @@ -7,5 +7,6 @@ int igc_tsn_offload_apply(struct igc_adapter *adapter); int igc_tsn_reset(struct igc_adapter *adapter); void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter); +bool igc_tsn_is_taprio_activated_by_user(struct igc_adapter *adapter); #endif /* _IGC_BASE_H */ From 0afeaeb5dae86aceded0d5f0c3a54d27858c0c6f Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Sun, 7 Jul 2024 08:53:17 -0400 Subject: [PATCH 0601/1052] igc: Fix reset adapter logics when tx mode change Following the "igc: Fix TX Hang issue when QBV Gate is close" changes, remaining issues with the reset adapter logic in igc_tsn_offload_apply() have been observed: 1. 
The reset adapter logics for i225 and i226 differ, although they should be the same according to the guidelines in I225/6 HW Design Section 7.5.2.1 on software initialization during tx mode changes. 2. The i225 resets adapter every time, even though tx mode doesn't change. This occurs solely based on the condition igc_is_device_id_i225() when calling schedule_work(). 3. i226 doesn't reset adapter for tsn->legacy tx mode changes. It only resets adapter for legacy->tsn tx mode transitions. 4. qbv_count introduced in the patch is actually not needed; in this context, a non-zero value of qbv_count is used to indicate if tx mode was unconditionally set to tsn in igc_tsn_enable_offload(). This could be replaced by checking the existing register IGC_TQAVCTRL_TRANSMIT_MODE_TSN bit. This patch resolves all issues and enters schedule_work() to reset the adapter only when changing tx mode. It also removes reliance on qbv_count. qbv_count field will be removed in a future patch. Test ran: 1. Verify reset adapter behaviour in i225/6: a) Enrol a new GCL Reset adapter observed (tx mode change legacy->tsn) b) Enrol a new GCL without deleting qdisc No reset adapter observed (tx mode remain tsn->tsn) c) Delete qdisc Reset adapter observed (tx mode change tsn->legacy) 2. Tested scenario from "igc: Fix TX Hang issue when QBV Gate is closed" to confirm it remains resolved. Fixes: 175c241288c0 ("igc: Fix TX Hang issue when QBV Gate is closed") Signed-off-by: Faizal Rahim Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_tsn.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 8ed7b965484d..ada751430517 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -49,6 +49,13 @@ static unsigned int igc_tsn_new_flags(struct igc_adapter *adapter) return new_flags; } +static bool igc_tsn_is_tx_mode_in_tsn(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + + return !!(rd32(IGC_TQAVCTRL) & IGC_TQAVCTRL_TRANSMIT_MODE_TSN); +} + void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; @@ -365,15 +372,22 @@ int igc_tsn_reset(struct igc_adapter *adapter) return err; } +static bool igc_tsn_will_tx_mode_change(struct igc_adapter *adapter) +{ + bool any_tsn_enabled = !!(igc_tsn_new_flags(adapter) & + IGC_FLAG_TSN_ANY_ENABLED); + + return (any_tsn_enabled && !igc_tsn_is_tx_mode_in_tsn(adapter)) || + (!any_tsn_enabled && igc_tsn_is_tx_mode_in_tsn(adapter)); +} + int igc_tsn_offload_apply(struct igc_adapter *adapter) { - struct igc_hw *hw = &adapter->hw; - - /* Per I225/6 HW Design Section 7.5.2.1, transmit mode - * cannot be changed dynamically. Require reset the adapter. + /* Per I225/6 HW Design Section 7.5.2.1 guideline, if tx mode change + * from legacy->tsn or tsn->legacy, then reset adapter is needed. */ if (netif_running(adapter->netdev) && - (igc_is_device_id_i225(hw) || !adapter->qbv_count)) { + igc_tsn_will_tx_mode_change(adapter)) { schedule_work(&adapter->reset_task); return 0; } From 6c3fc0b1c3d073bd6fc3bf43dbd0e64240537464 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Sun, 7 Jul 2024 08:53:18 -0400 Subject: [PATCH 0602/1052] igc: Fix qbv tx latency by setting gtxoffset A large tx latency issue was discovered during testing when only QBV was enabled. 
The issue occurs because gtxoffset was not set when QBV is active; it was only set when launch time is active. The patch "igc: Correct the launchtime offset" only sets gtxoffset when the launchtime_enable field is set by the user. Enabling launchtime_enable ultimately sets the register IGC_TXQCTL_QUEUE_MODE_LAUNCHT (referred to as LaunchT in the SW user manual). Section 7.5.2.6 of the IGC i225/6 SW User Manual Rev 1.2.4 states: "The latency between transmission scheduling (launch time) and the time the packet is transmitted to the network is listed in Table 7-61." However, the patch misinterprets the phrase "launch time" in that section by assuming it specifically refers to the LaunchT register, whereas it actually denotes the generic term for when a packet is released from the internal buffer to the MAC transmit logic. This launch time, as per that section, also implicitly refers to the QBV gate open time, where a packet waits in the buffer for the QBV gate to open. Therefore, latency applies whenever QBV is in use. TSN features such as QBU and QAV reuse QBV, making the latency universal to TSN features. Discussed with i226 HW owner (Shalev, Avi) and we were in agreement that the term "launch time" used in Section 7.5.2.6 is not clear and can be easily misinterpreted. Avi will update this section to: "When TQAVCTRL.TRANSMIT_MODE = TSN, the latency between transmission scheduling and the time the packet is transmitted to the network is listed in Table 7-61." Fix this issue by using igc_tsn_is_tx_mode_in_tsn() as a condition to write to gtxoffset, aligning with the newly updated SW User Manual. Tested: 1. Enrol taprio on talker board base-time 0 cycle-time 1000000 flags 0x2 index 0 cmd S gatemask 0x1 interval1 index 0 cmd S gatemask 0x1 interval2 Note: interval1 = interval for a 64 bytes packet to go through interval2 = cycle-time - interval1 2. Take tcpdump on listener board 3. Use udp tai app on talker to send packets to listener 4. Check the timestamp on listener via wireshark Test Result: 100 Mbps: 113 ~193 ns 1000 Mbps: 52 ~ 84 ns 2500 Mbps: 95 ~ 223 ns Note that the test result is similar to the patch "igc: Correct the launchtime offset". Fixes: 790835fcc0cb ("igc: Correct the launchtime offset") Signed-off-by: Faizal Rahim Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_tsn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index ada751430517..d68fa7f3d5f0 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -61,7 +61,7 @@ void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter) struct igc_hw *hw = &adapter->hw; u16 txoffset; - if (!is_any_launchtime(adapter)) + if (!igc_tsn_is_tx_mode_in_tsn(adapter)) return; switch (adapter->link_speed) { From 541b80216cd1d511841c3c8d9559a7b13f4f79f2 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 Jul 2024 17:20:48 +0200 Subject: [PATCH 0603/1052] Bluetooth: hci_qca: don't call pwrseq_power_off() twice for QCA6390 Now that we call pwrseq_power_off() for all models that hold a valid power sequencing handle, we can remove the switch case for QCA_6390. The switch will now use the default label for this model but that's fine: if it has the BT-enable GPIO then we should use it.
Fixes: eba1718717b0 ("Bluetooth: hci_qca: make pwrseq calls the default if available") Signed-off-by: Bartosz Golaszewski Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index ca6466676902..a20dd5015346 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -2187,10 +2187,6 @@ static void qca_power_shutdown(struct hci_uart *hu) } break; - case QCA_QCA6390: - pwrseq_power_off(qcadev->bt_power->pwrseq); - break; - default: gpiod_set_value_cansleep(qcadev->bt_en, 0); } From f3660957303b822e5dd6e10fe9e1e19afc6d33de Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 Jul 2024 17:20:49 +0200 Subject: [PATCH 0604/1052] Bluetooth: hci_qca: fix QCA6390 support on non-DT platforms QCA6390 can also be used on non-DT systems so we must not make the power sequencing the only option. Check if the serdev device consumes an OF node. If so: honor the new contract as per the DT bindings. If not: fall back to the previous behavior by falling through to the existing default label. Fixes: 9a15ce685706 ("Bluetooth: qca: use the power sequencer for QCA6390") Reported-by: Wren Turkal Closes: https://lore.kernel.org/linux-bluetooth/27e6a6c5-fb63-4219-be0b-eefa2c116e06@penguintechs.org/ Signed-off-by: Bartosz Golaszewski Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index a20dd5015346..2baed7d0f479 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -2412,11 +2412,14 @@ static int qca_serdev_probe(struct serdev_device *serdev) break; case QCA_QCA6390: - qcadev->bt_power->pwrseq = devm_pwrseq_get(&serdev->dev, - "bluetooth"); - if (IS_ERR(qcadev->bt_power->pwrseq)) - return PTR_ERR(qcadev->bt_power->pwrseq); - break; + if (dev_of_node(&serdev->dev)) { + qcadev->bt_power->pwrseq = devm_pwrseq_get(&serdev->dev, + "bluetooth"); + if (IS_ERR(qcadev->bt_power->pwrseq)) + return PTR_ERR(qcadev->bt_power->pwrseq); + break; + } + fallthrough; default: qcadev->bt_en = devm_gpiod_get_optional(&serdev->dev, "enable", From e1d28be268cfe978e19a18a3a9b37a4a7d37745e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 Jul 2024 17:20:50 +0200 Subject: [PATCH 0605/1052] Bluetooth: hci_qca: fix a NULL-pointer dereference at shutdown Unlike qca_regulator_init(), qca_power_shutdown() may be called for QCA_ROME which does not have qcadev->bt_power assigned. Add a NULL-pointer check before dereferencing the struct qca_power pointer.
Fixes: eba1718717b0 ("Bluetooth: hci_qca: make pwrseq calls the default if available") Reported-by: Dmitry Baryshkov Closes: https://lore.kernel.org/linux-bluetooth/su3wp6s44hrxf4ijvsdfzbvv4unu4ycb7kkvwbx6ltdafkldir@4g7ydqm2ap5j/ Signed-off-by: Bartosz Golaszewski Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 2baed7d0f479..45adc1560d94 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -2160,7 +2160,7 @@ static void qca_power_shutdown(struct hci_uart *hu) qcadev = serdev_device_get_drvdata(hu->serdev); power = qcadev->bt_power; - if (power->pwrseq) { + if (power && power->pwrseq) { pwrseq_power_off(power->pwrseq); set_bit(QCA_BT_OFF, &qca->flags); return; From c531e63871c0b50c8c4e62c048535a08886fba3e Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 31 Jul 2024 12:19:36 +0300 Subject: [PATCH 0606/1052] Bluetooth: l2cap: always unlock channel in l2cap_conless_channel() Add missing call to 'l2cap_chan_unlock()' on receive error handling path in 'l2cap_conless_channel()'. Fixes: a24cce144b98 ("Bluetooth: Fix reference counting of global L2CAP channels") Reported-by: syzbot+45ac74737e866894acb0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=45ac74737e866894acb0 Signed-off-by: Dmitry Antipov Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index c3c26bbb5dda..9988ba382b68 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6774,6 +6774,7 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, bt_cb(skb)->l2cap.psm = psm; if (!chan->ops->recv(chan, skb)) { + l2cap_chan_unlock(chan); l2cap_chan_put(chan); return; } From b5431dc2803ac159d6d4645ae237d15c3cb252db Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Mon, 29 Jul 2024 21:58:10 +0200 Subject: [PATCH 0607/1052] Bluetooth: hci_sync: avoid dup filtering when passive scanning with adv monitor This restores behaviour (including the comment) from now-removed hci_request.c, and also matches existing code for active scanning. Without this, the duplicates filter is always active when passive scanning, which makes it impossible to work with devices that send nontrivial dynamic data in their advertisement reports. Fixes: abfeea476c68 ("Bluetooth: hci_sync: Convert MGMT_OP_START_DISCOVERY") Signed-off-by: Anton Khirnov Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index a31d39a821f4..e79cd40bd079 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3019,6 +3019,20 @@ static int hci_passive_scan_sync(struct hci_dev *hdev) } else if (hci_is_adv_monitoring(hdev)) { window = hdev->le_scan_window_adv_monitor; interval = hdev->le_scan_int_adv_monitor; + + /* Disable duplicates filter when scanning for advertisement + * monitor for the following reasons. + * + * For HW pattern filtering (ex. MSFT), Realtek and Qualcomm + * controllers ignore RSSI_Sampling_Period when the duplicates + * filter is enabled. 
+ * + * For SW pattern filtering, when we're not doing interleaved + * scanning, it is necessary to disable duplicates filter, + * otherwise hosts can only receive one advertisement and it's + * impossible to know if a peer is still in range. + */ + filter_dups = LE_SCAN_FILTER_DUP_DISABLE; } else { window = hdev->le_scan_window; interval = hdev->le_scan_interval; From 11893e144ed75be55d99349760513ca104781fc0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 Aug 2024 15:06:45 -0600 Subject: [PATCH 0608/1052] io_uring/net: ensure expanded bundle recv gets marked for cleanup If the iovec inside the kmsg isn't already allocated AND one gets expanded beyond the fixed size, then the request may not already have been marked for cleanup. Ensure that it is. Cc: stable@vger.kernel.org Fixes: 2f9c9515bdfd ("io_uring/net: support bundles for recv") Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index 594490a1389b..97a48408cec3 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1094,6 +1094,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { kmsg->free_iov_nr = ret; kmsg->free_iov = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; } } else { void __user *buf; From 70ed519ed59da3a92c3acedeb84a30e5a66051ce Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 Aug 2024 15:08:17 -0600 Subject: [PATCH 0609/1052] io_uring/net: ensure expanded bundle send gets marked for cleanup If the iovec inside the kmsg isn't already allocated AND one gets expanded beyond the fixed size, then the request may not already have been marked for cleanup. Ensure that it is. Cc: stable@vger.kernel.org Fixes: a05d1f625c7a ("io_uring/net: support bundles for send") Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index 97a48408cec3..050bea5e7256 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -623,6 +623,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { kmsg->free_iov_nr = ret; kmsg->free_iov = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; } } From 8fe8ac24adcd76b12edbfdefa078567bfff117d4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 Aug 2024 15:09:33 -0600 Subject: [PATCH 0610/1052] io_uring/net: don't pick multiple buffers for non-bundle send If a send is issued marked with IOSQE_BUFFER_SELECT for selecting a buffer, unless it's a bundle, it should not select multiple buffers. 
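For context, the bundle vs. non-bundle send distinction this fix enforces looks roughly like the liburing sketch below. This is an illustrative fragment, not taken from the patch; it assumes 'ring' is an initialized io_uring instance and that buffer group 0 has already been registered (for example via io_uring_setup_buf_ring()).

  #include <liburing.h>

  static void queue_sends(struct io_uring *ring, int sockfd)
  {
          struct io_uring_sqe *sqe;

          /* Plain buffer-select send: must consume exactly one
           * provided buffer from group 0.
           */
          sqe = io_uring_get_sqe(ring);
          io_uring_prep_send(sqe, sockfd, NULL, 0, 0);
          sqe->flags |= IOSQE_BUFFER_SELECT;
          sqe->buf_group = 0;

          /* Bundle send: may consume several buffers from group 0
           * in a single SQE.
           */
          sqe = io_uring_get_sqe(ring);
          io_uring_prep_send(sqe, sockfd, NULL, 0, 0);
          sqe->flags |= IOSQE_BUFFER_SELECT;
          sqe->buf_group = 0;
          sqe->ioprio |= IORING_RECVSEND_BUNDLE;

          io_uring_submit(ring);
  }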
Cc: stable@vger.kernel.org Fixes: a05d1f625c7a ("io_uring/net: support bundles for send") Signed-off-by: Jens Axboe --- io_uring/net.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 050bea5e7256..d08abcca89cc 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -601,17 +601,18 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) .iovs = &kmsg->fast_iov, .max_len = INT_MAX, .nr_iovs = 1, - .mode = KBUF_MODE_EXPAND, }; if (kmsg->free_iov) { arg.nr_iovs = kmsg->free_iov_nr; arg.iovs = kmsg->free_iov; - arg.mode |= KBUF_MODE_FREE; + arg.mode = KBUF_MODE_FREE; } if (!(sr->flags & IORING_RECVSEND_BUNDLE)) arg.nr_iovs = 1; + else + arg.mode |= KBUF_MODE_EXPAND; ret = io_buffers_select(req, &arg, issue_flags); if (unlikely(ret < 0)) From b1560408692cd0ab0370cfbe9deb03ce97ab3f6d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 30 Jul 2024 11:06:57 -0400 Subject: [PATCH 0611/1052] tracing: Have format file honor EVENT_FILE_FL_FREED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When eventfs was introduced, special care had to be done to coordinate the freeing of the file meta data with the files that are exposed to user space. The file meta data would have a ref count that is set when the file is created and would be decremented and freed after the last user that opened the file closed it. When the file meta data was to be freed, it would set a flag (EVENT_FILE_FL_FREED) to denote that the file is freed, and any new references made (like new opens or reads) would fail as it is marked freed. This allowed other meta data to be freed after this flag was set (under the event_mutex). All the files that were dynamically created in the events directory had a pointer to the file meta data and would call event_release() when the last reference to the user space file was closed. This would be the time that it is safe to free the file meta data. A shortcut was made for the "format" file. It's i_private would point to the "call" entry directly and not point to the file's meta data. This is because all format files are the same for the same "call", so it was thought there was no reason to differentiate them. The other files maintain state (like the "enable", "trigger", etc). But this meant if the file were to disappear, the "format" file would be unaware of it. This caused a race that could be trigger via the user_events test (that would create dynamic events and free them), and running a loop that would read the user_events format files: In one console run: # cd tools/testing/selftests/user_events # while true; do ./ftrace_test; done And in another console run: # cd /sys/kernel/tracing/ # while true; do cat events/user_events/__test_event/format; done 2>/dev/null With KASAN memory checking, it would trigger a use-after-free bug report (which was a real bug). This was because the format file was not checking the file's meta data flag "EVENT_FILE_FL_FREED", so it would access the event that the file meta data pointed to after the event was freed. After inspection, there are other locations that were found to not check the EVENT_FILE_FL_FREED flag when accessing the trace_event_file. Add a new helper function: event_file_file() that will make sure that the event_mutex is held, and will return NULL if the trace_event_file has the EVENT_FILE_FL_FREED flag set. Have the first reference of the struct file pointer use event_file_file() and check for NULL. 
Later uses can still use the event_file_data() helper function if the event_mutex is still held and was not released since the event_file_file() call. Link: https://lore.kernel.org/all/20240719204701.1605950-1-minipli@grsecurity.net/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Ajay Kaher Cc: Ilkka Naulapää Cc: Linus Torvalds Cc: Al Viro Cc: Dan Carpenter Cc: Beau Belgrave Cc: Florian Fainelli Cc: Alexey Makhalov Cc: Vasavi Sirnapalli Link: https://lore.kernel.org/20240730110657.3b69d3c1@gandalf.local.home Fixes: b63db58e2fa5d ("eventfs/tracing: Add callback for release of an eventfs_inode") Reported-by: Mathias Krause Tested-by: Mathias Krause Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 23 ++++++++++++++++++++ kernel/trace/trace_events.c | 33 +++++++++++++++++------------ kernel/trace/trace_events_hist.c | 4 ++-- kernel/trace/trace_events_inject.c | 2 +- kernel/trace/trace_events_trigger.c | 6 +++--- 5 files changed, 49 insertions(+), 19 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8783bebd0562..bd3e3069300e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1634,6 +1634,29 @@ static inline void *event_file_data(struct file *filp) extern struct mutex event_mutex; extern struct list_head ftrace_events; +/* + * When the trace_event_file is the filp->i_private pointer, + * it must be taken under the event_mutex lock, and then checked + * if the EVENT_FILE_FL_FREED flag is set. If it is, then the + * data pointed to by the trace_event_file can not be trusted. + * + * Use the event_file_file() to access the trace_event_file from + * the filp the first time under the event_mutex and check for + * NULL. If it is needed to be retrieved again and the event_mutex + * is still held, then the event_file_data() can be used and it + * is guaranteed to be valid. 
+ */ +static inline struct trace_event_file *event_file_file(struct file *filp) +{ + struct trace_event_file *file; + + lockdep_assert_held(&event_mutex); + file = READ_ONCE(file_inode(filp)->i_private); + if (!file || file->flags & EVENT_FILE_FL_FREED) + return NULL; + return file; +} + extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; extern const struct file_operations event_hist_debug_fops; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6ef29eba90ce..f08fbaf8cad6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1386,12 +1386,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, char buf[4] = "0"; mutex_lock(&event_mutex); - file = event_file_data(filp); + file = event_file_file(filp); if (likely(file)) flags = file->flags; mutex_unlock(&event_mutex); - if (!file || flags & EVENT_FILE_FL_FREED) + if (!file) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && @@ -1424,8 +1424,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 1: ret = -ENODEV; mutex_lock(&event_mutex); - file = event_file_data(filp); - if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { + file = event_file_file(filp); + if (likely(file)) { ret = tracing_update_buffers(file->tr); if (ret < 0) { mutex_unlock(&event_mutex); @@ -1540,7 +1540,8 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_event_call *call = event_file_data(m->private); + struct trace_event_file *file = event_file_data(m->private); + struct trace_event_call *call = file->event_call; struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; @@ -1572,7 +1573,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) static int f_show(struct seq_file *m, void *v) { - struct trace_event_call *call = event_file_data(m->private); + struct trace_event_file *file = event_file_data(m->private); + struct trace_event_call *call = file->event_call; struct ftrace_event_field *field; const char *array_descriptor; @@ -1627,12 +1629,14 @@ static int f_show(struct seq_file *m, void *v) static void *f_start(struct seq_file *m, loff_t *pos) { + struct trace_event_file *file; void *p = (void *)FORMAT_HEADER; loff_t l = 0; /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); - if (!event_file_data(m->private)) + file = event_file_file(m->private); + if (!file) return ERR_PTR(-ENODEV); while (l < *pos && p) @@ -1706,8 +1710,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); mutex_lock(&event_mutex); - file = event_file_data(filp); - if (file && !(file->flags & EVENT_FILE_FL_FREED)) + file = event_file_file(filp); + if (file) print_event_filter(file, s); mutex_unlock(&event_mutex); @@ -1736,9 +1740,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return PTR_ERR(buf); mutex_lock(&event_mutex); - file = event_file_data(filp); - if (file) - err = apply_event_filter(file, buf); + file = event_file_file(filp); + if (file) { + if (file->flags & EVENT_FILE_FL_FREED) + err = -ENODEV; + else + err = apply_event_filter(file, buf); + } mutex_unlock(&event_mutex); kfree(buf); @@ -2485,7 +2493,6 @@ static int event_callback(const char *name, umode_t *mode, void **data, if (strcmp(name, "format") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_event_format_fops; - *data = call; return 1; } diff --git 
a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6ece1308d36a..5f9119eb7c67 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5601,7 +5601,7 @@ static int hist_show(struct seq_file *m, void *v) mutex_lock(&event_mutex); - event_file = event_file_data(m->private); + event_file = event_file_file(m->private); if (unlikely(!event_file)) { ret = -ENODEV; goto out_unlock; @@ -5880,7 +5880,7 @@ static int hist_debug_show(struct seq_file *m, void *v) mutex_lock(&event_mutex); - event_file = event_file_data(m->private); + event_file = event_file_file(m->private); if (unlikely(!event_file)) { ret = -ENODEV; goto out_unlock; diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index 8650562bdaa9..a8f076809db4 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt, strim(buf); mutex_lock(&event_mutex); - file = event_file_data(filp); + file = event_file_file(filp); if (file) { call = file->event_call; size = parse_entry(buf, call, &entry); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 4bec043c8690..a5e3d6acf1e1 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos) /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); - event_file = event_file_data(m->private); + event_file = event_file_file(m->private); if (unlikely(!event_file)) return ERR_PTR(-ENODEV); @@ -213,7 +213,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) mutex_lock(&event_mutex); - if (unlikely(!event_file_data(file))) { + if (unlikely(!event_file_file(file))) { mutex_unlock(&event_mutex); return -ENODEV; } @@ -293,7 +293,7 @@ static ssize_t event_trigger_regex_write(struct file *file, strim(buf); mutex_lock(&event_mutex); - event_file = event_file_data(file); + event_file = event_file_file(file); if (unlikely(!event_file)) { mutex_unlock(&event_mutex); kfree(buf); From 6e2fdceffdc6bd7b8ba314a1d1b976721533c8f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 26 Jul 2024 14:42:08 -0400 Subject: [PATCH 0612/1052] tracing: Use refcount for trace_event_file reference counter Instead of using an atomic counter for the trace_event_file reference counter, use the refcount interface. It has various checks to make sure the reference counting is correct, and will warn if it detects an error (like refcount_inc() on '0'). Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240726144208.687cce24@rorschach.local.home Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- kernel/trace/trace_events.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9df3e2973626..fed58e54f15e 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -680,7 +680,7 @@ struct trace_event_file { * caching and such. 
Which is mostly OK ;-) */ unsigned long flags; - atomic_t ref; /* ref count for opened files */ + refcount_t ref; /* ref count for opened files */ atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f08fbaf8cad6..7266ec2a4eea 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -992,18 +992,18 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) void event_file_get(struct trace_event_file *file) { - atomic_inc(&file->ref); + refcount_inc(&file->ref); } void event_file_put(struct trace_event_file *file) { - if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (WARN_ON_ONCE(!refcount_read(&file->ref))) { if (file->flags & EVENT_FILE_FL_FREED) kmem_cache_free(file_cachep, file); return; } - if (atomic_dec_and_test(&file->ref)) { + if (refcount_dec_and_test(&file->ref)) { /* Count should only go to zero when it is freed */ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) return; @@ -3003,7 +3003,7 @@ trace_create_new_event(struct trace_event_call *call, atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); - event_file_get(file); + refcount_set(&file->ref, 1); return file; } From f2aaed194a54d78c307c44d1829c7e1ba67e9ba5 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 4 Dec 2023 16:35:04 -0500 Subject: [PATCH 0613/1052] drm/amd/display: Replace dm_execute_dmub_cmd with dc_wake_and_execute_dmub_cmd Commit c2cec7a872b6 ("drm/amd/display: Wake DMCUB before sending a command for replay feature") replaced dm_execute_dmub_cmd with dc_wake_and_execute_dmub_cmd in multiple areas, but due to merge issues the replacement in dmub_replay_copy_settings was missed. This commit replaces the old dm_execute_dmub_cmd with dc_wake_and_execute_dmub_cmd. Fixes: 3601a35a2e9d ("drm/amd/display: Wake DMCUB before sending a command for replay feature") Reviewed-by: Aurabindo Pillai Tested-by: Daniel Wheeler Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher (cherry picked from commit 6cc213b9aa34bc3213e20f9256345c5cc1495b0b) --- drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c index 2a21bcf5224f..4d960dc5ce89 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c @@ -185,8 +185,7 @@ static bool dmub_replay_copy_settings(struct dmub_replay *dmub, else copy_settings_data->flags.bitfields.force_wakeup_by_tps3 = 0; - - dm_execute_dmub_cmd(dc, &cmd, DM_DMUB_WAIT_TYPE_WAIT); + dc_wake_and_execute_dmub_cmd(dc, &cmd, DM_DMUB_WAIT_TYPE_WAIT); return true; } From 4df19b14f6311c860223f349356da2c08ae92101 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 22 Jul 2024 16:53:42 -0600 Subject: [PATCH 0614/1052] drm/amd/display: Add missing DET segments programming The commit 5034b935f62a ("drm/amd/display: Modify DHCUB waterwark structures and functions") introduced a code refactor for DCHUB, but during the merge process into amd-staging-drm-next, the DET segment programming was removed. This commit adds the DET segment programming for DCN35.
Fixes: 5034b935f62a ("drm/amd/display: Modify DHCUB waterwark structures and functions") Reviewed-by: Aurabindo Pillai Tested-by: Daniel Wheeler Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher (cherry picked from commit 675d9ac9d0de765531e94f9fdc536989a997a324) --- drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index e4f7078c1026..f115c7a285e7 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -771,6 +771,8 @@ void dcn35_init_pipes(struct dc *dc, struct dc_state *context) if (hubbub && hubp) { if (hubbub->funcs->program_det_size) hubbub->funcs->program_det_size(hubbub, hubp->inst, 0); + if (hubbub->funcs->program_det_segments) + hubbub->funcs->program_det_segments(hubbub, hubp->inst, 0); } } From 437cf8bb0e1a56fa0491610706ddafd04b3b1a9b Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 22 Jul 2024 20:40:22 -0600 Subject: [PATCH 0615/1052] drm/amd/display: Add dcc propagation value Initialize the field dcc_meta_propagation_delay_us with 10 ms. Reviewed-by: Aurabindo Pillai Tested-by: Daniel Wheeler Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher (cherry picked from commit 74bad61c5d83f5af8a855c8b7dc8e20377c74d46) --- drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c index a05a2209a44e..34b02147881d 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c @@ -723,6 +723,7 @@ static const struct dc_debug_options debug_defaults_drv = { .min_prefetch_in_strobe_ns = 60000, // 60us .disable_unbounded_requesting = false, .enable_legacy_fast_update = false, + .dcc_meta_propagation_delay_us = 10, .fams2_config = { .bits = { .enable = true, From eb880ffddd5da8a014669deaf7bb3e7e9ecd06f4 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 22 Jul 2024 20:33:40 -0600 Subject: [PATCH 0616/1052] drm/amd/display: Add missing mcache registers Add missing register programming for mcache in DCN401. 
Reviewed-by: Aurabindo Pillai Tested-by: Daniel Wheeler Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher (cherry picked from commit a00a177055cced5cd2bb057a1ace9a95a286bc49) --- .../gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.h b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.h index 26efeada4f41..bb46f30d11d0 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.h +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.h @@ -138,7 +138,9 @@ void dcn401_prepare_mcache_programming(struct dc *dc, struct dc_state *context); SRI_ARR(DCHUBP_MALL_CONFIG, HUBP, id), \ SRI_ARR(DCHUBP_VMPG_CONFIG, HUBP, id), \ SRI_ARR(UCLK_PSTATE_FORCE, HUBPREQ, id), \ - HUBP_3DLUT_FL_REG_LIST_DCN401(id) + HUBP_3DLUT_FL_REG_LIST_DCN401(id), \ + SRI_ARR(DCSURF_VIEWPORT_MCACHE_SPLIT_COORDINATE, HUBP, id), \ + SRI_ARR(DCHUBP_MCACHEID_CONFIG, HUBP, id) /* ABM */ #define ABM_DCN401_REG_LIST_RI(id) \ From 5f142b3826a0d223e947501fa9fe4ca912d9db26 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Mon, 29 Jul 2024 09:24:20 +0800 Subject: [PATCH 0617/1052] drm/amd/pm: update powerplay structure on smu v14.0.2/3 update powerplay structure on smu v14.0.2/3 Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit f905d0c328b440fabaaf265350bf4187ccd5f59b) --- .../amd/pm/swsmu/inc/smu_v14_0_2_pptable.h | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0_2_pptable.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0_2_pptable.h index 4a3fde89aed7..75c921e87360 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0_2_pptable.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0_2_pptable.h @@ -27,7 +27,8 @@ #pragma pack(push, 1) -#define SMU_14_0_2_TABLE_FORMAT_REVISION 3 +#define SMU_14_0_2_TABLE_FORMAT_REVISION 23 +#define SMU_14_0_2_CUSTOM_TABLE_FORMAT_REVISION 1 // POWERPLAYTABLE::ulPlatformCaps #define SMU_14_0_2_PP_PLATFORM_CAP_POWERPLAY 0x1 // This cap indicates whether CCC need to show Powerplay page. @@ -43,6 +44,7 @@ #define SMU_14_0_2_PP_THERMALCONTROLLER_NONE 0 #define SMU_14_0_2_PP_OVERDRIVE_VERSION 0x1 // TODO: FIX OverDrive Version TBD +#define SMU_14_0_2_PP_CUSTOM_OVERDRIVE_VERSION 0x1 #define SMU_14_0_2_PP_POWERSAVINGCLOCK_VERSION 0x01 // Power Saving Clock Table Version 1.00 enum SMU_14_0_2_OD_SW_FEATURE_CAP @@ -107,6 +109,7 @@ enum SMU_14_0_2_PWRMODE_SETTING SMU_14_0_2_PMSETTING_ACOUSTIC_LIMIT_RPM_BALANCE, SMU_14_0_2_PMSETTING_ACOUSTIC_LIMIT_RPM_TURBO, SMU_14_0_2_PMSETTING_ACOUSTIC_LIMIT_RPM_RAGE, + SMU_14_0_2_PMSETTING_COUNT }; #define SMU_14_0_2_MAX_PMSETTING 32 // Maximum Number of PowerMode Settings @@ -127,17 +130,24 @@ struct smu_14_0_2_overdrive_table int16_t pm_setting[SMU_14_0_2_MAX_PMSETTING]; // Optimized power mode feature settings }; +enum smu_14_0_3_pptable_source { + PPTABLE_SOURCE_IFWI = 0, + PPTABLE_SOURCE_DRIVER_HARDCODED = 1, + PPTABLE_SOURCE_PPGEN_REGISTRY = 2, + PPTABLE_SOURCE_MAX = PPTABLE_SOURCE_PPGEN_REGISTRY, +}; + struct smu_14_0_2_powerplay_table { struct atom_common_table_header header; // header.format_revision = 3 (HAS TO MATCH SMU_14_0_2_TABLE_FORMAT_REVISION), header.content_revision = ? structuresize is calculated by PPGen. 
uint8_t table_revision; // PPGen use only: table_revision = 3 - uint8_t padding; // Padding 1 byte to align table_size offset to 6 bytes (pmfw_start_offset, for PMFW to know the starting offset of PPTable_t). + uint8_t pptable_source; // PPGen UI dropdown box uint16_t pmfw_pptable_start_offset; // The start offset of the pmfw portion. i.e. start of PPTable_t (start of SkuTable_t) uint16_t pmfw_pptable_size; // The total size of pmfw_pptable, i.e PPTable_t. - uint16_t pmfw_pfe_table_start_offset; // The start offset of the PFE_Settings_t within pmfw_pptable. - uint16_t pmfw_pfe_table_size; // The size of PFE_Settings_t. - uint16_t pmfw_board_table_start_offset; // The start offset of the BoardTable_t within pmfw_pptable. - uint16_t pmfw_board_table_size; // The size of BoardTable_t. + uint16_t pmfw_sku_table_start_offset; // DO NOT CHANGE ORDER; The absolute start offset of the SkuTable_t (within smu_14_0_3_powerplay_table). + uint16_t pmfw_sku_table_size; // DO NOT CHANGE ORDER; The size of SkuTable_t. + uint16_t pmfw_board_table_start_offset; // The start offset of the BoardTable_t + uint16_t pmfw_board_table_size; // The size of BoardTable_t. uint16_t pmfw_custom_sku_table_start_offset; // The start offset of the CustomSkuTable_t within pmfw_pptable. uint16_t pmfw_custom_sku_table_size; // The size of the CustomSkuTable_t. uint32_t golden_pp_id; // PPGen use only: PP Table ID on the Golden Data Base @@ -159,6 +169,36 @@ struct smu_14_0_2_powerplay_table PPTable_t smc_pptable; // PPTable_t in driver_if.h -- as requested by PMFW, this offset should start at a 32-byte boundary, and the table_size above should remain at offset=6 bytes }; +enum SMU_14_0_2_CUSTOM_OD_SW_FEATURE_CAP { + SMU_14_0_2_CUSTOM_ODCAP_POWER_MODE = 0, + SMU_14_0_2_CUSTOM_ODCAP_COUNT +}; + +enum SMU_14_0_2_CUSTOM_OD_FEATURE_SETTING_ID { + SMU_14_0_2_CUSTOM_ODSETTING_POWER_MODE = 0, + SMU_14_0_2_CUSTOM_ODSETTING_COUNT, +}; + +struct smu_14_0_2_custom_overdrive_table { + uint8_t revision; + uint8_t reserve[3]; + uint8_t cap[SMU_14_0_2_CUSTOM_ODCAP_COUNT]; + int32_t max[SMU_14_0_2_CUSTOM_ODSETTING_COUNT]; + int32_t min[SMU_14_0_2_CUSTOM_ODSETTING_COUNT]; + int16_t pm_setting[SMU_14_0_2_PMSETTING_COUNT]; +}; + +struct smu_14_0_3_custom_powerplay_table { + uint8_t custom_table_revision; + uint16_t custom_table_size; + uint16_t custom_sku_table_offset; + uint32_t custom_platform_caps; + uint16_t software_shutdown_temp; + struct smu_14_0_2_custom_overdrive_table custom_overdrive_table; + uint32_t reserve[8]; + CustomSkuTable_t custom_sku_table_pmfw; +}; + #pragma pack(pop) #endif From aa5c9701ebd654284c55eba30d0a38eec49f2946 Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Wed, 31 Jul 2024 11:58:46 +0800 Subject: [PATCH 0618/1052] drm/amdgpu: force to use legacy inv in mmhub MMHUB v4.1.0 only support fixed cache mode, so only use legacy invalidation accordingly. 
Signed-off-by: Likun Gao Reviewed-by: Frank Min Signed-off-by: Alex Deucher (cherry picked from commit 9192c7613ca53572908ba23a4c3f39c7f8ba8021) --- drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c index 5bbaa2b2caab..0fbc3be81f14 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c @@ -80,7 +80,8 @@ static uint32_t mmhub_v4_1_0_get_invalidate_req(unsigned int vmid, /* invalidate using legacy mode on vmid*/ req = REG_SET_FIELD(req, MMVM_INVALIDATE_ENG0_REQ, PER_VMID_INVALIDATE_REQ, 1 << vmid); - req = REG_SET_FIELD(req, MMVM_INVALIDATE_ENG0_REQ, FLUSH_TYPE, flush_type); + /* Only use legacy inv on mmhub side */ + req = REG_SET_FIELD(req, MMVM_INVALIDATE_ENG0_REQ, FLUSH_TYPE, 0); req = REG_SET_FIELD(req, MMVM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PTES, 1); req = REG_SET_FIELD(req, MMVM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE0, 1); req = REG_SET_FIELD(req, MMVM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE1, 1); From 07cd40a0c9843653451f9355170770f6e42489c8 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Wed, 24 Jul 2024 09:29:13 -0600 Subject: [PATCH 0619/1052] drm/amd/display: Add missing DCN314 to the DML Makefile Include display_mode_vba_314 and display_rq_dlg_calc_314 to the dml Makefile. Acked-by: Tom Chung Signed-off-by: Rodrigo Siqueira Signed-off-by: Wayne Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 089525632d40bbfa507f224c20563529b3f8a4b3) --- drivers/gpu/drm/amd/display/dc/dml/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile index 3c0222aa4df1..46f9c05de16e 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -83,6 +83,8 @@ CFLAGS_REMOVE_$(AMDDALPATH)/dc/dml/dcn31/display_rq_dlg_calc_31.o := $(dml_rcfla CFLAGS_REMOVE_$(AMDDALPATH)/dc/dml/dcn32/display_mode_vba_32.o := $(dml_rcflags) CFLAGS_REMOVE_$(AMDDALPATH)/dc/dml/dcn32/display_rq_dlg_calc_32.o := $(dml_rcflags) CFLAGS_REMOVE_$(AMDDALPATH)/dc/dml/dcn32/display_mode_vba_util_32.o := $(dml_rcflags) +CFLAGS_REMOVE_$(AMDDALPATH)/dc/dml/dcn314/display_mode_vba_314.o := $(dml_rcflags) +CFLAGS_REMOVE_$(AMDDALPATH)/dc/dml/dcn314/display_rq_dlg_calc_314.o := $(dml_rcflags) CFLAGS_REMOVE_$(AMDDALPATH)/dc/dml/dcn301/dcn301_fpu.o := $(dml_rcflags) CFLAGS_REMOVE_$(AMDDALPATH)/dc/dml/display_mode_lib.o := $(dml_rcflags) CFLAGS_REMOVE_$(AMDDALPATH)/dc/dml/dsc/rc_calc_fpu.o := $(dml_rcflags) From e8097cf1ce9e7ad8516ee95f06f7baaa31506035 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 25 Jul 2024 16:41:38 -0600 Subject: [PATCH 0620/1052] drm/amd/display: Add missing program DET segment call to pipe init Add a callback that program the DET segment when initializing pipes. 
Acked-by: Tom Chung Signed-off-by: Rodrigo Siqueira Signed-off-by: Wayne Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit e1dbe625d6ac2821eb29e087db46cb539d8079f0) --- drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c index e06fc370267b..ff03b1d98aa7 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c @@ -1402,6 +1402,8 @@ void dcn10_init_pipes(struct dc *dc, struct dc_state *context) if (hubbub && hubp) { if (hubbub->funcs->program_det_size) hubbub->funcs->program_det_size(hubbub, hubp->inst, 0); + if (hubbub->funcs->program_det_segments) + hubbub->funcs->program_det_segments(hubbub, hubp->inst, 0); } } From d507ae0dc83b7f43cdf6760b8f1a30aac4fc405a Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Fri, 2 Aug 2024 11:13:19 +0530 Subject: [PATCH 0621/1052] drm/buddy: Add start address support to trim function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit db65eb46de135338d6177f8853e0fd208f19d63e) --- drivers/gpu/drm/drm_buddy.c | 25 +++++++++++++++++++++++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 6a8e45e9d0ec..103c185bb1c8 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. 
*/ int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(&block->link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(&block->tmp_link, &dfs); err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, + NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks)) size = vres->base.size; } diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index 2a74fa9d0ce5..9689a7c5dd36 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -27,6 +27,7 @@ #define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) #define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) struct drm_buddy_block { #define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) @@ -155,6 +156,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, unsigned long flags); int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, u64 new_size, struct list_head *blocks); From 8ff3bb44cc94b74ebd57fe3be9dedb98dbf92771 Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Thu, 1 Aug 2024 10:47:16 +0800 Subject: [PATCH 0622/1052] drm/amdgpu: add golden setting for gc v12 Adding Manual GDB golden setting for gc v12 revision 0 ASIC. 
Signed-off-by: Likun Gao Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit c9875d0a789060facc274dee0d4eb6500d471772) --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index f384be0d1800..506fa8003388 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -202,6 +202,12 @@ static const struct amdgpu_hwip_reg_entry gc_gfx_queue_reg_list_12[] = { SOC15_REG_ENTRY_STR(GC, 0, regCP_IB1_BUFSZ) }; +static const struct soc15_reg_golden golden_settings_gc_12_0[] = { + SOC15_REG_GOLDEN_VALUE(GC, 0, regDB_MEM_CONFIG, 0x0000000f, 0x0000000f), + SOC15_REG_GOLDEN_VALUE(GC, 0, regCB_HW_CONTROL_1, 0x03000000, 0x03000000), + SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL5, 0x00000070, 0x00000020) +}; + #define DEFAULT_SH_MEM_CONFIG \ ((SH_MEM_ADDRESS_MODE_64 << SH_MEM_CONFIG__ADDRESS_MODE__SHIFT) | \ (SH_MEM_ALIGNMENT_MODE_UNALIGNED << SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | \ @@ -3432,6 +3438,24 @@ static void gfx_v12_0_disable_gpa_mode(struct amdgpu_device *adev) WREG32_SOC15(GC, 0, regCPG_PSP_DEBUG, data); } +static void gfx_v12_0_init_golden_registers(struct amdgpu_device *adev) +{ + if (amdgpu_sriov_vf(adev)) + return; + + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): + if (adev->rev_id == 0) + soc15_program_register_sequence(adev, + golden_settings_gc_12_0, + (const u32)ARRAY_SIZE(golden_settings_gc_12_0)); + break; + default: + break; + } +} + static int gfx_v12_0_hw_init(void *handle) { int r; @@ -3472,6 +3496,9 @@ static int gfx_v12_0_hw_init(void *handle) } } + if (!amdgpu_emu_mode) + gfx_v12_0_init_golden_registers(adev); + adev->gfx.is_poweron = true; if (get_gb_addr_config(adev)) From 829798c789f567ef6ba4b084c15b7b5f3bd98d51 Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Thu, 7 Mar 2024 19:04:31 +0000 Subject: [PATCH 0623/1052] drm/amdgpu: Forward soft recovery errors to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As we discussed before[1], soft recovery should be forwarded to userspace, or we can get into a really bad state where apps will keep submitting hanging command buffers cascading us to a hard reset. 
1: https://lore.kernel.org/all/bf23d5ed-9a6b-43e7-84ee-8cbfd0d60f18@froggi.es/ Signed-off-by: Joshua Ashton Reviewed-by: Marek Olšák Signed-off-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 434967aadbbbe3ad9103cc29e9a327de20fdba01) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index e238f2832f65..908e13455152 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -264,9 +264,8 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job, struct dma_fence *fence = NULL; int r; - /* Ignore soft recovered fences here */ r = drm_sched_entity_error(s_entity); - if (r && r != -ENODATA) + if (r) goto error; if (!fence && job->gang_submit) From 5d687a67fda6389b9214815aa0d0adcc44302dc5 Mon Sep 17 00:00:00 2001 From: Frank Min Date: Fri, 2 Aug 2024 11:15:11 +0800 Subject: [PATCH 0624/1052] drm/amdgpu: change non-dcc buffer copy configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without setting the cpv bit and the 7th ib dw, non-dcc buffer copies will have random corruption. So set the cpv bit and clear the 7th ib dw for non-dcc buffer copies. Signed-off-by: Frank Min Acked-by: Christian König Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 5aacf8917fde5bc2a640f3cd49130c0e2e85e726) --- drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index 41b5e45697dc..7e4282609f51 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1575,8 +1575,7 @@ static void sdma_v7_0_emit_copy_buffer(struct amdgpu_ib *ib, ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) | SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) | SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 1 : 0) | - SDMA_PKT_COPY_LINEAR_HEADER_CPV((copy_flags & - (AMDGPU_COPY_FLAGS_READ_DECOMPRESSED | AMDGPU_COPY_FLAGS_WRITE_COMPRESSED)) ? 1 : 0); + SDMA_PKT_COPY_LINEAR_HEADER_CPV(1); ib->ptr[ib->length_dw++] = byte_count - 1; ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */ @@ -1590,6 +1589,8 @@ static void sdma_v7_0_emit_copy_buffer(struct amdgpu_ib *ib, ((copy_flags & AMDGPU_COPY_FLAGS_READ_DECOMPRESSED) ? SDMA_DCC_READ_CM(2) : 0) | ((copy_flags & AMDGPU_COPY_FLAGS_WRITE_COMPRESSED) ? SDMA_DCC_WRITE_CM(1) : 0) | SDMA_DCC_MAX_COM(max_com) | SDMA_DCC_MAX_UCOM(1); + else + ib->ptr[ib->length_dw++] = 0; } /** From 50e376f1fe3bf571d0645ddf48ad37eb58323919 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Fri, 12 Jul 2024 16:30:03 -0400 Subject: [PATCH 0625/1052] drm/amd/display: Skip Recompute DSC Params if no Stream on Link [why] Encounter NULL pointer dereference under mst + dsc setup. 
BUG: kernel NULL pointer dereference, address: 0000000000000008 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 4 PID: 917 Comm: sway Not tainted 6.3.9-arch1-1 #1 124dc55df4f5272ccb409f39ef4872fc2b3376a2 Hardware name: LENOVO 20NKS01Y00/20NKS01Y00, BIOS R12ET61W(1.31 ) 07/28/2022 RIP: 0010:drm_dp_atomic_find_time_slots+0x5e/0x260 [drm_display_helper] Code: 01 00 00 48 8b 85 60 05 00 00 48 63 80 88 00 00 00 3b 43 28 0f 8d 2e 01 00 00 48 8b 53 30 48 8d 04 80 48 8d 04 c2 48 8b 40 18 <48> 8> RSP: 0018:ffff960cc2df77d8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff8afb87e81280 RCX: 0000000000000224 RDX: ffff8afb9ee37c00 RSI: ffff8afb8da1a578 RDI: ffff8afb87e81280 RBP: ffff8afb83d67000 R08: 0000000000000001 R09: ffff8afb9652f850 R10: ffff960cc2df7908 R11: 0000000000000002 R12: 0000000000000000 R13: ffff8afb8d7688a0 R14: ffff8afb8da1a578 R15: 0000000000000224 FS: 00007f4dac35ce00(0000) GS:ffff8afe30b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000010ddc6000 CR4: 00000000003506e0 Call Trace: ? __die+0x23/0x70 ? page_fault_oops+0x171/0x4e0 ? plist_add+0xbe/0x100 ? exc_page_fault+0x7c/0x180 ? asm_exc_page_fault+0x26/0x30 ? drm_dp_atomic_find_time_slots+0x5e/0x260 [drm_display_helper 0e67723696438d8e02b741593dd50d80b44c2026] ? drm_dp_atomic_find_time_slots+0x28/0x260 [drm_display_helper 0e67723696438d8e02b741593dd50d80b44c2026] compute_mst_dsc_configs_for_link+0x2ff/0xa40 [amdgpu 62e600d2a75e9158e1cd0a243bdc8e6da040c054] ? fill_plane_buffer_attributes+0x419/0x510 [amdgpu 62e600d2a75e9158e1cd0a243bdc8e6da040c054] compute_mst_dsc_configs_for_state+0x1e1/0x250 [amdgpu 62e600d2a75e9158e1cd0a243bdc8e6da040c054] amdgpu_dm_atomic_check+0xecd/0x1190 [amdgpu 62e600d2a75e9158e1cd0a243bdc8e6da040c054] drm_atomic_check_only+0x5c5/0xa40 drm_mode_atomic_ioctl+0x76e/0xbc0 [how] dsc recompute should be skipped if no mode change detected on the new request. If detected, keep checking whether the stream is already on current state or not. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Rodrigo Siqueira Signed-off-by: Fangzhi Zuo Signed-off-by: Wayne Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 8151a6c13111b465dbabe07c19f572f7cbd16fef) --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 5442da90f508..915eb2c08ece 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -1270,6 +1270,9 @@ static bool is_dsc_need_re_compute( } } + if (new_stream_on_link_num == 0) + return false; + /* check current_state if there stream on link but it is not in * new request state */ From 4a5ad08f537703c35cf7cc29845381805c891d9b Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Sat, 3 Aug 2024 21:30:18 +0530 Subject: [PATCH 0626/1052] drm/amdgpu: Add address alignment support to DCC buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add address alignment support to the DCC VRAM buffers. v2: - adjust size based on the max_texture_channel_caches values only for GFX12 DCC buffers. - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only for DCC buffers. 
- roundup non power of two DCC buffer adjusted size to nearest power of two number as the buddy allocator does not support non power of two alignments. This applies only to the contiguous DCC buffers. v3:(Alex) - rewrite the max texture channel caches comparison code in an algorithmic way to determine the alignment size. v4:(Alex) - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c and add a new gmc func callback for dcc alignment. If the callback is non-NULL, call it to get the alignment, otherwise, use the default. v5:(Alex) - Set the Alignment to a default value if the callback doesn't exist. - Add the callback to amdgpu_gmc_funcs. v6: - Fix checkpatch warning reported by Intel CI. v7:(Christian) - remove the AMDGPU_GEM_CREATE_GFX12_DCC flag and keep a flag that checks the BO pinning and for a specific hw generation. v8:(Christian) - move this check into gmc_v12_0_get_dcc_alignment. v9: - Fix 32bit build errors Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König Reviewed-by: Frank Min Signed-off-by: Alex Deucher (cherry picked from commit aa94b623cb9233b91ed342dd87ecd62e56ff4938) --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 6 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 35 ++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 18 ++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index febca3130497..4d951a1baefa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs { uint64_t addr, uint64_t *flags); /* get the amount of memory used by the vbios for pre-OS console */ unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev); + /* get the DCC buffer alignment */ + unsigned int (*get_dcc_alignment)(struct amdgpu_device *adev); enum amdgpu_memory_partition (*query_mem_partition_mode)( struct amdgpu_device *adev); @@ -363,6 +365,10 @@ struct amdgpu_gmc { (adev)->gmc.gmc_funcs->override_vm_pte_flags \ ((adev), (vm), (addr), (pte_flags)) #define amdgpu_gmc_get_vbios_fb_size(adev) (adev)->gmc.gmc_funcs->get_vbios_fb_size((adev)) +#define amdgpu_gmc_get_dcc_alignment(adev) ({ \ + typeof(adev) _adev = (adev); \ + _adev->gmc.gmc_funcs->get_dcc_alignment(_adev); \ +}) /** * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index f91cc149d06c..b2c94f12da9e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -456,6 +456,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, u64 vis_usage = 0, max_bytes, min_block_size; struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; + unsigned int adjust_dcc_size = 0; struct drm_buddy *mm = &mgr->mm; struct drm_buddy_block *block; unsigned long pages_per_block; @@ -511,7 +512,18 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, /* Allocate blocks in desired range */ vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; + if (adev->gmc.gmc_funcs->get_dcc_alignment) + adjust_dcc_size = amdgpu_gmc_get_dcc_alignment(adev); + remaining_size = (u64)vres->base.size; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && adjust_dcc_size) { + unsigned int dcc_size; + + dcc_size = roundup_pow_of_two(vres->base.size + adjust_dcc_size); + remaining_size = (u64)dcc_size; + + 
vres->flags |= DRM_BUDDY_TRIM_DISABLE; + } mutex_lock(&mgr->lock); while (remaining_size) { @@ -521,8 +533,11 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size = mgr->default_page_size; size = remaining_size; - if ((size >= (u64)pages_per_block << PAGE_SHIFT) && - !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && adjust_dcc_size) + min_block_size = size; + else if ((size >= (u64)pages_per_block << PAGE_SHIFT) && + !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; BUG_ON(min_block_size < mm->chunk_size); @@ -553,6 +568,22 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, } mutex_unlock(&mgr->lock); + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && adjust_dcc_size) { + struct drm_buddy_block *dcc_block; + unsigned long dcc_start; + u64 trim_start; + + dcc_block = amdgpu_vram_mgr_first_block(&vres->blocks); + /* Adjust the start address for DCC buffers only */ + dcc_start = + roundup((unsigned long)amdgpu_vram_mgr_block_start(dcc_block), + adjust_dcc_size); + trim_start = (u64)dcc_start; + drm_buddy_block_trim(mm, &trim_start, + (u64)vres->base.size, + &vres->blocks); + } + vres->base.start = 0; size = max_t(u64, amdgpu_vram_mgr_blocks_size(&vres->blocks), vres->base.size); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index fd3ac483760e..26efce9aa410 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -542,6 +542,23 @@ static unsigned gmc_v12_0_get_vbios_fb_size(struct amdgpu_device *adev) return 0; } +static unsigned int gmc_v12_0_get_dcc_alignment(struct amdgpu_device *adev) +{ + unsigned int max_tex_channel_caches, alignment; + + if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(12, 0, 0) && + amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(12, 0, 1)) + return 0; + + max_tex_channel_caches = adev->gfx.config.max_texture_channel_caches; + if (is_power_of_2(max_tex_channel_caches)) + alignment = (unsigned int)(max_tex_channel_caches / SZ_4); + else + alignment = roundup_pow_of_two(max_tex_channel_caches); + + return (unsigned int)(alignment * max_tex_channel_caches * SZ_1K); +} + static const struct amdgpu_gmc_funcs gmc_v12_0_gmc_funcs = { .flush_gpu_tlb = gmc_v12_0_flush_gpu_tlb, .flush_gpu_tlb_pasid = gmc_v12_0_flush_gpu_tlb_pasid, @@ -551,6 +568,7 @@ static const struct amdgpu_gmc_funcs gmc_v12_0_gmc_funcs = { .get_vm_pde = gmc_v12_0_get_vm_pde, .get_vm_pte = gmc_v12_0_get_vm_pte, .get_vbios_fb_size = gmc_v12_0_get_vbios_fb_size, + .get_dcc_alignment = gmc_v12_0_get_dcc_alignment, }; static void gmc_v12_0_set_gmc_funcs(struct amdgpu_device *adev) From 7fc5f252c0d21b7b89720386344b614733edab32 Mon Sep 17 00:00:00 2001 From: Frank Min Date: Thu, 1 Aug 2024 12:20:18 +0800 Subject: [PATCH 0627/1052] drm/amdgpu: correct sdma7 max dw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit correct sdma7 max dw into 8 Signed-off-by: Frank Min Acked-by: Christian König Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 86598c3819fdc70e59d28221bfa7bc36e9f5777e) --- drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index 7e4282609f51..ecee9e7d7e4c 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ 
b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1617,7 +1617,7 @@ static void sdma_v7_0_emit_fill_buffer(struct amdgpu_ib *ib, static const struct amdgpu_buffer_funcs sdma_v7_0_buffer_funcs = { .copy_max_bytes = 0x400000, - .copy_num_dw = 7, + .copy_num_dw = 8, .emit_copy_buffer = sdma_v7_0_emit_copy_buffer, .fill_max_bytes = 0x400000, .fill_num_dw = 5, From 6ad9dafba19f15a64f71c2e1a9e3b6932f96628e Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Mon, 5 Aug 2024 19:17:04 +0530 Subject: [PATCH 0628/1052] drm/amdgpu: Add DCC GFX12 flag to enable address alignment We require this flag AMDGPU_GEM_CREATE_GFX12_DCC or any other kernel level GFX12 DCC flag to differentiate the DCC buffers from other pinned display buffers (which have TTM_PL_FLAG_CONTIGUOUS enabled). If we use the TTM_PL_FLAG_CONTIGUOUS flag for DCC buffers, we may over-allocate for all the pinned display buffers unnecessarily, which leads to memory allocation failure. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 46142cc1b9272d664e0258e105b537735bfeeccc) --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index b2c94f12da9e..7d26a962f811 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -512,7 +512,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, /* Allocate blocks in desired range */ vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; - if (adev->gmc.gmc_funcs->get_dcc_alignment) + if (bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC && + adev->gmc.gmc_funcs->get_dcc_alignment) adjust_dcc_size = amdgpu_gmc_get_dcc_alignment(adev); remaining_size = (u64)vres->base.size; From 730bbfaf7d4890bd99e637db7767dc68cfeb24e7 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sun, 4 Aug 2024 13:36:11 +0200 Subject: [PATCH 0629/1052] spi: spi-fsl-lpspi: Fix scldiv calculation The effective SPI clock frequency should never exceed speed_hz, otherwise this might result in undefined behavior of the SPI device. Currently the scldiv calculation could violate this constraint. For the example parameters perclk_rate = 24 MHz and speed_hz = 7 MHz, the function fsl_lpspi_set_bitrate will determine prescale = 0 and scldiv = 1, which yields an effective SPI clock of 8 MHz. So fix this by rounding up the quotient of perclk_rate and speed_hz. Since this quotient never changes within the loop, the calculation can be pulled out of it. 
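To make the arithmetic concrete, here is a small stand-alone sketch (plain user-space C, not driver code; CEIL_DIV stands in for the kernel's DIV_ROUND_UP, and the SCK formula is inferred from the 24 MHz / 7 MHz example above):

#include <stdio.h>

/* Stand-in for the kernel's DIV_ROUND_UP() */
#define CEIL_DIV(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int perclk_rate = 24000000, speed_hz = 7000000, prescale = 0;

	/* old: floor division undershoots the divider */
	unsigned int scldiv_old = perclk_rate / speed_hz / (1 << prescale) - 2;
	/* new: round the quotient up before applying the prescaler */
	unsigned int scldiv_new = CEIL_DIV(perclk_rate, speed_hz) / (1 << prescale) - 2;

	/* prints 8000000 Hz (above speed_hz) vs. 6000000 Hz (within speed_hz) */
	printf("old: scldiv=%u -> SCK=%u Hz\n", scldiv_old,
	       perclk_rate / ((scldiv_old + 2) << prescale));
	printf("new: scldiv=%u -> SCK=%u Hz\n", scldiv_new,
	       perclk_rate / ((scldiv_new + 2) << prescale));
	return 0;
}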
Fixes: 5314987de5e5 ("spi: imx: add lpspi bus driver") Signed-off-by: Stefan Wahren Link: https://patch.msgid.link/20240804113611.83613-1-wahrenst@gmx.net Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-lpspi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index 32baa14dfd83..be261ac09df8 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -296,7 +296,7 @@ static void fsl_lpspi_set_watermark(struct fsl_lpspi_data *fsl_lpspi) static int fsl_lpspi_set_bitrate(struct fsl_lpspi_data *fsl_lpspi) { struct lpspi_config config = fsl_lpspi->config; - unsigned int perclk_rate, scldiv; + unsigned int perclk_rate, scldiv, div; u8 prescale; perclk_rate = clk_get_rate(fsl_lpspi->clk_per); @@ -313,8 +313,10 @@ static int fsl_lpspi_set_bitrate(struct fsl_lpspi_data *fsl_lpspi) return -EINVAL; } + div = DIV_ROUND_UP(perclk_rate, config.speed_hz); + for (prescale = 0; prescale < 8; prescale++) { - scldiv = perclk_rate / config.speed_hz / (1 << prescale) - 2; + scldiv = div / (1 << prescale) - 2; if (scldiv < 256) { fsl_lpspi->config.prescale = prescale; break; From 0df2ac59bebfac221463ef57ed3554899b41d75f Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 7 Aug 2024 13:51:38 +0200 Subject: [PATCH 0630/1052] tracefs: Fix inode allocation The leading comment above alloc_inode_sb() is pretty explicit about it: /* * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ Switch tracefs over to alloc_inode_sb() to make sure inodes are properly linked. Cc: Ajay Kaher Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Al Viro Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20240807115143.45927-2-minipli@grsecurity.net Fixes: ba37ff75e04b ("eventfs: Implement tracefs_inode_cache") Signed-off-by: Mathias Krause Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 1028ab6d9a74..21a7e51fc3c1 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -42,7 +42,7 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb) struct tracefs_inode *ti; unsigned long flags; - ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL); + ti = alloc_inode_sb(sb, tracefs_inode_cachep, GFP_KERNEL); if (!ti) return NULL; From 12c20c65d0460cf34f9a665d8f0c0d77d45a3829 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Tue, 23 Jul 2024 14:25:21 +0200 Subject: [PATCH 0631/1052] eventfs: Don't return NULL in eventfs_create_dir() Commit 77a06c33a22d ("eventfs: Test for ei->is_freed when accessing ei->dentry") added another check, testing if the parent was freed after we released the mutex. If so, the function returns NULL. However, all callers expect it to either return a valid pointer or an error pointer, at least since commit 5264a2f4bb3b ("tracing: Fix a NULL vs IS_ERR() bug in event_subsystem_dir()"). Returning NULL will therefore fail the error condition check in the caller. Fix this by substituting the NULL return value with a fitting error pointer. 
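The calling convention at stake can be shown with a minimal user-space sketch; ERR_PTR/IS_ERR/PTR_ERR below are open-coded stand-ins for the helpers in <linux/err.h>, not the tracing code itself:

#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
#define PTR_ERR(ptr)	((long)(ptr))

int main(void)
{
	int obj = 0;
	void *valid = &obj;
	void *err = ERR_PTR(-16);	/* e.g. -EBUSY */
	void *null = NULL;

	/* Callers only test IS_ERR(); a NULL return slips through the check
	 * and is dereferenced later on. */
	printf("valid: IS_ERR=%d\n", IS_ERR(valid));			/* 0 */
	printf("error: IS_ERR=%d, err=%ld\n", IS_ERR(err), PTR_ERR(err)); /* 1, -16 */
	printf("NULL:  IS_ERR=%d\n", IS_ERR(null));			/* 0 -> not caught */
	return 0;
}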
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: stable@vger.kernel.org Fixes: 77a06c33a22d ("eventfs: Test for ei->is_freed when accessing ei->dentry") Link: https://lore.kernel.org/20240723122522.2724-1-minipli@grsecurity.net Reviewed-by: Dan Carpenter Reviewed-by: Ajay Kaher Signed-off-by: Mathias Krause Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 5d88c184f0fc..a9c28a1d5dc8 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -736,7 +736,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode /* Was the parent freed? */ if (list_empty(&ei->list)) { cleanup_ei(ei); - ei = NULL; + ei = ERR_PTR(-EBUSY); } return ei; } From 8e556432477e97ad6179c61b61a32bf5f1af2355 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Tue, 23 Jul 2024 23:07:53 +0200 Subject: [PATCH 0632/1052] eventfs: Use SRCU for freeing eventfs_inodes To mirror the SRCU lock held in eventfs_iterate() when iterating over eventfs inodes, use call_srcu() to free them too. This was accidentally(?) degraded to RCU in commit 43aa6f97c2d0 ("eventfs: Get rid of dentry pointers without refcounts"). Cc: Ajay Kaher Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20240723210755.8970-1-minipli@grsecurity.net Fixes: 43aa6f97c2d0 ("eventfs: Get rid of dentry pointers without refcounts") Signed-off-by: Mathias Krause Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index a9c28a1d5dc8..01e99e98457d 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -112,7 +112,7 @@ static void release_ei(struct kref *ref) entry->release(entry->name, ei->data); } - call_rcu(&ei->rcu, free_ei_rcu); + call_srcu(&eventfs_srcu, &ei->rcu, free_ei_rcu); } static inline void put_ei(struct eventfs_inode *ei) From 604b72b32522d548f855ed82842d2e49bf384edb Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Sat, 3 Aug 2024 15:09:26 +0200 Subject: [PATCH 0633/1052] function_graph: Fix the ret_stack used by ftrace_graph_ret_addr() When ftrace_graph_ret_addr() is invoked to convert a found stack return address to its original value, the function can end up producing the following crash: [ 95.442712] BUG: kernel NULL pointer dereference, address: 0000000000000028 [ 95.442720] #PF: supervisor read access in kernel mode [ 95.442724] #PF: error_code(0x0000) - not-present page [ 95.442727] PGD 0 P4D 0- [ 95.442731] Oops: Oops: 0000 [#1] PREEMPT SMP PTI [ 95.442736] CPU: 1 UID: 0 PID: 2214 Comm: insmod Kdump: loaded Tainted: G OE K 6.11.0-rc1-default #1 67c62a3b3720562f7e7db5f11c1fdb40b7a2857c [ 95.442747] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE, [K]=LIVEPATCH [ 95.442750] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 [ 95.442754] RIP: 0010:ftrace_graph_ret_addr+0x42/0xc0 [ 95.442766] Code: [...] 
[ 95.442773] RSP: 0018:ffff979b80ff7718 EFLAGS: 00010006 [ 95.442776] RAX: ffffffff8ca99b10 RBX: ffff979b80ff7760 RCX: ffff979b80167dc0 [ 95.442780] RDX: ffffffff8ca99b10 RSI: ffff979b80ff7790 RDI: 0000000000000005 [ 95.442783] RBP: 0000000000000001 R08: 0000000000000005 R09: 0000000000000000 [ 95.442786] R10: 0000000000000005 R11: 0000000000000000 R12: ffffffff8e9491e0 [ 95.442790] R13: ffffffff8d6f70f0 R14: ffff979b80167da8 R15: ffff979b80167dc8 [ 95.442793] FS: 00007fbf83895740(0000) GS:ffff8a0afdd00000(0000) knlGS:0000000000000000 [ 95.442797] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 95.442800] CR2: 0000000000000028 CR3: 0000000005070002 CR4: 0000000000370ef0 [ 95.442806] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 95.442809] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 95.442816] Call Trace: [ 95.442823] [ 95.442896] unwind_next_frame+0x20d/0x830 [ 95.442905] arch_stack_walk_reliable+0x94/0xe0 [ 95.442917] stack_trace_save_tsk_reliable+0x7d/0xe0 [ 95.442922] klp_check_and_switch_task+0x55/0x1a0 [ 95.442931] task_call_func+0xd3/0xe0 [ 95.442938] klp_try_switch_task.part.5+0x37/0x150 [ 95.442942] klp_try_complete_transition+0x79/0x2d0 [ 95.442947] klp_enable_patch+0x4db/0x890 [ 95.442960] do_one_initcall+0x41/0x2e0 [ 95.442968] do_init_module+0x60/0x220 [ 95.442975] load_module+0x1ebf/0x1fb0 [ 95.443004] init_module_from_file+0x88/0xc0 [ 95.443010] idempotent_init_module+0x190/0x240 [ 95.443015] __x64_sys_finit_module+0x5b/0xc0 [ 95.443019] do_syscall_64+0x74/0x160 [ 95.443232] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 95.443236] RIP: 0033:0x7fbf82f2c709 [ 95.443241] Code: [...] [ 95.443247] RSP: 002b:00007fffd5ea3b88 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 95.443253] RAX: ffffffffffffffda RBX: 000056359c48e750 RCX: 00007fbf82f2c709 [ 95.443257] RDX: 0000000000000000 RSI: 000056356ed4efc5 RDI: 0000000000000003 [ 95.443260] RBP: 000056356ed4efc5 R08: 0000000000000000 R09: 00007fffd5ea3c10 [ 95.443263] R10: 0000000000000003 R11: 0000000000000246 R12: 0000000000000000 [ 95.443267] R13: 000056359c48e6f0 R14: 0000000000000000 R15: 0000000000000000 [ 95.443272] [ 95.443274] Modules linked in: [...] [ 95.443385] Unloaded tainted modules: intel_uncore_frequency(E):1 isst_if_common(E):1 skx_edac(E):1 [ 95.443414] CR2: 0000000000000028 The bug can be reproduced with kselftests: cd linux/tools/testing/selftests make TARGETS='ftrace livepatch' (cd ftrace; ./ftracetest test.d/ftrace/fgraph-filter.tc) (cd livepatch; ./test-livepatch.sh) The problem is that ftrace_graph_ret_addr() is supposed to operate on the ret_stack of a selected task but wrongly accesses the ret_stack of the current task. Specifically, the above NULL dereference occurs when task->curr_ret_stack is non-zero, but current->ret_stack is NULL. Correct ftrace_graph_ret_addr() to work with the right ret_stack. 
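The shape of the bug can be sketched with toy types outside the kernel (hypothetical names, not the fgraph implementation); the point is that the helper must take both the index and the backing storage from the task it was asked about:

#include <stdio.h>

/* Toy model of a per-task shadow return stack. */
struct task {
	int curr_ret_stack;		/* index of the next free entry */
	unsigned long *ret_stack;	/* NULL when fgraph never ran for this task */
};

static unsigned long peek(struct task *storage_from, struct task *index_from)
{
	int i = index_from->curr_ret_stack - 1;

	if (!storage_from->ret_stack) {
		printf("would dereference a NULL ret_stack (the reported oops)\n");
		return 0;
	}
	return storage_from->ret_stack[i];
}

int main(void)
{
	unsigned long entries[8] = { [2] = 0xdeadbeef };
	struct task inspected = { .curr_ret_stack = 3, .ret_stack = entries };
	struct task current_task = { .curr_ret_stack = 0, .ret_stack = NULL };

	/* buggy pairing: index from the inspected task, storage from the running task */
	peek(&current_task, &inspected);
	/* fixed pairing: both come from the task being inspected */
	printf("fixed: 0x%lx\n", peek(&inspected, &inspected));
	return 0;
}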
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Reported-by: Miroslav Benes Link: https://lore.kernel.org/20240803131211.17255-1-petr.pavlu@suse.com Fixes: 7aa1eaef9f42 ("function_graph: Allow multiple users to attach to function graph") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index fc205ad167a9..d1d5ea2d0a1b 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -902,7 +902,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, i = *idx ? : task->curr_ret_stack; while (i > 0) { - ret_stack = get_ret_stack(current, i, &i); + ret_stack = get_ret_stack(task, i, &i); if (!ret_stack) break; /* From bcf86c01ca4676316557dd482c8416ece8c2e143 Mon Sep 17 00:00:00 2001 From: Tze-nan Wu Date: Mon, 5 Aug 2024 13:59:22 +0800 Subject: [PATCH 0634/1052] tracing: Fix overflow in get_free_elt() "tracing_map->next_elt" in get_free_elt() is at risk of overflowing. Once it overflows, new elements can still be inserted into the tracing_map even though the maximum number of elements (`max_elts`) has been reached. Continuing to insert elements after the overflow could result in the tracing_map containing "tracing_map->max_size" elements, leaving no empty entries. If any attempt is made to insert an element into a full tracing_map using `__tracing_map_insert()`, it will cause an infinite loop with preemption disabled, leading to a CPU hang problem. Fix this by preventing any further increments to "tracing_map->next_elt" once it reaches "tracing_map->max_elt". Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Fixes: 08d43a5fa063e ("tracing: Add lock-free tracing_map") Co-developed-by: Cheng-Jui Wang Link: https://lore.kernel.org/20240805055922.6277-1-Tze-nan.Wu@mediatek.com Signed-off-by: Cheng-Jui Wang Signed-off-by: Tze-nan Wu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/tracing_map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index a4dcf0f24352..3a56e7c8aa4f 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map) struct tracing_map_elt *elt = NULL; int idx; - idx = atomic_inc_return(&map->next_elt); + idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts); if (idx < map->max_elts) { elt = *(TRACING_MAP_ELT(map->elts, idx)); if (map->ops && map->ops->elt_init) @@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map) { unsigned int i; - atomic_set(&map->next_elt, -1); + atomic_set(&map->next_elt, 0); atomic64_set(&map->hits, 0); atomic64_set(&map->drops, 0); @@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits, map->map_bits = map_bits; map->max_elts = (1 << map_bits); - atomic_set(&map->next_elt, -1); + atomic_set(&map->next_elt, 0); map->map_size = (1 << (map_bits + 1)); map->ops = ops; From 58f7e4d7ba32758b861807e77535853cacc1f426 Mon Sep 17 00:00:00 2001 From: Jianhui Zhou <912460177@qq.com> Date: Mon, 5 Aug 2024 19:36:31 +0800 Subject: [PATCH 0635/1052] ring-buffer: Remove unused function ring_buffer_nr_pages() Because ring_buffer_nr_pages() is not an inline function and user accesses buffer->buffers[cpu]->nr_pages directly, the function ring_buffer_nr_pages is removed. 
Signed-off-by: Jianhui Zhou <912460177@qq.com> Link: https://lore.kernel.org/tencent_F4A7E9AB337F44E0F4B858D07D19EF460708@qq.com Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 - kernel/trace/ring_buffer.c | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 96d2140b471e..fd35d4ec12e1 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -193,7 +193,6 @@ void ring_buffer_set_clock(struct trace_buffer *buffer, void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs); bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer); -size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu); size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu); struct buffer_data_read_page; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 28853966aa9a..cebd879a30cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -692,18 +692,6 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, return ts; } -/** - * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer - * @buffer: The ring_buffer to get the number of pages from - * @cpu: The cpu of the ring_buffer to get the number of pages from - * - * Returns the number of pages used by a per_cpu buffer of the ring buffer. - */ -size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) -{ - return buffer->buffers[cpu]->nr_pages; -} - /** * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from From 0b6743bd60a56a701070b89fb80c327a44b7b3e2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 7 Aug 2024 18:54:02 -0400 Subject: [PATCH 0636/1052] tracefs: Use generic inode RCU for synchronizing freeing With structure layout randomization enabled for 'struct inode' we need to avoid overlapping any of the RCU-used / initialized-only-once members, e.g. i_lru or i_sb_list to not corrupt related list traversals when making use of the rcu_head. For an unlucky structure layout of 'struct inode' we may end up with the following splat when running the ftrace selftests: [<...>] list_del corruption, ffff888103ee2cb0->next (tracefs_inode_cache+0x0/0x4e0 [slab object]) is NULL (prev is tracefs_inode_cache+0x78/0x4e0 [slab object]) [<...>] ------------[ cut here ]------------ [<...>] kernel BUG at lib/list_debug.c:54! 
[<...>] invalid opcode: 0000 [#1] PREEMPT SMP KASAN [<...>] CPU: 3 PID: 2550 Comm: mount Tainted: G N 6.8.12-grsec+ #122 ed2f536ca62f28b087b90e3cc906a8d25b3ddc65 [<...>] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 [<...>] RIP: 0010:[] __list_del_entry_valid_or_report+0x138/0x3e0 [<...>] Code: 48 b8 99 fb 65 f2 ff ff ff ff e9 03 5c d9 fc cc 48 b8 99 fb 65 f2 ff ff ff ff e9 33 5a d9 fc cc 48 b8 99 fb 65 f2 ff ff ff ff <0f> 0b 4c 89 e9 48 89 ea 48 89 ee 48 c7 c7 60 8f dd 89 31 c0 e8 2f [<...>] RSP: 0018:fffffe80416afaf0 EFLAGS: 00010283 [<...>] RAX: 0000000000000098 RBX: ffff888103ee2cb0 RCX: 0000000000000000 [<...>] RDX: ffffffff84655fe8 RSI: ffffffff89dd8b60 RDI: 0000000000000001 [<...>] RBP: ffff888103ee2cb0 R08: 0000000000000001 R09: fffffbd0082d5f25 [<...>] R10: fffffe80416af92f R11: 0000000000000001 R12: fdf99c16731d9b6d [<...>] R13: 0000000000000000 R14: ffff88819ad4b8b8 R15: 0000000000000000 [<...>] RBX: tracefs_inode_cache+0x0/0x4e0 [slab object] [<...>] RDX: __list_del_entry_valid_or_report+0x108/0x3e0 [<...>] RSI: __func__.47+0x4340/0x4400 [<...>] RBP: tracefs_inode_cache+0x0/0x4e0 [slab object] [<...>] RSP: process kstack fffffe80416afaf0+0x7af0/0x8000 [mount 2550 2550] [<...>] R09: kasan shadow of process kstack fffffe80416af928+0x7928/0x8000 [mount 2550 2550] [<...>] R10: process kstack fffffe80416af92f+0x792f/0x8000 [mount 2550 2550] [<...>] R14: tracefs_inode_cache+0x78/0x4e0 [slab object] [<...>] FS: 00006dcb380c1840(0000) GS:ffff8881e0600000(0000) knlGS:0000000000000000 [<...>] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [<...>] CR2: 000076ab72b30e84 CR3: 000000000b088004 CR4: 0000000000360ef0 shadow CR4: 0000000000360ef0 [<...>] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [<...>] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [<...>] ASID: 0003 [<...>] Stack: [<...>] ffffffff818a2315 00000000f5c856ee ffffffff896f1840 ffff888103ee2cb0 [<...>] ffff88812b6b9750 0000000079d714b6 fffffbfff1e9280b ffffffff8f49405f [<...>] 0000000000000001 0000000000000000 ffff888104457280 ffffffff8248b392 [<...>] Call Trace: [<...>] [<...>] [] ? lock_release+0x175/0x380 fffffe80416afaf0 [<...>] [] list_lru_del+0x152/0x740 fffffe80416afb48 [<...>] [] list_lru_del_obj+0x113/0x280 fffffe80416afb88 [<...>] [] ? _atomic_dec_and_lock+0x119/0x200 fffffe80416afb90 [<...>] [] iput_final+0x1c4/0x9a0 fffffe80416afbb8 [<...>] [] dentry_unlink_inode+0x44b/0xaa0 fffffe80416afbf8 [<...>] [] __dentry_kill+0x23c/0xf00 fffffe80416afc40 [<...>] [] ? __this_cpu_preempt_check+0x1f/0xa0 fffffe80416afc48 [<...>] [] ? shrink_dentry_list+0x1c5/0x760 fffffe80416afc70 [<...>] [] ? shrink_dentry_list+0x51/0x760 fffffe80416afc78 [<...>] [] shrink_dentry_list+0x288/0x760 fffffe80416afc80 [<...>] [] shrink_dcache_sb+0x155/0x420 fffffe80416afcc8 [<...>] [] ? debug_smp_processor_id+0x23/0xa0 fffffe80416afce0 [<...>] [] ? do_one_tree+0x140/0x140 fffffe80416afcf8 [<...>] [] ? do_remount+0x329/0xa00 fffffe80416afd18 [<...>] [] ? security_sb_remount+0x81/0x1c0 fffffe80416afd38 [<...>] [] reconfigure_super+0x856/0x14e0 fffffe80416afd70 [<...>] [] ? ns_capable_common+0xe7/0x2a0 fffffe80416afd90 [<...>] [] do_remount+0x416/0xa00 fffffe80416afdd0 [<...>] [] path_mount+0x5c4/0x900 fffffe80416afe28 [<...>] [] ? finish_automount+0x13a0/0x13a0 fffffe80416afe60 [<...>] [] ? user_path_at_empty+0xb2/0x140 fffffe80416afe88 [<...>] [] do_mount+0x115/0x1c0 fffffe80416afeb8 [<...>] [] ? path_mount+0x900/0x900 fffffe80416afed8 [<...>] [] ? 
__kasan_check_write+0x1c/0xa0 fffffe80416afee0 [<...>] [] __do_sys_mount+0x12f/0x280 fffffe80416aff30 [<...>] [] __x64_sys_mount+0xcd/0x2e0 fffffe80416aff70 [<...>] [] ? syscall_trace_enter+0x218/0x380 fffffe80416aff88 [<...>] [] x64_sys_call+0x5d5e/0x6720 fffffe80416affa8 [<...>] [] do_syscall_64+0xcd/0x3c0 fffffe80416affb8 [<...>] [] entry_SYSCALL_64_safe_stack+0x4c/0x87 fffffe80416affe8 [<...>] [<...>] [<...>] RIP: 0033:[<00006dcb382ff66a>] vm_area_struct[mount 2550 2550 file 6dcb38225000-6dcb3837e000 22 55(read|exec|mayread|mayexec)]+0x0/0xb8 [userland map] [<...>] Code: 48 8b 0d 29 18 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f6 17 0d 00 f7 d8 64 89 01 48 [<...>] RSP: 002b:0000763d68192558 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [<...>] RAX: ffffffffffffffda RBX: 00006dcb38433264 RCX: 00006dcb382ff66a [<...>] RDX: 000017c3e0d11210 RSI: 000017c3e0d1a5a0 RDI: 000017c3e0d1ae70 [<...>] RBP: 000017c3e0d10fb0 R08: 000017c3e0d11260 R09: 00006dcb383d1be0 [<...>] R10: 000000000020002e R11: 0000000000000246 R12: 0000000000000000 [<...>] R13: 000017c3e0d1ae70 R14: 000017c3e0d11210 R15: 000017c3e0d10fb0 [<...>] RBX: vm_area_struct[mount 2550 2550 file 6dcb38433000-6dcb38434000 5b 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RCX: vm_area_struct[mount 2550 2550 file 6dcb38225000-6dcb3837e000 22 55(read|exec|mayread|mayexec)]+0x0/0xb8 [userland map] [<...>] RDX: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RSI: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RDI: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RBP: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RSP: vm_area_struct[mount 2550 2550 anon 763d68173000-763d68195000 7ffffffdd 100133(read|write|mayread|maywrite|growsdown|account)]+0x0/0xb8 [userland map] [<...>] R08: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] R09: vm_area_struct[mount 2550 2550 file 6dcb383d1000-6dcb383d3000 1cd 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] R13: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] R14: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] R15: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] [<...>] Modules linked in: [<...>] ---[ end trace 0000000000000000 ]--- The list debug message as well as RBX's symbolic value point out that the object in question was allocated from 'tracefs_inode_cache' and that the list's '->next' member is at offset 0. Dumping the layout of the relevant parts of 'struct tracefs_inode' gives the following: struct tracefs_inode { union { struct inode { struct list_head { struct list_head * next; /* 0 8 */ struct list_head * prev; /* 8 8 */ } i_lru; [...] 
} vfs_inode; struct callback_head { void (*func)(struct callback_head *); /* 0 8 */ struct callback_head * next; /* 8 8 */ } rcu; }; [...] }; Above shows that 'vfs_inode.i_lru' overlaps with 'rcu' which will destroy the 'i_lru' list as soon as the 'rcu' member gets used, e.g. in call_rcu() or later when calling the RCU callback. This will disturb concurrent list traversals as well as object reuse which assumes these list heads will keep their integrity. For reproduction, the following diff manually overlays 'i_lru' with 'rcu' as, otherwise, one would require some good portion of luck for gambling an unlucky RANDSTRUCT seed: --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -629,6 +629,7 @@ struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; + struct list_head i_lru; /* inode LRU list */ kgid_t i_gid; unsigned int i_flags; @@ -690,7 +691,6 @@ struct inode { u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif - struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { The tracefs inode does not need to supply its own RCU delayed destruction of its inode. The inode code itself offers both a "destroy_inode()" callback that gets called when the last reference of the inode is released, and the "free_inode()" which is called after a RCU synchronization period from the "destroy_inode()". The tracefs code can unlink the inode from its list in the destroy_inode() callback, and the simply free it from the free_inode() callback. This should provide the same protection. Link: https://lore.kernel.org/all/20240807115143.45927-3-minipli@grsecurity.net/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Ajay Kaher Cc: Ilkka =?utf-8?b?TmF1bGFww6TDpA==?= Link: https://lore.kernel.org/20240807185402.61410544@gandalf.local.home Fixes: baa23a8d4360 ("tracefs: Reset permissions on remount if permissions are options") Reported-by: Mathias Krause Reported-by: Brad Spengler Suggested-by: Al Viro Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 10 ++++------ fs/tracefs/internal.h | 5 +---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 21a7e51fc3c1..1748dff58c3b 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -53,15 +53,14 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb) return &ti->vfs_inode; } -static void tracefs_free_inode_rcu(struct rcu_head *rcu) +static void tracefs_free_inode(struct inode *inode) { - struct tracefs_inode *ti; + struct tracefs_inode *ti = get_tracefs(inode); - ti = container_of(rcu, struct tracefs_inode, rcu); kmem_cache_free(tracefs_inode_cachep, ti); } -static void tracefs_free_inode(struct inode *inode) +static void tracefs_destroy_inode(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); unsigned long flags; @@ -69,8 +68,6 @@ static void tracefs_free_inode(struct inode *inode) spin_lock_irqsave(&tracefs_inode_lock, flags); list_del_rcu(&ti->list); spin_unlock_irqrestore(&tracefs_inode_lock, flags); - - call_rcu(&ti->rcu, tracefs_free_inode_rcu); } static ssize_t default_read_file(struct file *file, char __user *buf, @@ -437,6 +434,7 @@ static int tracefs_drop_inode(struct inode *inode) static const struct super_operations tracefs_super_operations = { .alloc_inode = tracefs_alloc_inode, .free_inode = tracefs_free_inode, + .destroy_inode = tracefs_destroy_inode, .drop_inode = tracefs_drop_inode, .statfs = simple_statfs, .show_options = 
tracefs_show_options, diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index f704d8348357..d83c2a25f288 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -10,10 +10,7 @@ enum { }; struct tracefs_inode { - union { - struct inode vfs_inode; - struct rcu_head rcu; - }; + struct inode vfs_inode; /* The below gets initialized with memset_after(ti, 0, vfs_inode) */ struct list_head list; unsigned long flags; From 6d496e02b4a70926c3bd4e7ab6249ff262eb3bc0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 18:03:54 -0400 Subject: [PATCH 0637/1052] bcachefs: Add missing path_traverse() to btree_iter_next_node() This fixes a bug exposed by the next path - we pop an assert in path_set_should_be_locked(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 36872207f09b..aa8a049071f4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1921,6 +1921,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bch2_trans_verify_not_in_restart(trans); bch2_btree_iter_verify(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) + goto err; + + struct btree_path *path = btree_iter_path(trans, iter); /* already at end? */ From cecf72798b25fcb00303392407fccf500a746747 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 13:58:57 -0400 Subject: [PATCH 0638/1052] bcachefs: Make allocator stuck timeout configurable, ratelimit messages Limit these messages to once every 2 minutes to avoid spamming logs; with multiple devices the output can be quite significant. Also, up the default timeout to 30 seconds from 10 seconds. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 26 ++++++++++++++++++++++++-- fs/bcachefs/alloc_foreground.h | 7 ++++++- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/bcachefs_format.h | 2 ++ fs/bcachefs/io_misc.c | 6 +----- fs/bcachefs/io_write.c | 5 +---- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/super-io.c | 4 ++++ 8 files changed, 45 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 8683fe4fae5b..02de5ad2be2c 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1758,11 +1758,12 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); } -void bch2_print_allocator_stuck(struct bch_fs *c) +static noinline void bch2_print_allocator_stuck(struct bch_fs *c) { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "Allocator stuck? Waited for 10 seconds\n"); + prt_printf(&buf, "Allocator stuck? 
Waited for %u seconds\n", + c->opts.allocator_stuck_timeout); prt_printf(&buf, "Allocator debug:\n"); printbuf_indent_add(&buf, 2); @@ -1792,3 +1793,24 @@ void bch2_print_allocator_stuck(struct bch_fs *c) bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } + +static inline unsigned allocator_wait_timeout(struct bch_fs *c) +{ + if (c->allocator_last_stuck && + time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies)) + return 0; + + return c->opts.allocator_stuck_timeout * HZ; +} + +void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + unsigned t = allocator_wait_timeout(c); + + if (t && closure_sync_timeout(cl, t)) { + c->allocator_last_stuck = jiffies; + bch2_print_allocator_stuck(c); + } + + closure_sync(cl); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index c78a64ec0553..386d231ceca3 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -231,6 +231,11 @@ void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); -void bch2_print_allocator_stuck(struct bch_fs *); +void __bch2_wait_on_allocator(struct bch_fs *, struct closure *); +static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + if (cl->closure_get_happened) + __bch2_wait_on_allocator(c, cl); +} #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 91361a167dcd..eedf2d6045e7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -893,6 +893,8 @@ struct bch_fs { struct bch_fs_usage_base __percpu *usage; u64 __percpu *online_reserved; + unsigned long allocator_last_stuck; + struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 74a60b1a4ddf..ad893684db52 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -836,6 +836,8 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); +LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, + struct bch_sb, flags[5], 16, 32); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 2cf6297756f8..177ed331c00b 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -126,11 +126,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, if (closure_nr_remaining(&cl) != 1) { bch2_trans_unlock_long(trans); - - if (closure_sync_timeout(&cl, HZ * 10)) { - bch2_print_allocator_stuck(c); - closure_sync(&cl); - } + bch2_wait_on_allocator(c, &cl); } return ret; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index d31c8d006d97..1d4761d15002 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1503,10 +1503,7 @@ static void __bch2_write(struct bch_write_op *op) if ((op->flags & BCH_WRITE_SYNC) || (!(op->flags & BCH_WRITE_SUBMITTED) && !(op->flags & BCH_WRITE_IN_WORKER))) { - if (closure_sync_timeout(&op->cl, HZ * 10)) { - bch2_print_allocator_stuck(c); - closure_sync(&op->cl); - } + bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 60b93018501f..cda1725702ea 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -391,6 +391,11 @@ enum fsck_err_opts { OPT_BOOL(), \ 
BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ NULL, "Log transaction function names in journal") \ + x(allocator_stuck_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U16_MAX), \ + BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \ + NULL, "Default timeout in seconds for stuck allocator messages")\ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8bc819832790..c8c2ccbdfbb5 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -414,6 +414,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && + !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) + SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); } for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { From 73dc1656f41a42849e43b945fe44d4e3d55eb6c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 16:40:14 -0400 Subject: [PATCH 0639/1052] bcachefs: Use bch2_wait_on_allocator() in btree node alloc path If the allocator gets stuck, we need to know why. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 31ee50184be2..e61f9695771e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1264,7 +1264,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); bch2_trans_unlock(trans); - closure_sync(&cl); + bch2_wait_on_allocator(c, &cl); } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); } From 30b651c8bc788c068a978dc760e9d5f824f7019e Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 24 Jul 2024 15:35:17 -0600 Subject: [PATCH 0640/1052] selftests: mm: add s390 to ARCH check commit 0518dbe97fe6 ("selftests/mm: fix cross compilation with LLVM") changed the env variable for the architecture from MACHINE to ARCH. This is preventing 3 required TEST_GEN_FILES from being included when cross compiling s390x and errors when trying to run the test suite. This is due to the ARCH variable already being set and the arch folder name being s390. Add "s390" to the filtered list to cover this case and have the 3 files included in the build. 
Link: https://lkml.kernel.org/r/20240724213517.23918-1-npache@redhat.com Fixes: 0518dbe97fe6 ("selftests/mm: fix cross compilation with LLVM") Signed-off-by: Nico Pache Cc: Mark Brown Cc: Albert Ou Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 901e0d07765b..7b8a5def54a1 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -110,7 +110,7 @@ endif endif -ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64)) +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs From 37bf7fbe1db27792b27345871aa5f8ae52cd396c Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Fri, 26 Jul 2024 09:20:09 +0200 Subject: [PATCH 0641/1052] MAINTAINERS: Update LTP members and web The LTP project now uses a readthedocs.org instance instead of the GitHub wiki. LTP maintainers are listed in alphabetical order. Link: https://lkml.kernel.org/r/20240726072009.1021599-1-pvorel@suse.cz Signed-off-by: Petr Vorel Reviewed-by: Li Wang Reviewed-by: Cyril Hrubis Cc: Jan Stancek Cc: Xiao Yang Cc: Yang Xu Signed-off-by: Andrew Morton --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 42decde38320..31805b6e98a5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13324,14 +13324,16 @@ F: Documentation/devicetree/bindings/i2c/i2c-mux-ltc4306.txt F: drivers/i2c/muxes/i2c-mux-ltc4306.c LTP (Linux Test Project) +M: Andrea Cervesato M: Cyril Hrubis M: Jan Stancek M: Petr Vorel M: Li Wang M: Yang Xu +M: Xiao Yang L: ltp@lists.linux.it (subscribers-only) S: Maintained -W: http://linux-test-project.github.io/ +W: https://linux-test-project.readthedocs.io/ T: git https://github.com/linux-test-project/ltp.git LTR390 AMBIENT/UV LIGHT SENSOR DRIVER From 7d4df2dad312f270d62fecb0e5c8b086c6d7dcfc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 29 Jul 2024 04:21:58 +0200 Subject: [PATCH 0642/1052] kcov: properly check for softirq context When collecting coverage from softirqs, KCOV uses in_serving_softirq() to check whether the code is running in the softirq context. Unfortunately, in_serving_softirq() is > 0 even when the code is running in the hardirq or NMI context for hardirqs and NMIs that happened during a softirq. As a result, if a softirq handler contains a remote coverage collection section and a hardirq with another remote coverage collection section happens during handling the softirq, KCOV incorrectly detects a nested softirq coverage collection section and prints a WARNING, as reported by syzbot. This issue was exposed by commit a7f3813e589f ("usb: gadget: dummy_hcd: Switch to hrtimer transfer scheduler"), which switched dummy_hcd to using hrtimer and made the timer's callback be executed in the hardirq context. Change the related checks in KCOV to account for this behavior of in_serving_softirq() and make KCOV ignore remote coverage collection sections in the hardirq and NMI contexts. This prevents the WARNING printed by syzbot but does not fix the inability of KCOV to collect coverage from the __usb_hcd_giveback_urb when dummy_hcd is in use (caused by a7f3813e589f); a separate patch is required for that.
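To make the reasoning above concrete, the core of the fix is the stricter context predicate added to kernel/kcov.c (shown in the diff that follows); it is restated here on its own with comments on what each check rules out. The predicates are the kernel's own from <linux/preempt.h>; only the comments are editorial.

  /*
   * in_serving_softirq() stays nonzero not only while a softirq handler is
   * running, but also inside a hardirq or NMI that interrupted that handler,
   * so on its own it cannot tell "really executing softirq code".
   */
  static inline bool in_softirq_really(void)
  {
          return in_serving_softirq() && /* a softirq handler is active on this CPU */
                 !in_hardirq() &&        /* ...and no hardirq is running on top of it */
                 !in_nmi();              /* ...and no NMI is running on top of it */
  }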
Link: https://lkml.kernel.org/r/20240729022158.92059-1-andrey.konovalov@linux.dev Fixes: 5ff3b30ab57d ("kcov: collect coverage from interrupts") Signed-off-by: Andrey Konovalov Reported-by: syzbot+2388cdaeb6b10f0c13ac@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2388cdaeb6b10f0c13ac Acked-by: Marco Elver Cc: Alan Stern Cc: Aleksandr Nogikh Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Marcello Sylvester Bauer Cc: Signed-off-by: Andrew Morton --- kernel/kcov.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index f0a69d402066..274b6b7c718d 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -161,6 +161,15 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, kmsan_unpoison_memory(&area->list, sizeof(area->list)); } +/* + * Unlike in_serving_softirq(), this function returns false when called during + * a hardirq or an NMI that happened in the softirq context. + */ +static inline bool in_softirq_really(void) +{ + return in_serving_softirq() && !in_hardirq() && !in_nmi(); +} + static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -170,7 +179,7 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru * so we ignore code executed in interrupts, unless we are in a remote * coverage collection section in a softirq. */ - if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) + if (!in_task() && !(in_softirq_really() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -849,7 +858,7 @@ void kcov_remote_start(u64 handle) if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (!in_task() && !in_serving_softirq()) + if (!in_task() && !in_softirq_really()) return; local_lock_irqsave(&kcov_percpu_data.lock, flags); @@ -991,7 +1000,7 @@ void kcov_remote_stop(void) int sequence; unsigned long flags; - if (!in_task() && !in_serving_softirq()) + if (!in_task() && !in_softirq_really()) return; local_lock_irqsave(&kcov_percpu_data.lock, flags); From 5161b48712dcd08ec427c450399d4d1483e21dea Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 18 Jul 2024 16:36:07 +0800 Subject: [PATCH 0643/1052] mm: list_lru: fix UAF for memory cgroup The mem_cgroup_from_slab_obj() is supposed to be called under rcu lock or cgroup_mutex or others which could prevent returned memcg from being freed. Fix it by adding missing rcu read lock. Found by code inspection. [songmuchun@bytedance.com: only grab rcu lock when necessary, per Vlastimil] Link: https://lkml.kernel.org/r/20240801024603.1865-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20240718083607.42068-1-songmuchun@bytedance.com Fixes: 0a97c01cd20b ("list_lru: allow explicit memcg and NUMA node selection") Signed-off-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Nhat Pham Cc: Signed-off-by: Andrew Morton --- mm/list_lru.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index a29d96929d7c..9b7ff06e9d32 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -85,6 +85,7 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } #endif /* CONFIG_MEMCG */ +/* The caller must ensure the memcg lifetime. 
*/ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { @@ -109,14 +110,22 @@ EXPORT_SYMBOL_GPL(list_lru_add); bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { + bool ret; int nid = page_to_nid(virt_to_page(item)); - struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? - mem_cgroup_from_slab_obj(item) : NULL; - return list_lru_add(lru, item, nid, memcg); + if (list_lru_memcg_aware(lru)) { + rcu_read_lock(); + ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); + rcu_read_unlock(); + } else { + ret = list_lru_add(lru, item, nid, NULL); + } + + return ret; } EXPORT_SYMBOL_GPL(list_lru_add_obj); +/* The caller must ensure the memcg lifetime. */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { @@ -139,11 +148,18 @@ EXPORT_SYMBOL_GPL(list_lru_del); bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { + bool ret; int nid = page_to_nid(virt_to_page(item)); - struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? - mem_cgroup_from_slab_obj(item) : NULL; - return list_lru_del(lru, item, nid, memcg); + if (list_lru_memcg_aware(lru)) { + rcu_read_lock(); + ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item)); + rcu_read_unlock(); + } else { + ret = list_lru_del(lru, item, nid, NULL); + } + + return ret; } EXPORT_SYMBOL_GPL(list_lru_del_obj); From b66b1b71d7ff5464d23a0ac6f73fae461b7264fd Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 31 Jul 2024 13:46:19 +0800 Subject: [PATCH 0644/1052] mm: shmem: avoid allocating huge pages larger than MAX_PAGECACHE_ORDER for shmem Similar to commit d659b715e94ac ("mm/huge_memory: avoid PMD-size page cache if needed"), ARM64 can support 512MB PMD-sized THP when the base page size is 64KB, which is larger than the maximum supported page cache size MAX_PAGECACHE_ORDER. This is not expected. To fix this issue, use THP_ORDERS_ALL_FILE_DEFAULT for shmem to filter allowable huge orders. [baolin.wang@linux.alibaba.com: remove comment, per Barry] Link: https://lkml.kernel.org/r/c55d7ef7-78aa-4ed6-b897-c3e03a3f3ab7@linux.alibaba.com [wangkefeng.wang@huawei.com: remove local `orders'] Link: https://lkml.kernel.org/r/87769ae8-b6c6-4454-925d-1864364af9c8@huawei.com Link: https://lkml.kernel.org/r/117121665254442c3c7f585248296495e5e2b45c.1722404078.git.baolin.wang@linux.alibaba.com Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem") Signed-off-by: Baolin Wang Signed-off-by: Kefeng Wang Reviewed-by: Barry Song Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: Gavin Shan Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/shmem.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2faa9daaf54b..b5be73b04329 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1629,11 +1629,6 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma->vm_flags; - /* - * Check all the (large) orders below HPAGE_PMD_ORDER + 1 that - * are enabled for this vma. 
- */ - unsigned long orders = BIT(PMD_ORDER + 1) - 1; loff_t i_size; int order; @@ -1678,7 +1673,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, if (global_huge) mask |= READ_ONCE(huge_shmem_orders_inherit); - return orders & mask; + return THP_ORDERS_ALL_FILE_DEFAULT & mask; } static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, From 4cbf320b1500fe64fcef8c96ed74dfc1ae2c9e2c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 31 Jul 2024 13:46:20 +0800 Subject: [PATCH 0645/1052] mm: shmem: fix incorrect aligned index when checking conflicts In the shmem_suitable_orders() function, xa_find() is used to check for conflicts in the pagecache to select suitable huge orders. However, when checking each huge order in every loop, the aligned index is calculated from the previous iteration, which may cause suitable huge orders to be missed. We should use the original index each time in the loop to calculate a new aligned index for checking conflicts to avoid this issue. Link: https://lkml.kernel.org/r/07433b0f16a152bffb8cee34934a5c040e8e2ad6.1722404078.git.baolin.wang@linux.alibaba.com Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Barry Song <21cnbao@gmail.com> Cc: Gavin Shan Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Zi Yan Cc: Barry Song Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/shmem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b5be73b04329..5a77acf6ac6a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1681,6 +1681,7 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault unsigned long orders) { struct vm_area_struct *vma = vmf->vma; + pgoff_t aligned_index; unsigned long pages; int order; @@ -1692,9 +1693,9 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault order = highest_order(orders); while (orders) { pages = 1UL << order; - index = round_down(index, pages); - if (!xa_find(&mapping->i_pages, &index, - index + pages - 1, XA_PRESENT)) + aligned_index = round_down(index, pages); + if (!xa_find(&mapping->i_pages, &aligned_index, + aligned_index + pages - 1, XA_PRESENT)) break; order = next_order(&orders, order); } From 9972605a238339b85bd16b084eed5f18414d22db Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 2 Aug 2024 16:58:22 -0700 Subject: [PATCH 0646/1052] memcg: protect concurrent access to mem_cgroup_idr Commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") decoupled the memcg IDs from the CSS ID space to fix the cgroup creation failures. It introduced IDR to maintain the memcg ID space. The IDR depends on external synchronization mechanisms for modifications. For the mem_cgroup_idr, the idr_alloc() and idr_replace() happen within css callback and thus are protected through cgroup_mutex from concurrent modifications. However idr_remove() for mem_cgroup_idr was not protected against concurrency and can be run concurrently for different memcgs when they hit their refcnt to zero. Fix that. We have been seeing list_lru based kernel crashes at a low frequency in our fleet for a long time. These crashes were in different part of list_lru code including list_lru_add(), list_lru_del() and reparenting code. Upon further inspection, it looked like for a given object (dentry and inode), the super_block's list_lru didn't have list_lru_one for the memcg of that object. 
The initial suspicions were either the object is not allocated through kmem_cache_alloc_lru() or somehow memcg_list_lru_alloc() failed to allocate list_lru_one() for a memcg but returned success. No evidence were found for these cases. Looking more deeply, we started seeing situations where valid memcg's id is not present in mem_cgroup_idr and in some cases multiple valid memcgs have same id and mem_cgroup_idr is pointing to one of them. So, the most reasonable explanation is that these situations can happen due to race between multiple idr_remove() calls or race between idr_alloc()/idr_replace() and idr_remove(). These races are causing multiple memcgs to acquire the same ID and then offlining of one of them would cleanup list_lrus on the system for all of them. Later access from other memcgs to the list_lru cause crashes due to missing list_lru_one. Link: https://lkml.kernel.org/r/20240802235822.1830976-1-shakeel.butt@linux.dev Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") Signed-off-by: Shakeel Butt Acked-by: Muchun Song Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 960371788687..f29157288b7d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3386,11 +3386,28 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) static DEFINE_IDR(mem_cgroup_idr); +static DEFINE_SPINLOCK(memcg_idr_lock); + +static int mem_cgroup_alloc_id(void) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock(&memcg_idr_lock); + ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1, + GFP_NOWAIT); + spin_unlock(&memcg_idr_lock); + idr_preload_end(); + return ret; +} static void mem_cgroup_id_remove(struct mem_cgroup *memcg) { if (memcg->id.id > 0) { + spin_lock(&memcg_idr_lock); idr_remove(&mem_cgroup_idr, memcg->id.id); + spin_unlock(&memcg_idr_lock); + memcg->id.id = 0; } } @@ -3524,8 +3541,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg) return ERR_PTR(error); - memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, - 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); + memcg->id.id = mem_cgroup_alloc_id(); if (memcg->id.id < 0) { error = memcg->id.id; goto fail; @@ -3667,7 +3683,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) * publish it here at the end of onlining. This matches the * regular ID destruction during offlining. */ + spin_lock(&memcg_idr_lock); idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + spin_unlock(&memcg_idr_lock); return 0; offline_kmem: From f2087995fb7165a88b50dde02f3909e448522e0a Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Sun, 4 Aug 2024 14:45:54 +0900 Subject: [PATCH 0647/1052] mailmap: update entry for David Heidelberg Link my old gmail address to my active email. Link: https://lkml.kernel.org/r/20240804054704.859503-1-david@ixit.cz Signed-off-by: David Heidelberg Cc: David S. 
Miller Cc: Jiri Kosina Cc: Manivannan Sadhasivam Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index e51d76df75c2..8ee01d9d7046 100644 --- a/.mailmap +++ b/.mailmap @@ -166,6 +166,7 @@ Daniel Borkmann Daniel Borkmann David Brownell David Collins +David Heidelberg David Rheinsberg David Rheinsberg David Rheinsberg From 6d45e1c948a8b7ed6ceddb14319af69424db730c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 6 Aug 2024 13:46:47 -0400 Subject: [PATCH 0648/1052] padata: Fix possible divide-by-0 panic in padata_mt_helper() We are hit with a not easily reproducible divide-by-0 panic in padata.c at bootup time. [ 10.017908] Oops: divide error: 0000 1 PREEMPT SMP NOPTI [ 10.017908] CPU: 26 PID: 2627 Comm: kworker/u1666:1 Not tainted 6.10.0-15.el10.x86_64 #1 [ 10.017908] Hardware name: Lenovo ThinkSystem SR950 [7X12CTO1WW]/[7X12CTO1WW], BIOS [PSE140J-2.30] 07/20/2021 [ 10.017908] Workqueue: events_unbound padata_mt_helper [ 10.017908] RIP: 0010:padata_mt_helper+0x39/0xb0 : [ 10.017963] Call Trace: [ 10.017968] [ 10.018004] ? padata_mt_helper+0x39/0xb0 [ 10.018084] process_one_work+0x174/0x330 [ 10.018093] worker_thread+0x266/0x3a0 [ 10.018111] kthread+0xcf/0x100 [ 10.018124] ret_from_fork+0x31/0x50 [ 10.018138] ret_from_fork_asm+0x1a/0x30 [ 10.018147] Looking at the padata_mt_helper() function, the only way a divide-by-0 panic can happen is when ps->chunk_size is 0. The way that chunk_size is initialized in padata_do_multithreaded(), chunk_size can be 0 when the min_chunk in the passed-in padata_mt_job structure is 0. Fix this divide-by-0 panic by making sure that chunk_size will be at least 1 no matter what the input parameters are. Link: https://lkml.kernel.org/r/20240806174647.1050398-1-longman@redhat.com Fixes: 004ed42638f4 ("padata: add basic support for multithreaded jobs") Signed-off-by: Waiman Long Cc: Daniel Jordan Cc: Steffen Klassert Cc: Waiman Long Cc: Signed-off-by: Andrew Morton --- kernel/padata.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/padata.c b/kernel/padata.c index 53f4bc912712..0fa6c2895460 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -517,6 +517,13 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = max(ps.chunk_size, job->min_chunk); ps.chunk_size = roundup(ps.chunk_size, job->align); + /* + * chunk_size can be 0 if the caller sets min_chunk to 0. So force it + * to at least 1 to prevent divide-by-0 panic in padata_mt_helper().` + */ + if (!ps.chunk_size) + ps.chunk_size = 1U; + list_for_each_entry(pw, &works, pw_list) if (job->numa_aware) { int old_node = atomic_read(&last_used_nid); From d27a835f41d947f62e6a95e89ba523299c9e6437 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 5 Aug 2024 12:38:56 +0800 Subject: [PATCH 0649/1052] net/smc: add the max value of fallback reason count The number of fallback reasons defined in the smc_clc.h file has reached 36. For historical reasons, some are no longer quoted, and there's 33 actually in use. So, add the max value of fallback reason count to 36. Fixes: 6ac1e6563f59 ("net/smc: support smc v2.x features validate") Fixes: 7f0620b9940b ("net/smc: support max connections per lgr negotiation") Fixes: 69b888e3bb4b ("net/smc: support max links per lgr negotiation in clc handshake") Signed-off-by: Zhengchao Shao Reviewed-by: Wenjia Zhang Reviewed-by: D. 
Wythe Link: https://patch.msgid.link/20240805043856.565677-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- net/smc/smc_stats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index 9d32058db2b5..e19177ce4092 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -19,7 +19,7 @@ #include "smc_clc.h" -#define SMC_MAX_FBACK_RSN_CNT 30 +#define SMC_MAX_FBACK_RSN_CNT 36 enum { SMC_BUF_8K, From e3862093ee93fcfbdadcb7957f5f8974fffa806a Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Tue, 6 Aug 2024 10:13:27 +0900 Subject: [PATCH 0650/1052] net: dsa: bcm_sf2: Fix a possible memory leak in bcm_sf2_mdio_register() bcm_sf2_mdio_register() calls of_phy_find_device() and then phy_device_remove() in a loop to remove existing PHY devices. of_phy_find_device() eventually calls bus_find_device(), which calls get_device() on the returned struct device * to increment the refcount. The current implementation does not decrement the refcount, which causes a memory leak. This commit adds the missing phy_device_free() call to decrement the refcount via put_device() to balance the refcount. Fixes: 771089c2a485 ("net: dsa: bcm_sf2: Ensure that MDIO diversion is used") Signed-off-by: Joe Hattori Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20240806011327.3817861-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Jakub Kicinski --- drivers/net/dsa/bcm_sf2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index ed1e6560df25..0e663ec0c12a 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -675,8 +675,10 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds) of_remove_property(child, prop); phydev = of_phy_find_device(child); - if (phydev) + if (phydev) { phy_device_remove(phydev); + phy_device_free(phydev); + } } err = mdiobus_register(priv->user_mii_bus); From da03f5d1b2c319a2b74fe76edeadcd8fa5f44376 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 5 Aug 2024 22:37:42 -0700 Subject: [PATCH 0651/1052] bnxt_en : Fix memory out-of-bounds in bnxt_fill_hw_rss_tbl() A recent commit has modified the code in __bnxt_reserve_rings() to set the default RSS indirection table to default only when the number of RX rings is changing. While this works for newer firmware that requires RX ring reservations, it causes the regression on older firmware not requiring RX ring reservations (BNXT_NEW_RM() returns false). With older firmware, RX ring reservations are not required and so hw_resc->resv_rx_rings is not always set to the proper value. The comparison: if (old_rx_rings != bp->hw_resc.resv_rx_rings) in __bnxt_reserve_rings() may be false even when the RX rings are changing. This will cause __bnxt_reserve_rings() to skip setting the default RSS indirection table to default to match the current number of RX rings. This may later cause bnxt_fill_hw_rss_tbl() to use an out-of-range index. We already have bnxt_check_rss_tbl_no_rmgr() to handle exactly this scenario. We just need to move it up in bnxt_need_reserve_rings() to be called unconditionally when using older firmware. Without the fix, if the TX rings are changing, we'll skip the bnxt_check_rss_tbl_no_rmgr() call and __bnxt_reserve_rings() may also skip the bnxt_set_dflt_rss_indir_tbl() call for the reason explained in the last paragraph.
Without setting the default RSS indirection table to default, it causes the regression: BUG: KASAN: slab-out-of-bounds in __bnxt_hwrm_vnic_set_rss+0xb79/0xe40 Read of size 2 at addr ffff8881c5809618 by task ethtool/31525 Call Trace: __bnxt_hwrm_vnic_set_rss+0xb79/0xe40 bnxt_hwrm_vnic_rss_cfg_p5+0xf7/0x460 __bnxt_setup_vnic_p5+0x12e/0x270 __bnxt_open_nic+0x2262/0x2f30 bnxt_open_nic+0x5d/0xf0 ethnl_set_channels+0x5d4/0xb30 ethnl_default_set_doit+0x2f1/0x620 Reported-by: Breno Leitao Closes: https://lore.kernel.org/netdev/ZrC6jpghA3PWVWSB@gmail.com/ Fixes: 98ba1d931f61 ("bnxt_en: Fix RSS logic in __bnxt_reserve_rings()") Reviewed-by: Pavan Chebbi Reviewed-by: Kalesh AP Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Tested-by: Breno Leitao Link: https://patch.msgid.link/20240806053742.140304-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 23f74c6c88b9..e27e1082ee33 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7591,19 +7591,20 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp) int rx = bp->rx_nr_rings, stat; int vnic, grp = rx; - if (hw_resc->resv_tx_rings != bp->tx_nr_rings && - bp->hwrm_spec_code >= 0x10601) - return true; - /* Old firmware does not need RX ring reservations but we still * need to setup a default RSS map when needed. With new firmware * we go through RX ring reservations first and then set up the * RSS map for the successfully reserved RX rings when needed. */ - if (!BNXT_NEW_RM(bp)) { + if (!BNXT_NEW_RM(bp)) bnxt_check_rss_tbl_no_rmgr(bp); + + if (hw_resc->resv_tx_rings != bp->tx_nr_rings && + bp->hwrm_spec_code >= 0x10601) + return true; + + if (!BNXT_NEW_RM(bp)) return false; - } vnic = bnxt_get_total_vnics(bp, rx); From f01032a2ca099ec8d619aaa916c3762aa62495df Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 6 Aug 2024 15:09:20 -0700 Subject: [PATCH 0652/1052] idpf: fix memory leaks and crashes while performing a soft reset The second tagged commit introduced a UAF, as it removed restoring q_vector->vport pointers after reinitializating the structures. This is due to that all queue allocation functions are performed here with the new temporary vport structure and those functions rewrite the backpointers to the vport. Then, this new struct is freed and the pointers start leading to nowhere. But generally speaking, the current logic is very fragile. It claims to be more reliable when the system is low on memory, but in fact, it consumes two times more memory as at the moment of running this function, there are two vports allocated with their queues and vectors. Moreover, it claims to prevent the driver from running into "bad state", but in fact, any error during the rebuild leaves the old vport in the partially allocated state. Finally, if the interface is down when the function is called, it always allocates a new queue set, but when the user decides to enable the interface later on, vport_open() allocates them once again, IOW there's a clear memory leak here. Just don't allocate a new queue set when performing a reset, that solves crashes and memory leaks. Readd the old queue number and reopen the interface on rollback - that solves limbo states when the device is left disabled and/or without HW queues enabled. 
Fixes: 02cbfba1add5 ("idpf: add ethtool callbacks") Fixes: e4891e4687c8 ("idpf: split &idpf_queue into 4 strictly-typed queue structures") Signed-off-by: Alexander Lobakin Reviewed-by: Simon Horman Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20240806220923.3359860-2-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index 5dbf2b4ba1b0..10b884dd3475 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -1335,9 +1335,8 @@ static void idpf_rx_init_buf_tail(struct idpf_vport *vport) /** * idpf_vport_open - Bring up a vport * @vport: vport to bring up - * @alloc_res: allocate queue resources */ -static int idpf_vport_open(struct idpf_vport *vport, bool alloc_res) +static int idpf_vport_open(struct idpf_vport *vport) { struct idpf_netdev_priv *np = netdev_priv(vport->netdev); struct idpf_adapter *adapter = vport->adapter; @@ -1350,11 +1349,9 @@ static int idpf_vport_open(struct idpf_vport *vport, bool alloc_res) /* we do not allow interface up just yet */ netif_carrier_off(vport->netdev); - if (alloc_res) { - err = idpf_vport_queues_alloc(vport); - if (err) - return err; - } + err = idpf_vport_queues_alloc(vport); + if (err) + return err; err = idpf_vport_intr_alloc(vport); if (err) { @@ -1539,7 +1536,7 @@ void idpf_init_task(struct work_struct *work) np = netdev_priv(vport->netdev); np->state = __IDPF_VPORT_DOWN; if (test_and_clear_bit(IDPF_VPORT_UP_REQUESTED, vport_config->flags)) - idpf_vport_open(vport, true); + idpf_vport_open(vport); /* Spawn and return 'idpf_init_task' work queue until all the * default vports are created @@ -1898,9 +1895,6 @@ int idpf_initiate_soft_reset(struct idpf_vport *vport, goto free_vport; } - err = idpf_vport_queues_alloc(new_vport); - if (err) - goto free_vport; if (current_state <= __IDPF_VPORT_DOWN) { idpf_send_delete_queues_msg(vport); } else { @@ -1932,17 +1926,23 @@ int idpf_initiate_soft_reset(struct idpf_vport *vport, err = idpf_set_real_num_queues(vport); if (err) - goto err_reset; + goto err_open; if (current_state == __IDPF_VPORT_UP) - err = idpf_vport_open(vport, false); + err = idpf_vport_open(vport); kfree(new_vport); return err; err_reset: - idpf_vport_queues_rel(new_vport); + idpf_send_add_queues_msg(vport, vport->num_txq, vport->num_complq, + vport->num_rxq, vport->num_bufq); + +err_open: + if (current_state == __IDPF_VPORT_UP) + idpf_vport_open(vport); + free_vport: kfree(new_vport); @@ -2171,7 +2171,7 @@ static int idpf_open(struct net_device *netdev) idpf_vport_ctrl_lock(netdev); vport = idpf_netdev_to_vport(netdev); - err = idpf_vport_open(vport, true); + err = idpf_vport_open(vport); idpf_vport_ctrl_unlock(netdev); From 3cc88e8405b8d55e0ff035e31971aadd6baee2b6 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 6 Aug 2024 15:09:21 -0700 Subject: [PATCH 0653/1052] idpf: fix memleak in vport interrupt configuration The initialization of vport interrupt consists of two functions: 1) idpf_vport_intr_init() where a generic configuration is done 2) idpf_vport_intr_req_irq() where the irq for each q_vector is requested. The first function used to create a base name for each interrupt using "kasprintf()" call. Unfortunately, although that call allocated memory for a text buffer, that memory was never released. 
Fix this by removing creating the interrupt base name in 1). Instead, always create a full interrupt name in the function 2), because there is no need to create a base name separately, considering that the function 2) is never called out of idpf_vport_intr_init() context. Fixes: d4d558718266 ("idpf: initialize interrupts and enable vport") Cc: stable@vger.kernel.org # 6.7 Signed-off-by: Michal Kubiak Reviewed-by: Pavan Kumar Linga Signed-off-by: Alexander Lobakin Reviewed-by: Simon Horman Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20240806220923.3359860-3-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index af2879f03b8d..a2f9f252694a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3780,13 +3780,15 @@ void idpf_vport_intr_update_itr_ena_irq(struct idpf_q_vector *q_vector) /** * idpf_vport_intr_req_irq - get MSI-X vectors from the OS for the vport * @vport: main vport structure - * @basename: name for the vector */ -static int idpf_vport_intr_req_irq(struct idpf_vport *vport, char *basename) +static int idpf_vport_intr_req_irq(struct idpf_vport *vport) { struct idpf_adapter *adapter = vport->adapter; + const char *drv_name, *if_name, *vec_name; int vector, err, irq_num, vidx; - const char *vec_name; + + drv_name = dev_driver_string(&adapter->pdev->dev); + if_name = netdev_name(vport->netdev); for (vector = 0; vector < vport->num_q_vectors; vector++) { struct idpf_q_vector *q_vector = &vport->q_vectors[vector]; @@ -3804,8 +3806,8 @@ static int idpf_vport_intr_req_irq(struct idpf_vport *vport, char *basename) else continue; - name = kasprintf(GFP_KERNEL, "%s-%s-%d", basename, vec_name, - vidx); + name = kasprintf(GFP_KERNEL, "%s-%s-%s-%d", drv_name, if_name, + vec_name, vidx); err = request_irq(irq_num, idpf_vport_intr_clean_queues, 0, name, q_vector); @@ -4326,7 +4328,6 @@ int idpf_vport_intr_alloc(struct idpf_vport *vport) */ int idpf_vport_intr_init(struct idpf_vport *vport) { - char *int_name; int err; err = idpf_vport_intr_init_vec_idx(vport); @@ -4340,11 +4341,7 @@ int idpf_vport_intr_init(struct idpf_vport *vport) if (err) goto unroll_vectors_alloc; - int_name = kasprintf(GFP_KERNEL, "%s-%s", - dev_driver_string(&vport->adapter->pdev->dev), - vport->netdev->name); - - err = idpf_vport_intr_req_irq(vport, int_name); + err = idpf_vport_intr_req_irq(vport); if (err) goto unroll_vectors_alloc; From 290f1c033281c1a502a3cd1c53c3a549259c491f Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 6 Aug 2024 15:09:22 -0700 Subject: [PATCH 0654/1052] idpf: fix UAFs when destroying the queues The second tagged commit started sometimes (very rarely, but possible) throwing WARNs from net/core/page_pool.c:page_pool_disable_direct_recycling(). Turned out idpf frees interrupt vectors with embedded NAPIs *before* freeing the queues making page_pools' NAPI pointers lead to freed memory before these pools are destroyed by libeth. It's not clear whether there are other accesses to the freed vectors when destroying the queues, but anyway, we usually free queue/interrupt vectors only when the queues are destroyed and the NAPIs are guaranteed to not be referenced anywhere. 
Invert the allocation and freeing logic making queue/interrupt vectors be allocated first and freed last. Vectors don't require queues to be present, so this is safe. Additionally, this change allows to remove that useless queue->q_vector pointer cleanup, as vectors are still valid when freeing the queues (+ both are freed within one function, so it's not clear why nullify the pointers at all). Fixes: 1c325aac10a8 ("idpf: configure resources for TX queues") Fixes: 90912f9f4f2d ("idpf: convert header split mode to libeth + napi_build_skb()") Reported-by: Michal Kubiak Signed-off-by: Alexander Lobakin Reviewed-by: Simon Horman Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20240806220923.3359860-4-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 24 ++++++++++----------- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 24 +-------------------- 2 files changed, 13 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index 10b884dd3475..0b6c8fd5bc90 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -900,8 +900,8 @@ static void idpf_vport_stop(struct idpf_vport *vport) vport->link_up = false; idpf_vport_intr_deinit(vport); - idpf_vport_intr_rel(vport); idpf_vport_queues_rel(vport); + idpf_vport_intr_rel(vport); np->state = __IDPF_VPORT_DOWN; } @@ -1349,43 +1349,43 @@ static int idpf_vport_open(struct idpf_vport *vport) /* we do not allow interface up just yet */ netif_carrier_off(vport->netdev); - err = idpf_vport_queues_alloc(vport); - if (err) - return err; - err = idpf_vport_intr_alloc(vport); if (err) { dev_err(&adapter->pdev->dev, "Failed to allocate interrupts for vport %u: %d\n", vport->vport_id, err); - goto queues_rel; + return err; } + err = idpf_vport_queues_alloc(vport); + if (err) + goto intr_rel; + err = idpf_vport_queue_ids_init(vport); if (err) { dev_err(&adapter->pdev->dev, "Failed to initialize queue ids for vport %u: %d\n", vport->vport_id, err); - goto intr_rel; + goto queues_rel; } err = idpf_vport_intr_init(vport); if (err) { dev_err(&adapter->pdev->dev, "Failed to initialize interrupts for vport %u: %d\n", vport->vport_id, err); - goto intr_rel; + goto queues_rel; } err = idpf_rx_bufs_init_all(vport); if (err) { dev_err(&adapter->pdev->dev, "Failed to initialize RX buffers for vport %u: %d\n", vport->vport_id, err); - goto intr_rel; + goto queues_rel; } err = idpf_queue_reg_init(vport); if (err) { dev_err(&adapter->pdev->dev, "Failed to initialize queue registers for vport %u: %d\n", vport->vport_id, err); - goto intr_rel; + goto queues_rel; } idpf_rx_init_buf_tail(vport); @@ -1452,10 +1452,10 @@ static int idpf_vport_open(struct idpf_vport *vport) idpf_send_map_unmap_queue_vector_msg(vport, false); intr_deinit: idpf_vport_intr_deinit(vport); -intr_rel: - idpf_vport_intr_rel(vport); queues_rel: idpf_vport_queues_rel(vport); +intr_rel: + idpf_vport_intr_rel(vport); return err; } diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index a2f9f252694a..585c3dadd9bf 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3576,9 +3576,7 @@ static void idpf_vport_intr_napi_dis_all(struct idpf_vport *vport) */ void idpf_vport_intr_rel(struct idpf_vport *vport) { - int i, j, v_idx; - - for (v_idx = 0; v_idx < vport->num_q_vectors; v_idx++) { + for (u32 
v_idx = 0; v_idx < vport->num_q_vectors; v_idx++) { struct idpf_q_vector *q_vector = &vport->q_vectors[v_idx]; kfree(q_vector->complq); @@ -3593,26 +3591,6 @@ void idpf_vport_intr_rel(struct idpf_vport *vport) free_cpumask_var(q_vector->affinity_mask); } - /* Clean up the mapping of queues to vectors */ - for (i = 0; i < vport->num_rxq_grp; i++) { - struct idpf_rxq_group *rx_qgrp = &vport->rxq_grps[i]; - - if (idpf_is_queue_model_split(vport->rxq_model)) - for (j = 0; j < rx_qgrp->splitq.num_rxq_sets; j++) - rx_qgrp->splitq.rxq_sets[j]->rxq.q_vector = NULL; - else - for (j = 0; j < rx_qgrp->singleq.num_rxq; j++) - rx_qgrp->singleq.rxqs[j]->q_vector = NULL; - } - - if (idpf_is_queue_model_split(vport->txq_model)) - for (i = 0; i < vport->num_txq_grp; i++) - vport->txq_grps[i].complq->q_vector = NULL; - else - for (i = 0; i < vport->num_txq_grp; i++) - for (j = 0; j < vport->txq_grps[i].num_txq; j++) - vport->txq_grps[i].txqs[j]->q_vector = NULL; - kfree(vport->q_vectors); vport->q_vectors = NULL; } From 74b0666f97f9455bc799405b7874df62fcb66bae Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 6 Aug 2024 13:35:33 +0200 Subject: [PATCH 0655/1052] i2c: testunit: match HostNotify test name with docs Ensure the test has the same name in the code as it has in the docs. Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-testunit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/i2c-slave-testunit.c b/drivers/i2c/i2c-slave-testunit.c index 4e03b75f9ad7..4c550306f3ec 100644 --- a/drivers/i2c/i2c-slave-testunit.c +++ b/drivers/i2c/i2c-slave-testunit.c @@ -18,7 +18,7 @@ enum testunit_cmds { TU_CMD_READ_BYTES = 1, /* save 0 for ABORT, RESET or similar */ - TU_CMD_HOST_NOTIFY, + TU_CMD_SMBUS_HOST_NOTIFY, TU_CMD_SMBUS_BLOCK_PROC_CALL, TU_NUM_CMDS }; @@ -60,7 +60,7 @@ static void i2c_slave_testunit_work(struct work_struct *work) msg.len = tu->regs[TU_REG_DATAH]; break; - case TU_CMD_HOST_NOTIFY: + case TU_CMD_SMBUS_HOST_NOTIFY: msg.addr = 0x08; msg.flags = 0; msg.len = 3; From 03898691d42e0170e7d00f07cbe21ce0e9f3a8fa Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 8 Aug 2024 10:18:01 +0200 Subject: [PATCH 0656/1052] ALSA: usb-audio: Re-add ScratchAmp quirk entries At the code refactoring of USB-audio quirk handling, I assumed that the quirk entries of Stanton ScratchAmp devices were only about the device name, and moved them completely into the rename table. But it seems that the device requires the quirk entry so that it's probed by the driver itself. This re-adds back the quirk entries of ScratchAmp, but in a minimalistic manner. Fixes: 5436f59bc5bc ("ALSA: usb-audio: Move device rename and profile quirks to an internal table") Link: https://patch.msgid.link/20240808081803.22300-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/quirks-table.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 73abc38a5400..f13a8d63a019 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -2594,6 +2594,10 @@ YAMAHA_DEVICE(0x7010, "UB99"), } }, +/* Stanton ScratchAmp */ +{ USB_DEVICE(0x103d, 0x0100) }, +{ USB_DEVICE(0x103d, 0x0101) }, + /* Novation EMS devices */ { USB_DEVICE_VENDOR_SPEC(0x1235, 0x0001), From 06ce0af34177a110d6a5cf71f924965b9b230691 Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Wed, 29 May 2024 00:11:23 +0100 Subject: [PATCH 0657/1052] soc: fsl: qbman: remove unused struct 'cgr_comp' 'cgr_comp' has been unused since commit 96f413f47677 ("soc/fsl/qbman: fix issue in qman_delete_cgr_safe()"). Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Sean Anderson Signed-off-by: Michael Ellerman Link: https://msgid.link/20240528231123.136664-1-linux@treblig.org --- drivers/soc/fsl/qbman/qman.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c index 7e9074519ad2..4dc8aba33d9b 100644 --- a/drivers/soc/fsl/qbman/qman.c +++ b/drivers/soc/fsl/qbman/qman.c @@ -2546,11 +2546,6 @@ int qman_delete_cgr(struct qman_cgr *cgr) } EXPORT_SYMBOL(qman_delete_cgr); -struct cgr_comp { - struct qman_cgr *cgr; - struct completion completion; -}; - static void qman_delete_cgr_smp_call(void *p) { qman_delete_cgr((struct qman_cgr *)p); From 78296429e20052b029211b0aca64aadc5052d581 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 30 Jul 2024 19:53:16 +0530 Subject: [PATCH 0658/1052] platform/x86/amd/pmf: Fix to Update HPD Data When ALS is Disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the Ambient Light Sensor (ALS) is disabled, the current code in the PMF driver does not query for Human Presence Detection (HPD) data in amd_pmf_get_sensor_info(). As a result, stale HPD data is used by PMF-TA to evaluate policy conditions, leading to unexpected behavior in the policy output actions. To resolve this issue, modify the PMF driver to query HPD data independently of ALS. Since user_present is a boolean, modify the current code to return true if the user is present and false if the user is away or if the sensor is not detected, and report this status to the PMF TA firmware accordingly. With this change, amd_pmf_get_sensor_info() now returns void instead of int. 
Fixes: cedecdba60f4 ("platform/x86/amd/pmf: Get ambient light information from AMD SFH driver") Co-developed-by: Patil Rajesh Reddy Signed-off-by: Patil Rajesh Reddy Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20240730142316.3846259-1-Shyam-sundar.S-k@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/spc.c | 32 ++++++++++-------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/spc.c b/drivers/platform/x86/amd/pmf/spc.c index a3dec14c3004..3c153fb1425e 100644 --- a/drivers/platform/x86/amd/pmf/spc.c +++ b/drivers/platform/x86/amd/pmf/spc.c @@ -150,36 +150,26 @@ static int amd_pmf_get_slider_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_ return 0; } -static int amd_pmf_get_sensor_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) +static void amd_pmf_get_sensor_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) { struct amd_sfh_info sfh_info; - int ret; + + /* Get the latest information from SFH */ + in->ev_info.user_present = false; /* Get ALS data */ - ret = amd_get_sfh_info(&sfh_info, MT_ALS); - if (!ret) + if (!amd_get_sfh_info(&sfh_info, MT_ALS)) in->ev_info.ambient_light = sfh_info.ambient_light; else - return ret; + dev_dbg(dev->dev, "ALS is not enabled/detected\n"); /* get HPD data */ - ret = amd_get_sfh_info(&sfh_info, MT_HPD); - if (ret) - return ret; - - switch (sfh_info.user_present) { - case SFH_NOT_DETECTED: - in->ev_info.user_present = 0xff; /* assume no sensors connected */ - break; - case SFH_USER_PRESENT: - in->ev_info.user_present = 1; - break; - case SFH_USER_AWAY: - in->ev_info.user_present = 0; - break; + if (!amd_get_sfh_info(&sfh_info, MT_HPD)) { + if (sfh_info.user_present == SFH_USER_PRESENT) + in->ev_info.user_present = true; + } else { + dev_dbg(dev->dev, "HPD is not enabled/detected\n"); } - - return 0; } void amd_pmf_populate_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) From 613e3900c24bb1379d994f44d75d31c3223cc263 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Thu, 25 Jul 2024 11:21:07 +0200 Subject: [PATCH 0659/1052] platform/x86: ideapad-laptop: introduce a generic notification chain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several cases where a notification chain can simplify Lenovo WMI drivers. Add a generic notification chain into ideapad-laptop. 
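The mechanism is the stock notifier-chain pattern; a minimal standalone C model of the idea follows (the kernel version wraps this in a blocking_notifier_head with proper locking and exports the helpers in the IDEAPAD_LAPTOP namespace, but the control flow is the same):

#include <stdio.h>

#define MAX_NOTIFIERS 8

typedef int (*notifier_fn)(unsigned long action, void *data);

static notifier_fn chain[MAX_NOTIFIERS];

static int chain_register(notifier_fn fn)
{
        for (int i = 0; i < MAX_NOTIFIERS; i++) {
                if (!chain[i]) {
                        chain[i] = fn;
                        return 0;
                }
        }
        return -1;
}

static void chain_call(unsigned long action, void *data)
{
        /* every registered listener sees every event, in order */
        for (int i = 0; i < MAX_NOTIFIERS; i++)
                if (chain[i])
                        chain[i](action, data);
}

static int listener(unsigned long action, void *data)
{
        printf("listener saw action %lu\n", action);
        return 0;
}

int main(void)
{
        chain_register(listener);
        chain_call(1, NULL);    /* e.g. another Lenovo WMI driver reporting an event */
        return 0;
}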
Signed-off-by: Gergo Koteles Link: https://lore.kernel.org/r/c5a43efae8a32bd034c3d19c0a686941347575a7.1721898747.git.soyer@irl.hu Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/ideapad-laptop.c | 37 +++++++++++++++++++++++++++ drivers/platform/x86/ideapad-laptop.h | 5 ++++ 2 files changed, 42 insertions(+) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 1ace711f7442..866b32bfe2c9 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -1592,6 +1592,39 @@ static void ideapad_sync_touchpad_state(struct ideapad_private *priv, bool send_ priv->r_touchpad_val = value; } +static int ideapad_laptop_nb_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + switch (action) { + } + + return 0; +} + +static struct notifier_block ideapad_laptop_notifier = { + .notifier_call = ideapad_laptop_nb_notify, +}; + +static BLOCKING_NOTIFIER_HEAD(ideapad_laptop_chain_head); + +int ideapad_laptop_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&ideapad_laptop_chain_head, nb); +} +EXPORT_SYMBOL_NS_GPL(ideapad_laptop_register_notifier, IDEAPAD_LAPTOP); + +int ideapad_laptop_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&ideapad_laptop_chain_head, nb); +} +EXPORT_SYMBOL_NS_GPL(ideapad_laptop_unregister_notifier, IDEAPAD_LAPTOP); + +void ideapad_laptop_call_notifier(unsigned long action, void *data) +{ + blocking_notifier_call_chain(&ideapad_laptop_chain_head, action, data); +} +EXPORT_SYMBOL_NS_GPL(ideapad_laptop_call_notifier, IDEAPAD_LAPTOP); + static void ideapad_acpi_notify(acpi_handle handle, u32 event, void *data) { struct ideapad_private *priv = data; @@ -1974,6 +2007,8 @@ static int ideapad_acpi_add(struct platform_device *pdev) if (err) goto shared_init_failed; + ideapad_laptop_register_notifier(&ideapad_laptop_notifier); + return 0; shared_init_failed: @@ -2006,6 +2041,8 @@ static void ideapad_acpi_remove(struct platform_device *pdev) struct ideapad_private *priv = dev_get_drvdata(&pdev->dev); int i; + ideapad_laptop_unregister_notifier(&ideapad_laptop_notifier); + ideapad_shared_exit(priv); acpi_remove_notify_handler(priv->adev->handle, diff --git a/drivers/platform/x86/ideapad-laptop.h b/drivers/platform/x86/ideapad-laptop.h index 4498a96de597..3eb0dcd6bf7b 100644 --- a/drivers/platform/x86/ideapad-laptop.h +++ b/drivers/platform/x86/ideapad-laptop.h @@ -12,6 +12,11 @@ #include #include #include +#include + +int ideapad_laptop_register_notifier(struct notifier_block *nb); +int ideapad_laptop_unregister_notifier(struct notifier_block *nb); +void ideapad_laptop_call_notifier(unsigned long action, void *data); enum { VPCCMD_R_VPC1 = 0x10, From cde7886b35176d56e72bfc68dc104fa08e7b072c Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Thu, 25 Jul 2024 11:21:08 +0200 Subject: [PATCH 0660/1052] platform/x86: ideapad-laptop: move ymc_trigger_ec from lenovo-ymc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some models need to trigger the EC after each YMC event for the yoga mode control to work properly. EC triggering consist of a VPC call from the lenovo-ymc module. Except for this, all VPC calls are in the ideapad-laptop module. Since ideapad-laptop has a notification chain, a new YMC_EVENT action can be added and triggered from the lenovo-ymc module. Then the ideapad-laptop can trigger the EC. 
If the triggering is in the ideapad-laptop module, then the ec_trigger module parameter should be there as well. Move the ymc_trigger_ec functionality and the ec_trigger module parameter to the ideapad-laptop module. Signed-off-by: Gergo Koteles Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/d980ab3ac32b5e554f456b0ff17279bfdbe2a203.1721898747.git.soyer@irl.hu Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/Kconfig | 1 + drivers/platform/x86/ideapad-laptop.c | 49 ++++++++++++++++++++++ drivers/platform/x86/ideapad-laptop.h | 4 ++ drivers/platform/x86/lenovo-ymc.c | 60 +-------------------------- 4 files changed, 56 insertions(+), 58 deletions(-) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 665fa9524986..ddfccc226751 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -477,6 +477,7 @@ config LENOVO_YMC tristate "Lenovo Yoga Tablet Mode Control" depends on ACPI_WMI depends on INPUT + depends on IDEAPAD_LAPTOP select INPUT_SPARSEKMAP help This driver maps the Tablet Mode Control switch to SW_TABLET_MODE input diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 866b32bfe2c9..9fc1bb990e47 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -146,6 +146,7 @@ struct ideapad_private { bool touchpad_ctrl_via_ec : 1; bool ctrl_ps2_aux_port : 1; bool usb_charging : 1; + bool ymc_ec_trigger : 1; } features; struct { bool initialized; @@ -194,6 +195,12 @@ MODULE_PARM_DESC(touchpad_ctrl_via_ec, "Enable registering a 'touchpad' sysfs-attribute which can be used to manually " "tell the EC to enable/disable the touchpad. This may not work on all models."); +static bool ymc_ec_trigger __read_mostly; +module_param(ymc_ec_trigger, bool, 0444); +MODULE_PARM_DESC(ymc_ec_trigger, + "Enable EC triggering work-around to force emitting tablet mode events. 
" + "If you need this please report this to: platform-driver-x86@vger.kernel.org"); + /* * shared data */ @@ -1592,10 +1599,50 @@ static void ideapad_sync_touchpad_state(struct ideapad_private *priv, bool send_ priv->r_touchpad_val = value; } +static const struct dmi_system_id ymc_ec_trigger_quirk_dmi_table[] = { + { + /* Lenovo Yoga 7 14ARB7 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "82QF"), + }, + }, + { + /* Lenovo Yoga 7 14ACN6 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "82N7"), + }, + }, + { } +}; + +static void ideapad_laptop_trigger_ec(void) +{ + struct ideapad_private *priv; + int ret; + + guard(mutex)(&ideapad_shared_mutex); + + priv = ideapad_shared; + if (!priv) + return; + + if (!priv->features.ymc_ec_trigger) + return; + + ret = write_ec_cmd(priv->adev->handle, VPCCMD_W_YMC, 1); + if (ret) + dev_warn(&priv->platform_device->dev, "Could not write YMC: %d\n", ret); +} + static int ideapad_laptop_nb_notify(struct notifier_block *nb, unsigned long action, void *data) { switch (action) { + case IDEAPAD_LAPTOP_YMC_EVENT: + ideapad_laptop_trigger_ec(); + break; } return 0; @@ -1761,6 +1808,8 @@ static void ideapad_check_features(struct ideapad_private *priv) priv->features.ctrl_ps2_aux_port = ctrl_ps2_aux_port || dmi_check_system(ctrl_ps2_aux_port_list); priv->features.touchpad_ctrl_via_ec = touchpad_ctrl_via_ec; + priv->features.ymc_ec_trigger = + ymc_ec_trigger || dmi_check_system(ymc_ec_trigger_quirk_dmi_table); if (!read_ec_data(handle, VPCCMD_R_FAN, &val)) priv->features.fan_mode = true; diff --git a/drivers/platform/x86/ideapad-laptop.h b/drivers/platform/x86/ideapad-laptop.h index 3eb0dcd6bf7b..948cc61800a9 100644 --- a/drivers/platform/x86/ideapad-laptop.h +++ b/drivers/platform/x86/ideapad-laptop.h @@ -14,6 +14,10 @@ #include #include +enum ideapad_laptop_notifier_actions { + IDEAPAD_LAPTOP_YMC_EVENT, +}; + int ideapad_laptop_register_notifier(struct notifier_block *nb); int ideapad_laptop_unregister_notifier(struct notifier_block *nb); void ideapad_laptop_call_notifier(unsigned long action, void *data); diff --git a/drivers/platform/x86/lenovo-ymc.c b/drivers/platform/x86/lenovo-ymc.c index e1fbc35504d4..e0bbd6a14a89 100644 --- a/drivers/platform/x86/lenovo-ymc.c +++ b/drivers/platform/x86/lenovo-ymc.c @@ -20,32 +20,10 @@ #define LENOVO_YMC_QUERY_INSTANCE 0 #define LENOVO_YMC_QUERY_METHOD 0x01 -static bool ec_trigger __read_mostly; -module_param(ec_trigger, bool, 0444); -MODULE_PARM_DESC(ec_trigger, "Enable EC triggering work-around to force emitting tablet mode events"); - static bool force; module_param(force, bool, 0444); MODULE_PARM_DESC(force, "Force loading on boards without a convertible DMI chassis-type"); -static const struct dmi_system_id ec_trigger_quirk_dmi_table[] = { - { - /* Lenovo Yoga 7 14ARB7 */ - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_MATCH(DMI_PRODUCT_NAME, "82QF"), - }, - }, - { - /* Lenovo Yoga 7 14ACN6 */ - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_MATCH(DMI_PRODUCT_NAME, "82N7"), - }, - }, - { } -}; - static const struct dmi_system_id allowed_chasis_types_dmi_table[] = { { .matches = { @@ -62,21 +40,8 @@ static const struct dmi_system_id allowed_chasis_types_dmi_table[] = { struct lenovo_ymc_private { struct input_dev *input_dev; - struct acpi_device *ec_acpi_dev; }; -static void lenovo_ymc_trigger_ec(struct wmi_device *wdev, struct lenovo_ymc_private *priv) -{ - int err; - - if (!priv->ec_acpi_dev) - return; - - err = 
write_ec_cmd(priv->ec_acpi_dev->handle, VPCCMD_W_YMC, 1); - if (err) - dev_warn(&wdev->dev, "Could not write YMC: %d\n", err); -} - static const struct key_entry lenovo_ymc_keymap[] = { /* Laptop */ { KE_SW, 0x01, { .sw = { SW_TABLET_MODE, 0 } } }, @@ -125,11 +90,9 @@ static void lenovo_ymc_notify(struct wmi_device *wdev, union acpi_object *data) free_obj: kfree(obj); - lenovo_ymc_trigger_ec(wdev, priv); + ideapad_laptop_call_notifier(IDEAPAD_LAPTOP_YMC_EVENT, &code); } -static void acpi_dev_put_helper(void *p) { acpi_dev_put(p); } - static int lenovo_ymc_probe(struct wmi_device *wdev, const void *ctx) { struct lenovo_ymc_private *priv; @@ -143,29 +106,10 @@ static int lenovo_ymc_probe(struct wmi_device *wdev, const void *ctx) return -ENODEV; } - ec_trigger |= dmi_check_system(ec_trigger_quirk_dmi_table); - priv = devm_kzalloc(&wdev->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; - if (ec_trigger) { - pr_debug("Lenovo YMC enable EC triggering.\n"); - priv->ec_acpi_dev = acpi_dev_get_first_match_dev("VPC2004", NULL, -1); - - if (!priv->ec_acpi_dev) { - dev_err(&wdev->dev, "Could not find EC ACPI device.\n"); - return -ENODEV; - } - err = devm_add_action_or_reset(&wdev->dev, - acpi_dev_put_helper, priv->ec_acpi_dev); - if (err) { - dev_err(&wdev->dev, - "Could not clean up EC ACPI device: %d\n", err); - return err; - } - } - input_dev = devm_input_allocate_device(&wdev->dev); if (!input_dev) return -ENOMEM; @@ -192,7 +136,6 @@ static int lenovo_ymc_probe(struct wmi_device *wdev, const void *ctx) dev_set_drvdata(&wdev->dev, priv); /* Report the state for the first time on probe */ - lenovo_ymc_trigger_ec(wdev, priv); lenovo_ymc_notify(wdev, NULL); return 0; } @@ -217,3 +160,4 @@ module_wmi_driver(lenovo_ymc_driver); MODULE_AUTHOR("Gergo Koteles "); MODULE_DESCRIPTION("Lenovo Yoga Mode Control driver"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(IDEAPAD_LAPTOP); From 7cc06e729460a209b84d3db4db56c9f85f048cc2 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Thu, 25 Jul 2024 11:21:10 +0200 Subject: [PATCH 0661/1052] platform/x86: ideapad-laptop: add a mutex to synchronize VPC commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling VPC commands consists of several VPCW and VPCR ACPI calls. These calls and their results can get mixed up if they are called simultaneously from different threads, like acpi notify handler, sysfs, debugfs, notification chain. The commit e2ffcda16290 ("ACPI: OSL: Allow Notify () handlers to run on all CPUs") made the race issues much worse than before it but some races were possible even before that commit. Add a mutex to synchronize VPC commands. 
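The guard(mutex)(...) and scoped_guard(...) helpers used below come from the kernel's <linux/cleanup.h> and drop the lock automatically at end of scope, so none of the early returns in the converted paths can leak the mutex. Outside the kernel the same idea can be sketched with the compiler's cleanup attribute (GCC/Clang); the macro below is only an approximation of what cleanup.h generates, with placeholder names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t vpc_mutex = PTHREAD_MUTEX_INITIALIZER;

static void unlock_cleanup(pthread_mutex_t **m)
{
        pthread_mutex_unlock(*m);
}

#define CONCAT_(a, b) a##b
#define CONCAT(a, b) CONCAT_(a, b)
/* lock now, unlock automatically when the guard variable goes out of scope */
#define GUARD(m)                                                        \
        pthread_mutex_t *CONCAT(guard_, __LINE__)                       \
                __attribute__((cleanup(unlock_cleanup), unused)) =      \
                (pthread_mutex_lock(m), (m))

static int read_vpc(int reg)
{
        GUARD(&vpc_mutex);      /* the whole VPCW/VPCR sequence is serialized */

        if (reg < 0)
                return -1;      /* early return still unlocks */

        return reg * 2;         /* stand-in for the read-back value */
}

int main(void)
{
        printf("%d\n", read_vpc(21));
        return 0;
}

Compile with -pthread; the point is only that lock and unlock stay paired no matter which exit path a converted function takes.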
Fixes: e2ffcda16290 ("ACPI: OSL: Allow Notify () handlers to run on all CPUs") Fixes: e82882cdd241 ("platform/x86: Add driver for Yoga Tablet Mode switch") Signed-off-by: Gergo Koteles Link: https://lore.kernel.org/r/f26782fa1194ad11ed5d9ba121a804e59b58b026.1721898747.git.soyer@irl.hu Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/ideapad-laptop.c | 64 ++++++++++++++++++++------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 9fc1bb990e47..98ec30fce9fd 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -126,6 +126,7 @@ struct ideapad_rfk_priv { struct ideapad_private { struct acpi_device *adev; + struct mutex vpc_mutex; /* protects the VPC calls */ struct rfkill *rfk[IDEAPAD_RFKILL_DEV_NUM]; struct ideapad_rfk_priv rfk_priv[IDEAPAD_RFKILL_DEV_NUM]; struct platform_device *platform_device; @@ -301,6 +302,8 @@ static int debugfs_status_show(struct seq_file *s, void *data) struct ideapad_private *priv = s->private; unsigned long value; + guard(mutex)(&priv->vpc_mutex); + if (!read_ec_data(priv->adev->handle, VPCCMD_R_BL_MAX, &value)) seq_printf(s, "Backlight max: %lu\n", value); if (!read_ec_data(priv->adev->handle, VPCCMD_R_BL, &value)) @@ -419,7 +422,8 @@ static ssize_t camera_power_show(struct device *dev, unsigned long result; int err; - err = read_ec_data(priv->adev->handle, VPCCMD_R_CAMERA, &result); + scoped_guard(mutex, &priv->vpc_mutex) + err = read_ec_data(priv->adev->handle, VPCCMD_R_CAMERA, &result); if (err) return err; @@ -438,7 +442,8 @@ static ssize_t camera_power_store(struct device *dev, if (err) return err; - err = write_ec_cmd(priv->adev->handle, VPCCMD_W_CAMERA, state); + scoped_guard(mutex, &priv->vpc_mutex) + err = write_ec_cmd(priv->adev->handle, VPCCMD_W_CAMERA, state); if (err) return err; @@ -491,7 +496,8 @@ static ssize_t fan_mode_show(struct device *dev, unsigned long result; int err; - err = read_ec_data(priv->adev->handle, VPCCMD_R_FAN, &result); + scoped_guard(mutex, &priv->vpc_mutex) + err = read_ec_data(priv->adev->handle, VPCCMD_R_FAN, &result); if (err) return err; @@ -513,7 +519,8 @@ static ssize_t fan_mode_store(struct device *dev, if (state > 4 || state == 3) return -EINVAL; - err = write_ec_cmd(priv->adev->handle, VPCCMD_W_FAN, state); + scoped_guard(mutex, &priv->vpc_mutex) + err = write_ec_cmd(priv->adev->handle, VPCCMD_W_FAN, state); if (err) return err; @@ -598,7 +605,8 @@ static ssize_t touchpad_show(struct device *dev, unsigned long result; int err; - err = read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &result); + scoped_guard(mutex, &priv->vpc_mutex) + err = read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &result); if (err) return err; @@ -619,7 +627,8 @@ static ssize_t touchpad_store(struct device *dev, if (err) return err; - err = write_ec_cmd(priv->adev->handle, VPCCMD_W_TOUCHPAD, state); + scoped_guard(mutex, &priv->vpc_mutex) + err = write_ec_cmd(priv->adev->handle, VPCCMD_W_TOUCHPAD, state); if (err) return err; @@ -1012,6 +1021,8 @@ static int ideapad_rfk_set(void *data, bool blocked) struct ideapad_rfk_priv *priv = data; int opcode = ideapad_rfk_data[priv->dev].opcode; + guard(mutex)(&priv->priv->vpc_mutex); + return write_ec_cmd(priv->priv->adev->handle, opcode, !blocked); } @@ -1025,6 +1036,8 @@ static void ideapad_sync_rfk_state(struct ideapad_private *priv) int i; if (priv->features.hw_rfkill_switch) { + guard(mutex)(&priv->vpc_mutex); + if 
(read_ec_data(priv->adev->handle, VPCCMD_R_RF, &hw_blocked)) return; hw_blocked = !hw_blocked; @@ -1198,8 +1211,9 @@ static void ideapad_input_novokey(struct ideapad_private *priv) { unsigned long long_pressed; - if (read_ec_data(priv->adev->handle, VPCCMD_R_NOVO, &long_pressed)) - return; + scoped_guard(mutex, &priv->vpc_mutex) + if (read_ec_data(priv->adev->handle, VPCCMD_R_NOVO, &long_pressed)) + return; if (long_pressed) ideapad_input_report(priv, 17); @@ -1211,8 +1225,9 @@ static void ideapad_check_special_buttons(struct ideapad_private *priv) { unsigned long bit, value; - if (read_ec_data(priv->adev->handle, VPCCMD_R_SPECIAL_BUTTONS, &value)) - return; + scoped_guard(mutex, &priv->vpc_mutex) + if (read_ec_data(priv->adev->handle, VPCCMD_R_SPECIAL_BUTTONS, &value)) + return; for_each_set_bit (bit, &value, 16) { switch (bit) { @@ -1245,6 +1260,8 @@ static int ideapad_backlight_get_brightness(struct backlight_device *blightdev) unsigned long now; int err; + guard(mutex)(&priv->vpc_mutex); + err = read_ec_data(priv->adev->handle, VPCCMD_R_BL, &now); if (err) return err; @@ -1257,6 +1274,8 @@ static int ideapad_backlight_update_status(struct backlight_device *blightdev) struct ideapad_private *priv = bl_get_data(blightdev); int err; + guard(mutex)(&priv->vpc_mutex); + err = write_ec_cmd(priv->adev->handle, VPCCMD_W_BL, blightdev->props.brightness); if (err) @@ -1334,6 +1353,8 @@ static void ideapad_backlight_notify_power(struct ideapad_private *priv) if (!blightdev) return; + guard(mutex)(&priv->vpc_mutex); + if (read_ec_data(priv->adev->handle, VPCCMD_R_BL_POWER, &power)) return; @@ -1346,7 +1367,8 @@ static void ideapad_backlight_notify_brightness(struct ideapad_private *priv) /* if we control brightness via acpi video driver */ if (!priv->blightdev) - read_ec_data(priv->adev->handle, VPCCMD_R_BL, &now); + scoped_guard(mutex, &priv->vpc_mutex) + read_ec_data(priv->adev->handle, VPCCMD_R_BL, &now); else backlight_force_update(priv->blightdev, BACKLIGHT_UPDATE_HOTKEY); } @@ -1571,7 +1593,8 @@ static void ideapad_sync_touchpad_state(struct ideapad_private *priv, bool send_ int ret; /* Without reading from EC touchpad LED doesn't switch state */ - ret = read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &value); + scoped_guard(mutex, &priv->vpc_mutex) + ret = read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &value); if (ret) return; @@ -1631,7 +1654,8 @@ static void ideapad_laptop_trigger_ec(void) if (!priv->features.ymc_ec_trigger) return; - ret = write_ec_cmd(priv->adev->handle, VPCCMD_W_YMC, 1); + scoped_guard(mutex, &priv->vpc_mutex) + ret = write_ec_cmd(priv->adev->handle, VPCCMD_W_YMC, 1); if (ret) dev_warn(&priv->platform_device->dev, "Could not write YMC: %d\n", ret); } @@ -1677,11 +1701,13 @@ static void ideapad_acpi_notify(acpi_handle handle, u32 event, void *data) struct ideapad_private *priv = data; unsigned long vpc1, vpc2, bit; - if (read_ec_data(handle, VPCCMD_R_VPC1, &vpc1)) - return; + scoped_guard(mutex, &priv->vpc_mutex) { + if (read_ec_data(handle, VPCCMD_R_VPC1, &vpc1)) + return; - if (read_ec_data(handle, VPCCMD_R_VPC2, &vpc2)) - return; + if (read_ec_data(handle, VPCCMD_R_VPC2, &vpc2)) + return; + } vpc1 = (vpc2 << 8) | vpc1; @@ -1988,6 +2014,10 @@ static int ideapad_acpi_add(struct platform_device *pdev) priv->adev = adev; priv->platform_device = pdev; + err = devm_mutex_init(&pdev->dev, &priv->vpc_mutex); + if (err) + return err; + ideapad_check_features(priv); err = ideapad_sysfs_init(priv); From 919f18f961c03d6694aa726c514184f2311a4614 Mon Sep 17 00:00:00 2001 
From: Andi Kleen Date: Wed, 7 Aug 2024 17:02:44 -0700 Subject: [PATCH 0662/1052] x86/mtrr: Check if fixed MTRRs exist before saving them MTRRs have an obsolete fixed variant for fine grained caching control of the 640K-1MB region that uses separate MSRs. This fixed variant has a separate capability bit in the MTRR capability MSR. So far all x86 CPUs which support MTRR have this separate bit set, so it went unnoticed that mtrr_save_state() does not check the capability bit before accessing the fixed MTRR MSRs. Though on a CPU that does not support the fixed MTRR capability this results in a #GP. The #GP itself is harmless because the RDMSR fault is handled gracefully, but results in a WARN_ON(). Add the missing capability check to prevent this. Fixes: 2b1f6278d77c ("[PATCH] x86: Save the MTRRs of the BSP before booting an AP") Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240808000244.946864-1-ak@linux.intel.com --- arch/x86/kernel/cpu/mtrr/mtrr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 767bf1c71aad..2a2fc14955cd 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -609,7 +609,7 @@ void mtrr_save_state(void) { int first_cpu; - if (!mtrr_enabled()) + if (!mtrr_enabled() || !mtrr_state.have_fixed) return; first_cpu = cpumask_first(cpu_online_mask); From 85ba108a529d99c82e814eaf782a9443acf5eaed Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 6 Aug 2024 14:08:41 +0100 Subject: [PATCH 0663/1052] net: stmmac: dwmac4: fix PCS duplex mode decode dwmac4 was decoding the duplex mode from the GMAC_PHYIF_CONTROL_STATUS register incorrectly, using GMAC_PHYIF_CTRLSTATUS_LNKMOD_MASK (value 1) rather than GMAC_PHYIF_CTRLSTATUS_LNKMOD (bit 16). Fix this. 
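The difference between the two decodes is easy to see in isolation (bit values taken from the description above; the status value below is made up for the demonstration):

#include <stdio.h>
#include <stdint.h>

#define BIT(n)                              (1U << (n))
#define GMAC_PHYIF_CTRLSTATUS_LNKMOD        BIT(16)
#define GMAC_PHYIF_CTRLSTATUS_LNKMOD_MASK   0x1

int main(void)
{
        /* pretend the register reports full duplex: bit 16 set, bit 0 clear */
        uint32_t status = BIT(16);

        /* old decode: tests bit 0 of the raw register, not the duplex bit */
        printf("old decode: %s\n",
               (status & GMAC_PHYIF_CTRLSTATUS_LNKMOD_MASK) ? "Full" : "Half");
        /* fixed decode: tests the actual LNKMOD bit */
        printf("new decode: %s\n",
               (status & GMAC_PHYIF_CTRLSTATUS_LNKMOD) ? "Full" : "Half");
        return 0;
}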
Fixes: 70523e639bf8c ("drivers: net: stmmac: reworking the PCS code.") Reviewed-by: Andrew Halaney Reviewed-by: Andrew Lunn Reviewed-by: Serge Semin Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1sbJvd-001rGD-E3@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 2 -- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index d3c5306f1c41..93a78fd0737b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -573,8 +573,6 @@ static inline u32 mtl_low_credx_base_addr(const struct dwmac4_addrs *addrs, #define GMAC_PHYIF_CTRLSTATUS_LNKSTS BIT(19) #define GMAC_PHYIF_CTRLSTATUS_JABTO BIT(20) #define GMAC_PHYIF_CTRLSTATUS_FALSECARDET BIT(21) -/* LNKMOD */ -#define GMAC_PHYIF_CTRLSTATUS_LNKMOD_MASK 0x1 /* LNKSPEED */ #define GMAC_PHYIF_CTRLSTATUS_SPEED_125 0x2 #define GMAC_PHYIF_CTRLSTATUS_SPEED_25 0x1 diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index f98741d2607e..31c387cc5f26 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -786,7 +786,7 @@ static void dwmac4_phystatus(void __iomem *ioaddr, struct stmmac_extra_stats *x) else x->pcs_speed = SPEED_10; - x->pcs_duplex = (status & GMAC_PHYIF_CTRLSTATUS_LNKMOD_MASK); + x->pcs_duplex = (status & GMAC_PHYIF_CTRLSTATUS_LNKMOD); pr_info("Link is Up - %d/%s\n", (int)x->pcs_speed, x->pcs_duplex ? "Full" : "Half"); From 86a41ea9fd79ddb6145cb8ebf5aeafceabca6f7d Mon Sep 17 00:00:00 2001 From: James Chapman Date: Tue, 6 Aug 2024 17:06:26 +0100 Subject: [PATCH 0664/1052] l2tp: fix lockdep splat When l2tp tunnels use a socket provided by userspace, we can hit lockdep splats like the below when data is transmitted through another (unrelated) userspace socket which then gets routed over l2tp. This issue was previously discussed here: https://lore.kernel.org/netdev/87sfialu2n.fsf@cloudflare.com/ The solution is to have lockdep treat socket locks of l2tp tunnel sockets separately than those of standard INET sockets. To do so, use a different lockdep subclass where lock nesting is possible. 
============================================ WARNING: possible recursive locking detected 6.10.0+ #34 Not tainted -------------------------------------------- iperf3/771 is trying to acquire lock: ffff8881027601d8 (slock-AF_INET/1){+.-.}-{2:2}, at: l2tp_xmit_skb+0x243/0x9d0 but task is already holding lock: ffff888102650d98 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1848/0x1e10 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(slock-AF_INET/1); lock(slock-AF_INET/1); *** DEADLOCK *** May be due to missing lock nesting notation 10 locks held by iperf3/771: #0: ffff888102650258 (sk_lock-AF_INET){+.+.}-{0:0}, at: tcp_sendmsg+0x1a/0x40 #1: ffffffff822ac220 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x4b/0xbc0 #2: ffffffff822ac220 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x17a/0x1130 #3: ffffffff822ac220 (rcu_read_lock){....}-{1:2}, at: process_backlog+0x28b/0x9f0 #4: ffffffff822ac220 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0xf9/0x260 #5: ffff888102650d98 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1848/0x1e10 #6: ffffffff822ac220 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x4b/0xbc0 #7: ffffffff822ac220 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x17a/0x1130 #8: ffffffff822ac1e0 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0xcc/0x1450 #9: ffff888101f33258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock#2){+...}-{2:2}, at: __dev_queue_xmit+0x513/0x1450 stack backtrace: CPU: 2 UID: 0 PID: 771 Comm: iperf3 Not tainted 6.10.0+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl+0x69/0xa0 dump_stack+0xc/0x20 __lock_acquire+0x135d/0x2600 ? srso_alias_return_thunk+0x5/0xfbef5 lock_acquire+0xc4/0x2a0 ? l2tp_xmit_skb+0x243/0x9d0 ? __skb_checksum+0xa3/0x540 _raw_spin_lock_nested+0x35/0x50 ? l2tp_xmit_skb+0x243/0x9d0 l2tp_xmit_skb+0x243/0x9d0 l2tp_eth_dev_xmit+0x3c/0xc0 dev_hard_start_xmit+0x11e/0x420 sch_direct_xmit+0xc3/0x640 __dev_queue_xmit+0x61c/0x1450 ? ip_finish_output2+0xf4c/0x1130 ip_finish_output2+0x6b6/0x1130 ? srso_alias_return_thunk+0x5/0xfbef5 ? __ip_finish_output+0x217/0x380 ? srso_alias_return_thunk+0x5/0xfbef5 __ip_finish_output+0x217/0x380 ip_output+0x99/0x120 __ip_queue_xmit+0xae4/0xbc0 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? tcp_options_write.constprop.0+0xcb/0x3e0 ip_queue_xmit+0x34/0x40 __tcp_transmit_skb+0x1625/0x1890 __tcp_send_ack+0x1b8/0x340 tcp_send_ack+0x23/0x30 __tcp_ack_snd_check+0xa8/0x530 ? srso_alias_return_thunk+0x5/0xfbef5 tcp_rcv_established+0x412/0xd70 tcp_v4_do_rcv+0x299/0x420 tcp_v4_rcv+0x1991/0x1e10 ip_protocol_deliver_rcu+0x50/0x220 ip_local_deliver_finish+0x158/0x260 ip_local_deliver+0xc8/0xe0 ip_rcv+0xe5/0x1d0 ? __pfx_ip_rcv+0x10/0x10 __netif_receive_skb_one_core+0xce/0xe0 ? process_backlog+0x28b/0x9f0 __netif_receive_skb+0x34/0xd0 ? process_backlog+0x28b/0x9f0 process_backlog+0x2cb/0x9f0 __napi_poll.constprop.0+0x61/0x280 net_rx_action+0x332/0x670 ? srso_alias_return_thunk+0x5/0xfbef5 ? find_held_lock+0x2b/0x80 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 handle_softirqs+0xda/0x480 ? __dev_queue_xmit+0xa2c/0x1450 do_softirq+0xa1/0xd0 __local_bh_enable_ip+0xc8/0xe0 ? __dev_queue_xmit+0xa2c/0x1450 __dev_queue_xmit+0xa48/0x1450 ? ip_finish_output2+0xf4c/0x1130 ip_finish_output2+0x6b6/0x1130 ? srso_alias_return_thunk+0x5/0xfbef5 ? __ip_finish_output+0x217/0x380 ? 
srso_alias_return_thunk+0x5/0xfbef5 __ip_finish_output+0x217/0x380 ip_output+0x99/0x120 __ip_queue_xmit+0xae4/0xbc0 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? tcp_options_write.constprop.0+0xcb/0x3e0 ip_queue_xmit+0x34/0x40 __tcp_transmit_skb+0x1625/0x1890 tcp_write_xmit+0x766/0x2fb0 ? __entry_text_end+0x102ba9/0x102bad ? srso_alias_return_thunk+0x5/0xfbef5 ? __might_fault+0x74/0xc0 ? srso_alias_return_thunk+0x5/0xfbef5 __tcp_push_pending_frames+0x56/0x190 tcp_push+0x117/0x310 tcp_sendmsg_locked+0x14c1/0x1740 tcp_sendmsg+0x28/0x40 inet_sendmsg+0x5d/0x90 sock_write_iter+0x242/0x2b0 vfs_write+0x68d/0x800 ? __pfx_sock_write_iter+0x10/0x10 ksys_write+0xc8/0xf0 __x64_sys_write+0x3d/0x50 x64_sys_call+0xfaf/0x1f50 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f4d143af992 Code: c3 8b 07 85 c0 75 24 49 89 fb 48 89 f0 48 89 d7 48 89 ce 4c 89 c2 4d 89 ca 4c 8b 44 24 08 4c 8b 4c 24 10 4c 89 5c 24 08 0f 05 e9 01 cc ff ff 41 54 b8 02 00 00 0 RSP: 002b:00007ffd65032058 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f4d143af992 RDX: 0000000000000025 RSI: 00007f4d143f3bcc RDI: 0000000000000005 RBP: 00007f4d143f2b28 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f4d143f3bcc R13: 0000000000000005 R14: 0000000000000000 R15: 00007ffd650323f0 Fixes: 0b2c59720e65 ("l2tp: close all race conditions in l2tp_tunnel_register()") Suggested-by: Eric Dumazet Reported-by: syzbot+6acef9e0a4d1f46c83d4@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6acef9e0a4d1f46c83d4 CC: gnault@redhat.com CC: cong.wang@bytedance.com Signed-off-by: James Chapman Signed-off-by: Tom Parkin Link: https://patch.msgid.link/20240806160626.1248317-1-jchapman@katalix.com Signed-off-by: Jakub Kicinski --- net/l2tp/l2tp_core.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index c80ab3f26084..2e86f520f799 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -86,6 +86,11 @@ /* Default trace flags */ #define L2TP_DEFAULT_DEBUG_FLAGS 0 +#define L2TP_DEPTH_NESTING 2 +#if L2TP_DEPTH_NESTING == SINGLE_DEPTH_NESTING +#error "L2TP requires its own lockdep subclass" +#endif + /* Private data stored for received packets in the skb. */ struct l2tp_skb_cb { @@ -1124,7 +1129,13 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); nf_reset_ct(skb); - bh_lock_sock_nested(sk); + /* L2TP uses its own lockdep subclass to avoid lockdep splats caused by + * nested socket calls on the same lockdep socket class. This can + * happen when data from a user socket is routed over l2tp, which uses + * another userspace socket. 
+ */ + spin_lock_nested(&sk->sk_lock.slock, L2TP_DEPTH_NESTING); + if (sock_owned_by_user(sk)) { kfree_skb(skb); ret = NET_XMIT_DROP; @@ -1176,7 +1187,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns ret = l2tp_xmit_queue(tunnel, skb, &inet->cork.fl); out_unlock: - bh_unlock_sock(sk); + spin_unlock(&sk->sk_lock.slock); return ret; } From 9ee09edc05f20422e7ced84b1f8a5d3359926ac8 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 6 Aug 2024 10:56:59 -0700 Subject: [PATCH 0665/1052] net: bcmgenet: Properly overlay PHY and MAC Wake-on-LAN capabilities Some Wake-on-LAN modes such as WAKE_FILTER may only be supported by the MAC, while others might be only supported by the PHY. Make sure that the .get_wol() returns the union of both rather than only that of the PHY if the PHY supports Wake-on-LAN. Fixes: 7e400ff35cbe ("net: bcmgenet: Add support for PHY-based Wake-on-LAN") Signed-off-by: Florian Fainelli Link: https://patch.msgid.link/20240806175659.3232204-1-florian.fainelli@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c index 1248792d7fd4..0715ea5bf13e 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c @@ -42,19 +42,15 @@ void bcmgenet_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) struct bcmgenet_priv *priv = netdev_priv(dev); struct device *kdev = &priv->pdev->dev; - if (dev->phydev) { + if (dev->phydev) phy_ethtool_get_wol(dev->phydev, wol); - if (wol->supported) - return; - } - if (!device_can_wakeup(kdev)) { - wol->supported = 0; - wol->wolopts = 0; + /* MAC is not wake-up capable, return what the PHY does */ + if (!device_can_wakeup(kdev)) return; - } - wol->supported = WAKE_MAGIC | WAKE_MAGICSECURE | WAKE_FILTER; + /* Overlay MAC capabilities with that of the PHY queried before */ + wol->supported |= WAKE_MAGIC | WAKE_MAGICSECURE | WAKE_FILTER; wol->wolopts = priv->wolopts; memset(wol->sopass, 0, sizeof(wol->sopass)); From 8fee6d5ad5fa18c270eedb2a2cdf58dbadefb94b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cs=C3=B3k=C3=A1s=2C=20Bence?= Date: Wed, 7 Aug 2024 10:09:56 +0200 Subject: [PATCH 0666/1052] net: fec: Stop PPS on driver remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PPS was not stopped in `fec_ptp_stop()`, called when the adapter was removed. Consequentially, you couldn't safely reload the driver with the PPS signal on. 
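The shape of the fix is the usual symmetric-teardown rule: anything enabled at runtime is disabled in the stop path before the timers and workers behind it are torn down. A toy model, with placeholder names rather than the driver's:

#include <stdbool.h>
#include <stdio.h>

struct ptp_state {
        bool pps_enabled;
        bool timer_armed;
};

static void pps_enable(struct ptp_state *s, bool on)
{
        s->pps_enabled = on;
}

static void ptp_stop(struct ptp_state *s)
{
        /* undo runtime-enabled features first... */
        if (s->pps_enabled)
                pps_enable(s, false);

        /* ...then tear down the machinery they relied on */
        s->timer_armed = false; /* stand-in for cancelling works and timers */
}

int main(void)
{
        struct ptp_state s = { .timer_armed = true };

        pps_enable(&s, true);
        ptp_stop(&s);
        printf("pps=%d timer=%d\n", s.pps_enabled, s.timer_armed);
        return 0;
}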
Fixes: 32cba57ba74b ("net: fec: introduce fec_ptp_stop and use in probe fail path") Reviewed-by: Fabio Estevam Link: https://lore.kernel.org/netdev/CAOMZO5BzcZR8PwKKwBssQq_wAGzVgf1ffwe_nhpQJjviTdxy-w@mail.gmail.com/T/#m01dcb810bfc451a492140f6797ca77443d0cb79f Signed-off-by: Csókás, Bence Reviewed-by: Andrew Lunn Reviewed-by: Frank Li Link: https://patch.msgid.link/20240807080956.2556602-1-csokas.bence@prolan.hu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_ptp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index e32f6724f568..2e4f3e1782a2 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -775,6 +775,9 @@ void fec_ptp_stop(struct platform_device *pdev) struct net_device *ndev = platform_get_drvdata(pdev); struct fec_enet_private *fep = netdev_priv(ndev); + if (fep->pps_enable) + fec_ptp_enable_pps(fep, 0); + cancel_delayed_work_sync(&fep->time_keep); hrtimer_cancel(&fep->perout_timer); if (fep->ptp_clock) From a70b637db15b4de25af3c5946c4399144b3bc241 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 7 Aug 2024 09:54:22 +0200 Subject: [PATCH 0667/1052] net: pse-pd: tps23881: include missing bitfield.h header Using FIELD_GET() fails in configurations that don't already include the header file indirectly: drivers/net/pse-pd/tps23881.c: In function 'tps23881_i2c_probe': drivers/net/pse-pd/tps23881.c:755:13: error: implicit declaration of function 'FIELD_GET' [-Wimplicit-function-declaration] 755 | if (FIELD_GET(TPS23881_REG_DEVID_MASK, ret) != TPS23881_DEVICE_ID) { | ^~~~~~~~~ Fixes: 89108cb5c285 ("net: pse-pd: tps23881: Fix the device ID check") Signed-off-by: Arnd Bergmann Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20240807075455.2055224-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/tps23881.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/pse-pd/tps23881.c b/drivers/net/pse-pd/tps23881.c index f90db758554b..2ea75686a319 100644 --- a/drivers/net/pse-pd/tps23881.c +++ b/drivers/net/pse-pd/tps23881.c @@ -5,6 +5,7 @@ * Copyright (c) 2023 Bootlin, Kory Maincent */ +#include #include #include #include From b54de55990b0467538c6bb33523b28816384958a Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 7 Aug 2024 17:06:12 +0100 Subject: [PATCH 0668/1052] net: ethtool: fix off-by-one error in max RSS context IDs Both ethtool_ops.rxfh_max_context_id and the default value used when it's not specified are supposed to be exclusive maxima (the former is documented as such; the latter, U32_MAX, cannot be used as an ID since it equals ETH_RXFH_CONTEXT_ALLOC), but xa_alloc() expects an inclusive maximum. Subtract one from 'limit' to produce an inclusive maximum, and pass that to xa_alloc(). Increase bnxt's max by one to prevent a (very minor) regression, as BNXT_MAX_ETH_RSS_CTX is an inclusive max. This is safe since bnxt is not actually hard-limited; BNXT_MAX_ETH_RSS_CTX is just a leftover from old driver code that managed context IDs itself. Rename rxfh_max_context_id to rxfh_max_num_contexts to make its semantics (hopefully) more obvious. 
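The off-by-one is easiest to see with a toy allocator that, like xa_alloc(), treats its maximum as inclusive; the numbers below are made up and only the limit arithmetic matters:

#include <stdio.h>
#include <stdint.h>

/* xa_alloc()-style allocator: 'max' is an inclusive bound */
static int alloc_id(uint32_t min, uint32_t max, uint32_t *next)
{
        uint32_t id = *next;

        if (id < min || id > max)
                return -1;
        *next = id + 1;
        return (int)id;
}

int main(void)
{
        uint32_t limit = 4;     /* driver's exclusive maximum: valid IDs are 1..3 */
        uint32_t next = 1;
        int last = 0, id;

        while ((id = alloc_id(1, limit, &next)) >= 0)
                last = id;
        printf("exclusive limit passed as-is: last ID = %d\n", last);  /* 4: one too many */

        next = 1;
        last = 0;
        while ((id = alloc_id(1, limit - 1, &next)) >= 0)
                last = id;
        printf("limit - 1 (inclusive):        last ID = %d\n", last);  /* 3, as intended */
        return 0;
}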
Fixes: 847a8ab18676 ("net: ethtool: let the core choose RSS context IDs") Signed-off-by: Edward Cree Link: https://patch.msgid.link/5a2d11a599aa5b0cc6141072c01accfb7758650c.1723045898.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 +- include/linux/ethtool.h | 10 +++++----- net/ethtool/ioctl.c | 5 +++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index ab8e3f197e7b..9dadc89378f0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -5290,7 +5290,7 @@ void bnxt_ethtool_free(struct bnxt *bp) const struct ethtool_ops bnxt_ethtool_ops = { .cap_link_lanes_supported = 1, .cap_rss_ctx_supported = 1, - .rxfh_max_context_id = BNXT_MAX_ETH_RSS_CTX, + .rxfh_max_num_contexts = BNXT_MAX_ETH_RSS_CTX + 1, .rxfh_indir_space = BNXT_MAX_RSS_TABLE_ENTRIES_P5, .rxfh_priv_size = sizeof(struct bnxt_rss_ctx), .supported_coalesce_params = ETHTOOL_COALESCE_USECS | diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 303fda54ef17..989c94eddb2b 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -736,10 +736,10 @@ struct kernel_ethtool_ts_info { * @rxfh_key_space: same as @rxfh_indir_space, but for the key. * @rxfh_priv_size: size of the driver private data area the core should * allocate for an RSS context (in &struct ethtool_rxfh_context). - * @rxfh_max_context_id: maximum (exclusive) supported RSS context ID. If this - * is zero then the core may choose any (nonzero) ID, otherwise the core - * will only use IDs strictly less than this value, as the @rss_context - * argument to @create_rxfh_context and friends. + * @rxfh_max_num_contexts: maximum (exclusive) supported RSS context ID. + * If this is zero then the core may choose any (nonzero) ID, otherwise + * the core will only use IDs strictly less than this value, as the + * @rss_context argument to @create_rxfh_context and friends. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Modern drivers no @@ -954,7 +954,7 @@ struct ethtool_ops { u32 rxfh_indir_space; u16 rxfh_key_space; u16 rxfh_priv_size; - u32 rxfh_max_context_id; + u32 rxfh_max_num_contexts; u32 supported_coalesce_params; u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 8ca13208d240..a8e276ecf723 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1449,12 +1449,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } if (ops->create_rxfh_context) { - u32 limit = ops->rxfh_max_context_id ?: U32_MAX; + u32 limit = ops->rxfh_max_num_contexts ?: U32_MAX; u32 ctx_id; /* driver uses new API, core allocates ID */ ret = xa_alloc(&dev->ethtool->rss_ctx, &ctx_id, ctx, - XA_LIMIT(1, limit), GFP_KERNEL_ACCOUNT); + XA_LIMIT(1, limit - 1), + GFP_KERNEL_ACCOUNT); if (ret < 0) { kfree(ctx); goto out; From 4d7c3c1aba3ca12fad2e90163b8d5153363f93e5 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 7 Aug 2024 20:33:52 +0300 Subject: [PATCH 0669/1052] ethtool: Fix context creation with no parameters The 'at least one change' requirement is not applicable for context creation, skip the check in such case. This allows a command such as 'ethtool -X eth0 context new' to work. 
The command works by mistake when using older versions of userspace ethtool due to an incompatibility issue where rxfh.input_xfrm is passed as zero (unset) instead of RXH_XFRM_NO_CHANGE as done with recent userspace. This patch does not try to solve the incompatibility issue. Link: https://lore.kernel.org/netdev/05ae8316-d3aa-4356-98c6-55ed4253c8a7@nvidia.com/ Fixes: 84a1d9c48200 ("net: ethtool: extend RXNFC API to support RSS spreading of filter matches") Reviewed-by: Dragos Tatulea Reviewed-by: Jianbo Liu Signed-off-by: Gal Pressman Reviewed-by: Edward Cree Link: https://patch.msgid.link/20240807173352.3501746-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index a8e276ecf723..e18823bf2330 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1369,14 +1369,17 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, return -EOPNOTSUPP; create = rxfh.rss_context == ETH_RXFH_CONTEXT_ALLOC; - /* If either indir, hash key or function is valid, proceed further. - * Must request at least one change: indir size, hash key, function - * or input transformation. - */ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || - (rxfh.key_size && (rxfh.key_size != dev_key_size)) || + (rxfh.key_size && rxfh.key_size != dev_key_size)) + return -EINVAL; + + /* Must request at least one change: indir size, hash key, function + * or input transformation. + * There's no need for any of it in case of context creation. + */ + if (!create && (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE && rxfh.input_xfrm == RXH_XFRM_NO_CHANGE)) From 0411f73c13afcf619d7aa7546edbc5710a871cae Mon Sep 17 00:00:00 2001 From: Martin Whitaker Date: Wed, 7 Aug 2024 21:52:09 +0100 Subject: [PATCH 0670/1052] net: dsa: microchip: disable EEE for KSZ8567/KSZ9567/KSZ9896/KSZ9897. As noted in the device errata [1-8], EEE support is not fully operational in the KSZ8567, KSZ9477, KSZ9567, KSZ9896, and KSZ9897 devices, causing link drops when connected to another device that supports EEE. The patch series "net: add EEE support for KSZ9477 switch family" merged in commit 9b0bf4f77162 caused EEE support to be enabled in these devices. A fix for this regression for the KSZ9477 alone was merged in commit 08c6d8bae48c2. This patch extends this fix to the other affected devices. 
[1] https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/Errata/KSZ8567R-Errata-DS80000752.pdf [2] https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/Errata/KSZ8567S-Errata-DS80000753.pdf [3] https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/Errata/KSZ9477S-Errata-DS80000754.pdf [4] https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/Errata/KSZ9567R-Errata-DS80000755.pdf [5] https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/Errata/KSZ9567S-Errata-DS80000756.pdf [6] https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/Errata/KSZ9896C-Errata-DS80000757.pdf [7] https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/Errata/KSZ9897R-Errata-DS80000758.pdf [8] https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/Errata/KSZ9897S-Errata-DS80000759.pdf Fixes: 69d3b36ca045 ("net: dsa: microchip: enable EEE support") # for KSZ8567/KSZ9567/KSZ9896/KSZ9897 Link: https://lore.kernel.org/netdev/137ce1ee-0b68-4c96-a717-c8164b514eec@martin-whitaker.me.uk/ Signed-off-by: Martin Whitaker Acked-by: Arun Ramadoss Reviewed-by: Oleksij Rempel Reviewed-by: Lukasz Majewski Link: https://patch.msgid.link/20240807205209.21464-1-foss@martin-whitaker.me.uk Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index b120e66d5669..1491099528be 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2578,7 +2578,11 @@ static u32 ksz_get_phy_flags(struct dsa_switch *ds, int port) if (!port) return MICREL_KSZ8_P1_ERRATA; break; + case KSZ8567_CHIP_ID: case KSZ9477_CHIP_ID: + case KSZ9567_CHIP_ID: + case KSZ9896_CHIP_ID: + case KSZ9897_CHIP_ID: /* KSZ9477 Errata DS80000754C * * Module 4: Energy Efficient Ethernet (EEE) feature select must @@ -2588,6 +2592,13 @@ static u32 ksz_get_phy_flags(struct dsa_switch *ds, int port) * controls. If not disabled, the PHY ports can auto-negotiate * to enable EEE, and this feature can cause link drops when * linked to another device supporting EEE. + * + * The same item appears in the errata for the KSZ9567, KSZ9896, + * and KSZ9897. + * + * A similar item appears in the errata for the KSZ8567, but + * provides an alternative workaround. For now, use the simple + * workaround of disabling the EEE feature for this device too. */ return MICREL_NO_EEE; } From 1b5487aefb1ce7a6b1f15a33297d1231306b4122 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 31 Jul 2024 21:38:50 -0500 Subject: [PATCH 0671/1052] smb3: fix setting SecurityFlags when encryption is required Setting encryption as required in security flags was broken. For example (to require all mounts to be encrypted by setting): "echo 0x400c5 > /proc/fs/cifs/SecurityFlags" Would return "Invalid argument" and log "Unsupported security flags" This patch fixes that (e.g. allowing overriding the default for SecurityFlags 0x00c5, including 0x40000 to require seal, ie SMB3.1.1 encryption) so now that works and forces encryption on subsequent mounts. 
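The SecurityFlags value from the example decomposes into exactly the bits this patch touches (constants as defined in the cifsglob.h hunk below); a one-line check:

#include <stdio.h>

#define CIFSSEC_MAY_SIGN        0x00001
#define CIFSSEC_MAY_NTLMV2      0x00004
#define CIFSSEC_MAY_SEAL        0x00040
#define CIFSSEC_MAY_NTLMSSP     0x00080
#define CIFSSEC_MUST_SEAL       0x40040
#define CIFSSEC_DEF     (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | \
                         CIFSSEC_MAY_NTLMSSP | CIFSSEC_MAY_SEAL)

int main(void)
{
        /* the defaults plus the "must seal" bit give the 0x400c5 from the example */
        printf("0x%05x\n", CIFSSEC_DEF | CIFSSEC_MUST_SEAL);
        return 0;
}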
Acked-by: Bharath SM Cc: stable@vger.kernel.org Signed-off-by: Steve French --- Documentation/admin-guide/cifs/usage.rst | 2 +- fs/smb/client/cifs_debug.c | 2 +- fs/smb/client/cifsglob.h | 8 ++++---- fs/smb/client/smb2pdu.c | 3 +++ 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/cifs/usage.rst b/Documentation/admin-guide/cifs/usage.rst index fd4b56c0996f..c09674a75a9e 100644 --- a/Documentation/admin-guide/cifs/usage.rst +++ b/Documentation/admin-guide/cifs/usage.rst @@ -742,7 +742,7 @@ SecurityFlags Flags which control security negotiation and may use NTLMSSP 0x00080 must use NTLMSSP 0x80080 seal (packet encryption) 0x00040 - must seal (not implemented yet) 0x40040 + must seal 0x40040 cifsFYI If set to non-zero value, additional debug information will be logged to the system error log. This field diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index c71ae5c04306..4a20e92474b2 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -1072,7 +1072,7 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) static void cifs_security_flags_handle_must_flags(unsigned int *flags) { - unsigned int signflags = *flags & CIFSSEC_MUST_SIGN; + unsigned int signflags = *flags & (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL); if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) *flags = CIFSSEC_MUST_KRB5; diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f6d1f075987f..b9f46d29a441 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1881,7 +1881,7 @@ static inline bool is_replayable_error(int error) #define CIFSSEC_MAY_SIGN 0x00001 #define CIFSSEC_MAY_NTLMV2 0x00004 #define CIFSSEC_MAY_KRB5 0x00008 -#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ +#define CIFSSEC_MAY_SEAL 0x00040 #define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */ #define CIFSSEC_MUST_SIGN 0x01001 @@ -1891,11 +1891,11 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_NTLMV2 0x04004 #define CIFSSEC_MUST_KRB5 0x08008 #ifdef CONFIG_CIFS_UPCALL -#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */ +#define CIFSSEC_MASK 0xCF0CF /* flags supported if no weak allowed */ #else -#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */ +#define CIFSSEC_MASK 0xC70C7 /* flags supported if no weak allowed */ #endif /* UPCALL */ -#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ +#define CIFSSEC_MUST_SEAL 0x40040 #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ #define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP | CIFSSEC_MAY_SEAL) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9a06b5594669..83facb54276a 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -82,6 +82,9 @@ int smb3_encryption_required(const struct cifs_tcon *tcon) if (tcon->seal && (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) return 1; + if (((global_secflags & CIFSSEC_MUST_SEAL) == CIFSSEC_MUST_SEAL) && + (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + return 1; return 0; } From bdcffe4be7cb90ccd12c49924dad9e2eda11b57a Mon Sep 17 00:00:00 2001 From: Xiaxi Shen Date: Wed, 7 Aug 2024 09:53:20 -0700 Subject: [PATCH 0672/1052] Fix spelling errors in Server Message Block Fixed typos in various files under fs/smb/client/ Signed-off-by: Xiaxi Shen Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 4 ++-- fs/smb/client/misc.c | 2 +- 
fs/smb/client/smbdirect.c | 8 ++++---- fs/smb/client/transport.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index b9f46d29a441..5c9b3e6cd95f 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -345,7 +345,7 @@ struct smb_version_operations { /* connect to a server share */ int (*tree_connect)(const unsigned int, struct cifs_ses *, const char *, struct cifs_tcon *, const struct nls_table *); - /* close tree connecion */ + /* close tree connection */ int (*tree_disconnect)(const unsigned int, struct cifs_tcon *); /* get DFS referrals */ int (*get_dfs_refer)(const unsigned int, struct cifs_ses *, @@ -816,7 +816,7 @@ struct TCP_Server_Info { * Protected by @refpath_lock and @srv_lock. The @refpath_lock is * mostly used for not requiring a copy of @leaf_fullpath when getting * cached or new DFS referrals (which might also sleep during I/O). - * While @srv_lock is held for making string and NULL comparions against + * While @srv_lock is held for making string and NULL comparisons against * both fields as in mount(2) and cache refresh. * * format: \\HOST\SHARE[\OPTIONAL PATH] diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index b28ff62f1f15..3fe5bfc389d0 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -352,7 +352,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) * on simple responses (wct, bcc both zero) * in particular have seen this on * ulogoffX and FindClose. This leaves - * one byte of bcc potentially unitialized + * one byte of bcc potentially uninitialized */ /* zero rest of bcc */ tmp[sizeof(struct smb_hdr)+1] = 0; diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index d74e829de51c..7bcc379014ca 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -406,7 +406,7 @@ static void smbd_post_send_credits(struct work_struct *work) else response = get_empty_queue_buffer(info); if (!response) { - /* now switch to emtpy packet queue */ + /* now switch to empty packet queue */ if (use_receive_queue) { use_receive_queue = 0; continue; @@ -618,7 +618,7 @@ static struct rdma_cm_id *smbd_create_id( /* * Test if FRWR (Fast Registration Work Requests) is supported on the device - * This implementation requries FRWR on RDMA read/write + * This implementation requires FRWR on RDMA read/write * return value: true if it is supported */ static bool frwr_is_supported(struct ib_device_attr *attrs) @@ -2177,7 +2177,7 @@ static int allocate_mr_list(struct smbd_connection *info) * MR available in the list. It may access the list while the * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock * as they never modify the same places. However, there may be several CPUs - * issueing I/O trying to get MR at the same time, mr_list_lock is used to + * issuing I/O trying to get MR at the same time, mr_list_lock is used to * protect this situation. */ static struct smbd_mr *get_mr(struct smbd_connection *info) @@ -2311,7 +2311,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, /* * There is no need for waiting for complemtion on ib_post_send * on IB_WR_REG_MR. 
Hardware enforces a barrier and order of execution - * on the next ib_post_send when we actaully send I/O to remote peer + * on the next ib_post_send when we actually send I/O to remote peer */ rc = ib_post_send(info->id->qp, ®_wr->wr, NULL); if (!rc) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index adfe0d058701..6e68aaf5bd20 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -1289,7 +1289,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, out: /* * This will dequeue all mids. After this it is important that the - * demultiplex_thread will not process any of these mids any futher. + * demultiplex_thread will not process any of these mids any further. * This is prevented above by using a noop callback that will not * wake this thread except for the very last PDU. */ From 9eb18136af9fe4dd688724070f2bfba271bd1542 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Aug 2024 10:15:46 +0100 Subject: [PATCH 0673/1052] KVM: arm64: vgic: Hold config_lock while tearing down a CPU interface Tearing down a vcpu CPU interface involves freeing the private interrupt array. If we don't hold the lock, we may race against another thread trying to configure it. Yeah, fuzzers do wonderful things... Taking the lock early solves this particular problem. Fixes: 03b3d00a70b5 ("KVM: arm64: vgic: Allocate private interrupts on demand") Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240808091546.3262111-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 7f68cf58b978..41feb858ff9a 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -438,14 +438,13 @@ void kvm_vgic_destroy(struct kvm *kvm) unsigned long i; mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->arch.config_lock); vgic_debug_destroy(kvm); kvm_for_each_vcpu(i, vcpu, kvm) __kvm_vgic_vcpu_destroy(vcpu); - mutex_lock(&kvm->arch.config_lock); - kvm_vgic_dist_destroy(kvm); mutex_unlock(&kvm->arch.config_lock); From 5819e464a17587e6830cfab05f3e91a9a8753a41 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 8 Aug 2024 14:08:08 +1000 Subject: [PATCH 0674/1052] cpumask: Fix crash on updating CPU enabled mask The CPU enabled mask instead of the CPU possible mask should be used by set_cpu_enabled(). Otherwise, we run into crash due to write to the read-only CPU possible mask when vCPU is hot added on ARM64. (qemu) device_add host-arm-cpu,id=cpu1,socket-id=1 Unable to handle kernel write to read-only memory at virtual address ffff800080fa7190 : Call trace: register_cpu+0x1a4/0x2e8 arch_register_cpu+0x84/0xd8 acpi_processor_add+0x480/0x5b0 acpi_bus_attach+0x1c4/0x300 acpi_dev_for_one_check+0x3c/0x50 device_for_each_child+0x68/0xc8 acpi_dev_for_each_child+0x48/0x80 acpi_bus_attach+0x84/0x300 acpi_bus_scan+0x74/0x220 acpi_scan_rescan_bus+0x54/0x88 acpi_device_hotplug+0x208/0x478 acpi_hotplug_work_fn+0x2c/0x50 process_one_work+0x15c/0x3c0 worker_thread+0x2ec/0x400 kthread+0x120/0x130 ret_from_fork+0x10/0x20 Fix it by passing the CPU enabled mask instead of the CPU possible mask to set_cpu_enabled(). 
Fixes: 51c4767503d5 ("Merge tag 'bitmap-6.11-rc1' of https://github.com:/norov/linux") Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Signed-off-by: Yury Norov --- include/linux/cpumask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 801a7e524113..53158de44b83 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1037,7 +1037,7 @@ void init_cpu_online(const struct cpumask *src); assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) -#define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_possible_mask, (enabled)) +#define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) From ae02c7b7fea3e034fbd724c21d88406f71ccc2f8 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 25 Jul 2024 23:43:35 -0700 Subject: [PATCH 0675/1052] drm/xe/rtp: Fix off-by-one when processing rules Gustavo noticed an odd "+ 2" in rtp_mark_active() while processing rtp rules and pointed that it should be "+ 1". In fact, while processing entries without actions (OOB workarounds), if the WA is activated and has OR rules, it will also inadvertently activate the very next workaround. Test in a LNL B0 platform by moving 18024947630 on top of 16020292621, makes the latter become active: $ cat /sys/kernel/debug/dri/0/gt0/workarounds ... OOB Workarounds 18024947630 16020292621 14018094691 16022287689 13011645652 22019338487_display In future a kunit test will be added to cover the rtp checks for entries without actions. Fixes: fe19328b900c ("drm/xe/rtp: Add support for entries with no action") Cc: Gustavo Sousa Reviewed-by: Gustavo Sousa Link: https://patchwork.freedesktop.org/patch/msgid/20240726064337.797576-6-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit fd6797ec50c561f085bc94e3ee26f484a52af79e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_rtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_rtp.c b/drivers/gpu/drm/xe/xe_rtp.c index 02e28274282f..5efe83cc82ab 100644 --- a/drivers/gpu/drm/xe/xe_rtp.c +++ b/drivers/gpu/drm/xe/xe_rtp.c @@ -231,7 +231,7 @@ static void rtp_mark_active(struct xe_device *xe, if (first == last) bitmap_set(ctx->active_entries, first, 1); else - bitmap_set(ctx->active_entries, first, last - first + 2); + bitmap_set(ctx->active_entries, first, last - first + 1); } /** From 4f854a8b1b85d46abd5ce206936d23f87ac5e0c9 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 26 Jul 2024 18:22:16 -0700 Subject: [PATCH 0676/1052] drm/xe: Use dma_fence_chain_free in chain fence unused as a sync A chain fence is uninitialized if not installed in a drm sync obj. Thus if xe_sync_entry_cleanup is called and sync->chain_fence is non-NULL the proper cleanup is dma_fence_chain_free rather than a dma-fence put. 
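For reference, the pairing rule comes from the dma-fence-chain helpers themselves; a minimal sketch with hypothetical control flow, not code taken from the patch:

    struct dma_fence_chain *chain;

    chain = dma_fence_chain_alloc();        /* bare allocation, embedded fence not initialized */
    if (!installed_in_syncobj)
            /* never dma_fence_chain_init()'d: plain free is the only valid teardown */
            dma_fence_chain_free(chain);

Calling dma_fence_put(&chain->base) on such an unused node would drop a reference count that was never set up, which is the failure mode being fixed below.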
Reported-by: Paulo Zanoni Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2411 Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2261 Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20240727012216.2118276-1-matthew.brost@intel.com (cherry picked from commit 7f7a2da3bf8bc0e0f6c239af495b7050056e889c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c index 2883d9aca404..c4e018aa2982 100644 --- a/drivers/gpu/drm/xe/xe_sync.c +++ b/drivers/gpu/drm/xe/xe_sync.c @@ -263,7 +263,7 @@ void xe_sync_entry_cleanup(struct xe_sync_entry *sync) if (sync->fence) dma_fence_put(sync->fence); if (sync->chain_fence) - dma_fence_put(&sync->chain_fence->base); + dma_fence_chain_free(sync->chain_fence); if (sync->ufence) user_fence_put(sync->ufence); } From ac3191c5cf47e2d5220a1ed7353a2e498a1f415e Mon Sep 17 00:00:00 2001 From: Karthik Poosa Date: Thu, 1 Aug 2024 16:54:24 +0530 Subject: [PATCH 0677/1052] drm/xe/hwmon: Fix PL1 disable flow in xe_hwmon_power_max_write In xe_hwmon_power_max_write, for PL1 disable supported case, instead of returning after PL1 disable, PL1 enable path was also being run. Fixed it by returning after disable. v2: Correct typo and grammar in commit message. (Jonathan) Signed-off-by: Karthik Poosa Fixes: fef6dd12b45a ("drm/xe/hwmon: Protect hwmon rw attributes with hwmon_lock") Reviewed-by: Jonathan Cavitt Signed-off-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240801112424.1841766-1-karthik.poosa@intel.com (cherry picked from commit 146458645e505f5eac498759bcd865cf7c0dfd9a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_hwmon.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index 0c8ce09e5025..832ea81faeee 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -203,9 +203,10 @@ static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, int channel, long va reg_val = xe_mmio_rmw32(hwmon->gt, rapl_limit, PKG_PWR_LIM_1_EN, 0); reg_val = xe_mmio_read32(hwmon->gt, rapl_limit); if (reg_val & PKG_PWR_LIM_1_EN) { + drm_warn(>_to_xe(hwmon->gt)->drm, "PL1 disable is not supported!\n"); ret = -EOPNOTSUPP; - goto unlock; } + goto unlock; } /* Computation in 64-bits to avoid overflow. Round to nearest. */ From 642dfc9d5964b26f66fa6c28ce2861e11f9232aa Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 1 Aug 2024 08:41:16 -0700 Subject: [PATCH 0678/1052] drm/xe: Take ref to VM in delayed snapshot Kernel BO's don't take a ref to the VM, we need the VM for the delayed snapshot, so take a ref to the VM in delayed snapshot. 
v2: - Check for lrc_bo before taking a VM ref (CI) - Check lrc_bo->vm before taking / dropping a VM ref (CI) - Drop VM in xe_lrc_snapshot_free v5: - Fix commit message wording (Johnathan) Fixes: 47058633d9c5 ("drm/xe: Move lrc snapshot capturing to xe_lrc.c") Cc: Maarten Lankhorst Signed-off-by: Matthew Brost Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240801154118.2547543-2-matthew.brost@intel.com (cherry picked from commit c3bc97d2f102ddd5a8341eeb2dbae2a3e98bb46a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_lrc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 94ff62e1d95e..58121821f081 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -1634,6 +1634,9 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc) if (!snapshot) return NULL; + if (lrc->bo && lrc->bo->vm) + xe_vm_get(lrc->bo->vm); + snapshot->context_desc = xe_lrc_ggtt_addr(lrc); snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc); snapshot->head = xe_lrc_ring_head(lrc); @@ -1653,12 +1656,14 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc) void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot) { struct xe_bo *bo; + struct xe_vm *vm; struct iosys_map src; if (!snapshot) return; bo = snapshot->lrc_bo; + vm = bo->vm; snapshot->lrc_bo = NULL; snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL); @@ -1678,6 +1683,8 @@ void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot) xe_bo_unlock(bo); put_bo: xe_bo_put(bo); + if (vm) + xe_vm_put(vm); } void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p) @@ -1727,8 +1734,14 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot) return; kvfree(snapshot->lrc_snapshot); - if (snapshot->lrc_bo) + if (snapshot->lrc_bo) { + struct xe_vm *vm; + + vm = snapshot->lrc_bo->vm; xe_bo_put(snapshot->lrc_bo); + if (vm) + xe_vm_put(vm); + } kfree(snapshot); } From cd9aae921ab6b614e56ce690dedfe82e79db9354 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 8 Aug 2024 11:44:07 -0700 Subject: [PATCH 0679/1052] dt-bindings: display: panel: samsung,atna45dc02: Fix indentation The yaml had indentation errors: ./Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml:21:9: [warning] wrong indentation: expected 10 but found 8 (indentation) ./Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml:23:11: [warning] wrong indentation: expected 12 but found 10 (indentation) Fix them. 
Reported-by: Rob Herring Closes: https://lore.kernel.org/r/CAL_JsqLRTgQRPcfXy4G9hLoHMd-Uax4_C90BV_OZn4mK+-82kw@mail.gmail.com Fixes: 1c4a057d01f4 ("dt-bindings: display: panel: samsung,atna45dc02: Document ATNA45DC02") Reviewed-by: Rob Clark Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20240808114407.1.I099e8e9e36407a0785d846b953031d40ea71e559@changeid --- .../bindings/display/panel/samsung,atna33xc20.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml index 87c601bcf20a..032f783eefc4 100644 --- a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml +++ b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml @@ -18,12 +18,12 @@ properties: # Samsung 13.3" FHD (1920x1080 pixels) eDP AMOLED panel - const: samsung,atna33xc20 - items: - - enum: - # Samsung 14.5" WQXGA+ (2880x1800 pixels) eDP AMOLED panel - - samsung,atna45af01 - # Samsung 14.5" 3K (2944x1840 pixels) eDP AMOLED panel - - samsung,atna45dc02 - - const: samsung,atna33xc20 + - enum: + # Samsung 14.5" WQXGA+ (2880x1800 pixels) eDP AMOLED panel + - samsung,atna45af01 + # Samsung 14.5" 3K (2944x1840 pixels) eDP AMOLED panel + - samsung,atna45dc02 + - const: samsung,atna33xc20 enable-gpios: true port: true From f39bae2e028b841732ca81d8131d27b48a6051ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 15:42:23 -0400 Subject: [PATCH 0680/1052] bcachefs: Switch to .get_inode_acl() .set_acl() requires a dentry, and if one isn't passed it marks the VFS inode as not having an ACL. This has been causing inodes with ACLs to have them "disappear" on bcachefs filesystem, depending on which path those inodes get pulled into the cache from. Switching to .get_inode_acl(), like other local filesystems, fixes this. 
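For orientation, the two VFS hooks differ mainly in what they are handed; roughly, as declared in include/linux/fs.h around this kernel version (shown for reference only, not part of the change itself):

    struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, int);
    struct posix_acl *(*get_inode_acl)(struct inode *, int, bool rcu);

The inode-based hook can be called for permission checks, including RCU-walk lookups signalled by the rcu flag, where no dentry is available, which is why local filesystems normally wire up .get_inode_acl for POSIX ACLs.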
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 11 +++++++---- fs/bcachefs/acl.h | 2 +- fs/bcachefs/fs.c | 8 ++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index a7b425d3c8a0..331a17f3f113 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -272,16 +272,19 @@ bch2_acl_to_xattr(struct btree_trans *trans, return xattr; } -struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, - struct dentry *dentry, int type) +struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) { - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct posix_acl *acl = NULL; + + if (rcu) + return ERR_PTR(-ECHILD); + + struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index 27e7eec0f278..fe730a6bf0c1 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -28,7 +28,7 @@ void bch2_acl_to_text(struct printbuf *, const void *, size_t); #ifdef CONFIG_BCACHEFS_POSIX_ACL -struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); +struct posix_acl *bch2_get_acl(struct inode *, int, bool); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3a5f49affa0a..15fc41e63b6c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1199,7 +1199,7 @@ static const struct inode_operations bch_file_inode_operations = { .fiemap = bch2_fiemap, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1219,7 +1219,7 @@ static const struct inode_operations bch_dir_inode_operations = { .tmpfile = bch2_tmpfile, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1241,7 +1241,7 @@ static const struct inode_operations bch_symlink_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1251,7 +1251,7 @@ static const struct inode_operations bch_special_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; From cb5b81bc9a448f8db817566f60f92e2ea788ea0f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 8 Aug 2024 12:29:40 -0700 Subject: [PATCH 0681/1052] module: warn about excessively long module waits Russell King reported that the arm cbc(aes) crypto module hangs when loaded, and Herbert Xu bisected it to commit 9b9879fc0327 ("modules: catch concurrent module loads, treat them as idempotent"), and noted: "So what's happening here is that the first modprobe tries to load a fallback CBC implementation, in doing so it triggers a load of the exact same module due to module aliases. IOW we're loading aes-arm-bs which provides cbc(aes). 
However, this needs a fallback of cbc(aes) to operate, which is made out of the generic cbc module + any implementation of aes, or ecb(aes). The latter happens to also be provided by aes-arm-cb so that's why it tries to load the same module again" So loading the aes-arm-bs module ends up wanting to recursively load itself, and the recursive load then ends up waiting for the original module load to complete. This is a regression, in that it used to be that we just tried to load the module multiple times, and then as we went on to install it the second time we would instead just error out because the module name already existed. That is actually also exactly what the original "catch concurrent loads" patch did in commit 9828ed3f695a ("module: error out early on concurrent load of the same module file"), but it turns out that it ends up being racy, in that erroring out before the module has been fully initialized will cause failures in dependent module loading. See commit ac2263b588df (which was the revert of that "error out early") commit for details about why erroring out before the module has been initialized is actually fundamentally racy. Now, for the actual recursive module load (as opposed to just concurrently loading the same module twice), the race is not an issue. At the same time it's hard for the kernel to see that this is recursion, because the module load is always done from a usermode helper, so the recursion is not some simple callchain within the kernel. End result: this is not the real fix, but this at least adds a warning for the situation (admittedly much too late for all the debugging pain that Russell and Herbert went through) and if we can come to a resolution on how to detect the recursion properly, this re-organizes the code to make that easier. Link: https://lore.kernel.org/all/ZrFHLqvFqhzykuYw@shell.armlinux.org.uk/ Reported-by: Russell King Debugged-by: Herbert Xu Signed-off-by: Linus Torvalds --- kernel/module/main.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index d9592195c5bb..6f4ec857bdef 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3183,15 +3183,28 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int if (!f || !(f->f_mode & FMODE_READ)) return -EBADF; - /* See if somebody else is doing the operation? */ - if (idempotent(&idem, file_inode(f))) { - wait_for_completion(&idem.complete); - return idem.ret; + /* Are we the winners of the race and get to do this? */ + if (!idempotent(&idem, file_inode(f))) { + int ret = init_module_from_file(f, uargs, flags); + return idempotent_complete(&idem, ret); } - /* Otherwise, we'll do it and complete others */ - return idempotent_complete(&idem, - init_module_from_file(f, uargs, flags)); + /* + * Somebody else won the race and is loading the module. + * + * We have to wait for it forever, since our 'idem' is + * on the stack and the list entry stays there until + * completed (but we could fix it under the idem_lock) + * + * It's also unclear what a real timeout might be, + * but we could maybe at least make this killable + * and remove the idem entry in that case? 
+ */ + for (;;) { + if (wait_for_completion_timeout(&idem.complete, 10*HZ)) + return idem.ret; + pr_warn_once("module '%pD' taking a long time to load", f); + } } SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) From 23a58b782f864951485d7a0018549729e007cb43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20St=C4=99pniak?= Date: Wed, 7 Aug 2024 02:12:19 +0200 Subject: [PATCH 0682/1052] ASoC: amd: yc: Support mic on Lenovo Thinkpad E14 Gen 6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lenovo Thinkpad E14 Gen 6 (model type 21M3) needs a quirk entry for internal mic to work. Signed-off-by: Krzysztof Stępniak Link: https://patch.msgid.link/20240807001219.1147-1-kfs.szk@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index f4bbfffe9fcb..d30752c0dab2 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -220,6 +220,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "21J6"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21M3"), + } + }, { .driver_data = &acp6x_card, .matches = { From 4684a2df9c5b3fc914377127faf2515aa9049093 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 7 Aug 2024 10:53:55 +0800 Subject: [PATCH 0683/1052] ASoC: codecs: ES8326: button detect issue We find that we need to set snd_jack_types to 0. If not, there will be a probability of button detection errors Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20240807025356.24904-2-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index b246694ebb4f..be3c79232a31 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -843,6 +843,8 @@ static void es8326_jack_detect_handler(struct work_struct *work) es8326_disable_micbias(es8326->component); if (es8326->jack->status & SND_JACK_HEADPHONE) { dev_dbg(comp->dev, "Report hp remove event\n"); + snd_soc_jack_report(es8326->jack, 0, + SND_JACK_BTN_0 | SND_JACK_BTN_1 | SND_JACK_BTN_2); snd_soc_jack_report(es8326->jack, 0, SND_JACK_HEADSET); /* mute adc when mic path switch */ regmap_write(es8326->regmap, ES8326_ADC1_SRC, 0x44); From 6675e76a5c441b52b1b983ebb714122087020ebe Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 7 Aug 2024 19:02:27 +0200 Subject: [PATCH 0684/1052] ASoC: amd: yc: Add quirk entry for OMEN by HP Gaming Laptop 16-n0xxx Fix the missing mic on OMEN by HP Gaming Laptop 16-n0xxx by adding the quirk entry with the board ID 8A44. 
Cc: stable@vger.kernel.org Link: https://bugzilla.suse.com/show_bug.cgi?id=1227182 Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20240807170249.16490-1-tiwai@suse.de Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index d30752c0dab2..0523c16305db 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -416,6 +416,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_BOARD_NAME, "8A43"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "HP"), + DMI_MATCH(DMI_BOARD_NAME, "8A44"), + } + }, { .driver_data = &acp6x_card, .matches = { From 2f3e2c9eaafc272266344d777f8de44f8632e247 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 6 Aug 2024 13:49:28 +0200 Subject: [PATCH 0685/1052] ASoC: dt-bindings: qcom,wcd937x: Correct reset GPIO polarity in example The reset GPIO of WCD9370/WCD9375 is active low and that's how it is routed on typical boards, so correct the example DTS to use expected polarity. Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240806114931.40090-1-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/qcom,wcd937x.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/sound/qcom,wcd937x.yaml b/Documentation/devicetree/bindings/sound/qcom,wcd937x.yaml index de397d879acc..f94203798f24 100644 --- a/Documentation/devicetree/bindings/sound/qcom,wcd937x.yaml +++ b/Documentation/devicetree/bindings/sound/qcom,wcd937x.yaml @@ -42,7 +42,7 @@ examples: pinctrl-names = "default", "sleep"; pinctrl-0 = <&wcd_reset_n>; pinctrl-1 = <&wcd_reset_n_sleep>; - reset-gpios = <&tlmm 83 GPIO_ACTIVE_HIGH>; + reset-gpios = <&tlmm 83 GPIO_ACTIVE_LOW>; vdd-buck-supply = <&vreg_l17b_1p8>; vdd-rxtx-supply = <&vreg_l18b_1p8>; vdd-px-supply = <&vreg_l18b_1p8>; From 55922275702e112652d314a9b6a6ca31d4b7252e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 6 Aug 2024 13:49:29 +0200 Subject: [PATCH 0686/1052] ASoC: dt-bindings: qcom,wcd934x: Correct reset GPIO polarity in example The reset GPIO of WCD9340/WCD9341 is active low and that's how it is routed on typical boards, so correct the example DTS to use expected polarity. 
Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240806114931.40090-2-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/qcom,wcd934x.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/sound/qcom,wcd934x.yaml b/Documentation/devicetree/bindings/sound/qcom,wcd934x.yaml index beb0ff0245b0..a65b1d1d5fdd 100644 --- a/Documentation/devicetree/bindings/sound/qcom,wcd934x.yaml +++ b/Documentation/devicetree/bindings/sound/qcom,wcd934x.yaml @@ -199,10 +199,11 @@ additionalProperties: false examples: - | + #include codec@1,0{ compatible = "slim217,250"; reg = <1 0>; - reset-gpios = <&tlmm 64 0>; + reset-gpios = <&tlmm 64 GPIO_ACTIVE_LOW>; slim-ifc-dev = <&wcd9340_ifd>; #sound-dai-cells = <1>; interrupt-parent = <&tlmm>; From 871f1a16fa3506487de24b05d68be45e9185e77a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 6 Aug 2024 13:49:30 +0200 Subject: [PATCH 0687/1052] ASoC: dt-bindings: qcom,wcd938x: Correct reset GPIO polarity in example The reset GPIO of WCD9380/WCD9385 is active low and that's how it is routed on typical boards, so correct the example DTS to use expected polarity. Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240806114931.40090-3-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/qcom,wcd938x.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/sound/qcom,wcd938x.yaml b/Documentation/devicetree/bindings/sound/qcom,wcd938x.yaml index cf6c3787adfe..10531350c336 100644 --- a/Documentation/devicetree/bindings/sound/qcom,wcd938x.yaml +++ b/Documentation/devicetree/bindings/sound/qcom,wcd938x.yaml @@ -34,9 +34,10 @@ unevaluatedProperties: false examples: - | + #include codec { compatible = "qcom,wcd9380-codec"; - reset-gpios = <&tlmm 32 0>; + reset-gpios = <&tlmm 32 GPIO_ACTIVE_LOW>; #sound-dai-cells = <1>; qcom,tx-device = <&wcd938x_tx>; qcom,rx-device = <&wcd938x_rx>; From 81f88fddef9cddae6b4e5d9359022c7a2a3e3b6a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 6 Aug 2024 13:49:31 +0200 Subject: [PATCH 0688/1052] ASoC: dt-bindings: qcom,wcd939x: Correct reset GPIO polarity in example The reset GPIO of WCD9390/WCD9395 is active low and that's how it is routed on typical boards, so correct the example DTS to use expected polarity, instead of IRQ flag (which is a logical mistake on its own). 
Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240806114931.40090-4-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/qcom,wcd939x.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/sound/qcom,wcd939x.yaml b/Documentation/devicetree/bindings/sound/qcom,wcd939x.yaml index 6e76f6a8634f..c69291f4d575 100644 --- a/Documentation/devicetree/bindings/sound/qcom,wcd939x.yaml +++ b/Documentation/devicetree/bindings/sound/qcom,wcd939x.yaml @@ -52,10 +52,10 @@ unevaluatedProperties: false examples: - | - #include + #include codec { compatible = "qcom,wcd9390-codec"; - reset-gpios = <&tlmm 32 IRQ_TYPE_NONE>; + reset-gpios = <&tlmm 32 GPIO_ACTIVE_LOW>; #sound-dai-cells = <1>; qcom,tx-device = <&wcd939x_tx>; qcom,rx-device = <&wcd939x_rx>; From 2f11f61f9d4d5692bcebb9d089429ee0c046e08a Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 7 Aug 2024 15:01:40 +0100 Subject: [PATCH 0689/1052] MAINTAINERS: Update Cirrus Logic parts to linux-sound mailing list Now that most kernel work on sound has moved over to the linux-sound mailing list so should the Cirrus Logic audio parts. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20240807140140.421359-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 42decde38320..d304054d661e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5306,7 +5306,7 @@ F: drivers/media/cec/i2c/ch7322.c CIRRUS LOGIC AUDIO CODEC DRIVERS M: David Rhodes M: Richard Fitzgerald -L: alsa-devel@alsa-project.org (moderated for non-subscribers) +L: linux-sound@vger.kernel.org L: patches@opensource.cirrus.com S: Maintained F: Documentation/devicetree/bindings/sound/cirrus,cs* @@ -5375,7 +5375,7 @@ F: sound/soc/codecs/lochnagar-sc.c CIRRUS LOGIC MADERA CODEC DRIVERS M: Charles Keepax M: Richard Fitzgerald -L: alsa-devel@alsa-project.org (moderated for non-subscribers) +L: linux-sound@vger.kernel.org L: patches@opensource.cirrus.com S: Supported W: https://github.com/CirrusLogic/linux-drivers/wiki From 5003d0ce5c7da3a02c0aff771f516f99731e7390 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 7 Aug 2024 18:27:03 +0200 Subject: [PATCH 0690/1052] ASoC: meson: axg-fifo: fix irq scheduling issue with PREEMPT_RT With PREEMPT_RT enabled a spinlock_t becomes a sleeping lock. This is usually not a problem with spinlocks used in IRQ context since IRQ handlers get threaded. However, if IRQF_ONESHOT is set, the primary handler won't be force-threaded and runs always in hardirq context. This is a problem because spinlock_t requires a preemptible context on PREEMPT_RT. In this particular instance, regmap mmio uses spinlock_t to protect the register access and IRQF_ONESHOT is set on the IRQ. In this case, it is actually better to do everything in threaded handler and it solves the problem with PREEMPT_RT. 
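The core API pattern being relied on, as a minimal sketch with hypothetical foo_* names rather than code from the driver: passing a NULL primary handler together with IRQF_ONESHOT makes the genirq core keep the line masked until the thread finishes and run all of the driver's work in the IRQ thread, where sleeping locks such as a PREEMPT_RT spinlock_t are legal.

    static irqreturn_t foo_irq_thread(int irq, void *dev_id)
    {
            struct foo_device *priv = dev_id;

            /* process context: regmap mmio / spinlock_t access is safe here */
            foo_handle_period(priv);
            return IRQ_HANDLED;
    }

    err = request_threaded_irq(irq, NULL, foo_irq_thread,
                               IRQF_ONESHOT, dev_name(dev), priv);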
Reported-by: Arseniy Krasnov Closes: https://lore.kernel.org/linux-amlogic/20240729131652.3012327-1-avkrasnov@salutedevices.com Suggested-by: Sebastian Andrzej Siewior Fixes: b11d26660dff ("ASoC: meson: axg-fifo: use threaded irq to check periods") Signed-off-by: Jerome Brunet Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20240807162705.4024136-1-jbrunet@baylibre.com Signed-off-by: Mark Brown --- sound/soc/meson/axg-fifo.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/sound/soc/meson/axg-fifo.c b/sound/soc/meson/axg-fifo.c index 7e6090af720b..75909196b769 100644 --- a/sound/soc/meson/axg-fifo.c +++ b/sound/soc/meson/axg-fifo.c @@ -207,25 +207,18 @@ static irqreturn_t axg_fifo_pcm_irq_block(int irq, void *dev_id) status = FIELD_GET(STATUS1_INT_STS, status); axg_fifo_ack_irq(fifo, status); - /* Use the thread to call period elapsed on nonatomic links */ - if (status & FIFO_INT_COUNT_REPEAT) - return IRQ_WAKE_THREAD; + if (status & ~FIFO_INT_COUNT_REPEAT) + dev_dbg(axg_fifo_dev(ss), "unexpected irq - STS 0x%02x\n", + status); - dev_dbg(axg_fifo_dev(ss), "unexpected irq - STS 0x%02x\n", - status); + if (status & FIFO_INT_COUNT_REPEAT) { + snd_pcm_period_elapsed(ss); + return IRQ_HANDLED; + } return IRQ_NONE; } -static irqreturn_t axg_fifo_pcm_irq_block_thread(int irq, void *dev_id) -{ - struct snd_pcm_substream *ss = dev_id; - - snd_pcm_period_elapsed(ss); - - return IRQ_HANDLED; -} - int axg_fifo_pcm_open(struct snd_soc_component *component, struct snd_pcm_substream *ss) { @@ -251,8 +244,9 @@ int axg_fifo_pcm_open(struct snd_soc_component *component, if (ret) return ret; - ret = request_threaded_irq(fifo->irq, axg_fifo_pcm_irq_block, - axg_fifo_pcm_irq_block_thread, + /* Use the threaded irq handler only with non-atomic links */ + ret = request_threaded_irq(fifo->irq, NULL, + axg_fifo_pcm_irq_block, IRQF_ONESHOT, dev_name(dev), ss); if (ret) return ret; From 72776774b55bb59b7b1b09117e915a5030110304 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Wed, 7 Aug 2024 14:26:48 +0000 Subject: [PATCH 0691/1052] ASoC: cs35l56: Patch CS35L56_IRQ1_MASK_18 to the default value Device tuning files made with early revision tooling may contain configuration that can unmask IRQ signals that are owned by the host. Adding a safe default to the regmap patch ensures that the hardware matches the driver expectations. Signed-off-by: Simon Trimmer Link: https://patch.msgid.link/20240807142648.46932-1-simont@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56-shared.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index e7e8d617da94..bd74fef33d49 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -36,6 +36,7 @@ static const struct reg_sequence cs35l56_patch[] = { { CS35L56_SWIRE_DP3_CH2_INPUT, 0x00000019 }, { CS35L56_SWIRE_DP3_CH3_INPUT, 0x00000029 }, { CS35L56_SWIRE_DP3_CH4_INPUT, 0x00000028 }, + { CS35L56_IRQ1_MASK_18, 0x1f7df0ff }, /* These are not reset by a soft-reset, so patch to defaults. */ { CS35L56_MAIN_RENDER_USER_MUTE, 0x00000000 }, From 0c84bde4f37ba27d50e4c70ecacd33fe4a57030d Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 8 Aug 2024 10:35:19 +0200 Subject: [PATCH 0692/1052] media: Revert "media: dvb-usb: Fix unexpected infinite loop in dvb_usb_read_remote_control()" This reverts commit 2052138b7da52ad5ccaf74f736d00f39a1c9198c. This breaks the TeVii s480 dual DVB-S2 S660. 
The device has a bulk in endpoint but no corresponding out endpoint, so the device does not pass the "has both receive and send bulk endpoint" test. Seemingly this device does not use dvb_usb_generic_rw() so I have tried removing the generic_bulk_ctrl_endpoint entry, but this resulted in different problems. As we have no explanation yet, revert. $ dmesg | grep -i -e dvb -e dw21 -e usb\ 4 [ 0.999122] usb 1-1: new high-speed USB device number 2 using ehci-pci [ 1.023123] usb 4-1: new high-speed USB device number 2 using ehci-pci [ 1.130247] usb 1-1: New USB device found, idVendor=9022, idProduct=d482, +bcdDevice= 0.01 [ 1.130257] usb 1-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0 [ 1.152323] usb 4-1: New USB device found, idVendor=9022, idProduct=d481, +bcdDevice= 0.01 [ 1.152329] usb 4-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0 [ 6.701033] dvb-usb: found a 'TeVii S480.2 USB' in cold state, will try to +load a firmware [ 6.701178] dvb-usb: downloading firmware from file 'dvb-usb-s660.fw' [ 6.701179] dw2102: start downloading DW210X firmware [ 6.703715] dvb-usb: found a 'Microsoft Xbox One Digital TV Tuner' in cold +state, will try to load a firmware [ 6.703974] dvb-usb: downloading firmware from file 'dvb-usb-dib0700-1.20.fw' [ 6.756432] usb 1-1: USB disconnect, device number 2 [ 6.862119] dvb-usb: found a 'TeVii S480.2 USB' in warm state. [ 6.862194] dvb-usb: TeVii S480.2 USB error while loading driver (-22) [ 6.862209] dvb-usb: found a 'TeVii S480.1 USB' in cold state, will try to +load a firmware [ 6.862244] dvb-usb: downloading firmware from file 'dvb-usb-s660.fw' [ 6.862245] dw2102: start downloading DW210X firmware [ 6.914811] usb 4-1: USB disconnect, device number 2 [ 7.014131] dvb-usb: found a 'TeVii S480.1 USB' in warm state. 
[ 7.014487] dvb-usb: TeVii S480.1 USB error while loading driver (-22) [ 7.014538] usbcore: registered new interface driver dw2102 Closes: https://lore.kernel.org/stable/20240801165146.38991f60@mir/ Fixes: 2052138b7da5 ("media: dvb-usb: Fix unexpected infinite loop in dvb_usb_read_remote_control()") Reported-by: Stefan Lippers-Hollmann Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/usb/dvb-usb/dvb-usb-init.c | 35 +++--------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/drivers/media/usb/dvb-usb/dvb-usb-init.c b/drivers/media/usb/dvb-usb/dvb-usb-init.c index 22d83ac18eb7..fbf58012becd 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb-init.c +++ b/drivers/media/usb/dvb-usb/dvb-usb-init.c @@ -23,40 +23,11 @@ static int dvb_usb_force_pid_filter_usage; module_param_named(force_pid_filter_usage, dvb_usb_force_pid_filter_usage, int, 0444); MODULE_PARM_DESC(force_pid_filter_usage, "force all dvb-usb-devices to use a PID filter, if any (default: 0)."); -static int dvb_usb_check_bulk_endpoint(struct dvb_usb_device *d, u8 endpoint) -{ - if (endpoint) { - int ret; - - ret = usb_pipe_type_check(d->udev, usb_sndbulkpipe(d->udev, endpoint)); - if (ret) - return ret; - ret = usb_pipe_type_check(d->udev, usb_rcvbulkpipe(d->udev, endpoint)); - if (ret) - return ret; - } - return 0; -} - -static void dvb_usb_clear_halt(struct dvb_usb_device *d, u8 endpoint) -{ - if (endpoint) { - usb_clear_halt(d->udev, usb_sndbulkpipe(d->udev, endpoint)); - usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, endpoint)); - } -} - static int dvb_usb_adapter_init(struct dvb_usb_device *d, short *adapter_nrs) { struct dvb_usb_adapter *adap; int ret, n, o; - ret = dvb_usb_check_bulk_endpoint(d, d->props.generic_bulk_ctrl_endpoint); - if (ret) - return ret; - ret = dvb_usb_check_bulk_endpoint(d, d->props.generic_bulk_ctrl_endpoint_response); - if (ret) - return ret; for (n = 0; n < d->props.num_adapters; n++) { adap = &d->adapter[n]; adap->dev = d; @@ -132,8 +103,10 @@ static int dvb_usb_adapter_init(struct dvb_usb_device *d, short *adapter_nrs) * when reloading the driver w/o replugging the device * sometimes a timeout occurs, this helps */ - dvb_usb_clear_halt(d, d->props.generic_bulk_ctrl_endpoint); - dvb_usb_clear_halt(d, d->props.generic_bulk_ctrl_endpoint_response); + if (d->props.generic_bulk_ctrl_endpoint != 0) { + usb_clear_halt(d->udev, usb_sndbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); + usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); + } return 0; From 05a3d6e9307250a5911d75308e4363466794ab21 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 8 Aug 2024 11:57:38 -0400 Subject: [PATCH 0693/1052] selinux: revert our use of vma_is_initial_heap() Unfortunately it appears that vma_is_initial_heap() is currently broken for applications that do not currently have any heap allocated, e.g. brk == start_brk. The breakage is such that it will cause SELinux to check for the process/execheap permission on memory regions that cross brk/start_brk even when there is no heap. The proper fix would be to correct vma_is_initial_heap(), but as there are multiple callers I am hesitant to unilaterally modify the helper out of concern that I would end up breaking some other subsystem. 
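For reference, the helper in question is roughly the following (illustration only; this patch deliberately does not touch it):

    /* include/linux/mm.h */
    static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
    {
            return vma->vm_start < vma->vm_mm->brk &&
                   vma->vm_end > vma->vm_mm->start_brk;
    }

With brk == start_brk the two comparisons degenerate into "does this VMA straddle the brk address", so any unrelated mapping that happens to cross that address is treated as the heap and subjected to the execheap check.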
The mm developers have been made aware of the situation and hopefully they will have a fix at some point in the future, but we need a fix soon so we are simply going to revert our use of vma_is_initial_heap() in favor of our old logic/code which works as expected, even in the face of a zero size heap. We can return to using vma_is_initial_heap() at some point in the future when it is fixed. Cc: stable@vger.kernel.org Reported-by: Marc Reisner Closes: https://lore.kernel.org/all/ZrPmoLKJEf1wiFmM@marcreisner.com Fixes: 68df1baf158f ("selinux: use vma_is_initial_stack() and vma_is_initial_heap()") Signed-off-by: Paul Moore --- security/selinux/hooks.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 55c78c318ccd..bfa61e005aac 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3852,7 +3852,17 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, if (default_noexec && (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { int rc = 0; - if (vma_is_initial_heap(vma)) { + /* + * We don't use the vma_is_initial_heap() helper as it has + * a history of problems and is currently broken on systems + * where there is no heap, e.g. brk == start_brk. Before + * replacing the conditional below with vma_is_initial_heap(), + * or something similar, please ensure that the logic is the + * same as what we have below or you have tested every possible + * corner case you can think to test. + */ + if (vma->vm_start >= vma->vm_mm->start_brk && + vma->vm_end <= vma->vm_mm->brk) { rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECHEAP, NULL); } else if (!vma->vm_file && (vma_is_initial_stack(vma) || From 36bb22a08a69d9984a8399c07310d18b115eae20 Mon Sep 17 00:00:00 2001 From: Gleb Korobeynikov Date: Thu, 8 Aug 2024 18:47:48 +0300 Subject: [PATCH 0694/1052] cifs: cifs_inval_name_dfs_link_error: correct the check for fullpath Replace the always-true check tcon->origin_fullpath with check of server->leaf_fullpath See https://bugzilla.kernel.org/show_bug.cgi?id=219083 The check of the new @tcon will always be true during mounting, since @tcon->origin_fullpath will only be set after the tree is connected to the latest common resource, as well as checking if the prefix paths from it are fully accessible. 
Fixes: 3ae872de4107 ("smb: client: fix shared DFS root mounts with different prefixes") Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Gleb Korobeynikov Signed-off-by: Steve French --- fs/smb/client/misc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 3fe5bfc389d0..c6f11e6f9eb9 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -1234,6 +1234,7 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, const char *full_path, bool *islink) { + struct TCP_Server_Info *server = tcon->ses->server; struct cifs_ses *ses = tcon->ses; size_t len; char *path; @@ -1250,12 +1251,12 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, !is_tcon_dfs(tcon)) return 0; - spin_lock(&tcon->tc_lock); - if (!tcon->origin_fullpath) { - spin_unlock(&tcon->tc_lock); + spin_lock(&server->srv_lock); + if (!server->leaf_fullpath) { + spin_unlock(&server->srv_lock); return 0; } - spin_unlock(&tcon->tc_lock); + spin_unlock(&server->srv_lock); /* * Slow path - tcon is DFS and @full_path has prefix path, so attempt From a018c1b636e79b60149b41151ded7c2606d8606e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 5 Aug 2024 08:56:18 +0900 Subject: [PATCH 0695/1052] ksmbd: override fsids for share path check Sangsoo reported that a DAC denial error occurred when accessing files through the ksmbd thread. This patch override fsids for share path check. Reported-by: Sangsoo Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/mgmt/share_config.c | 15 ++++++++++++--- fs/smb/server/mgmt/share_config.h | 4 +++- fs/smb/server/mgmt/tree_connect.c | 9 +++++---- fs/smb/server/mgmt/tree_connect.h | 4 ++-- fs/smb/server/smb2pdu.c | 2 +- fs/smb/server/smb_common.c | 9 +++++++-- fs/smb/server/smb_common.h | 2 ++ 7 files changed, 32 insertions(+), 13 deletions(-) diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index e0a6b758094f..d8d03070ae44 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -15,6 +15,7 @@ #include "share_config.h" #include "user_config.h" #include "user_session.h" +#include "../connection.h" #include "../transport_ipc.h" #include "../misc.h" @@ -120,12 +121,13 @@ static int parse_veto_list(struct ksmbd_share_config *share, return 0; } -static struct ksmbd_share_config *share_config_request(struct unicode_map *um, +static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, const char *name) { struct ksmbd_share_config_response *resp; struct ksmbd_share_config *share = NULL; struct ksmbd_share_config *lookup; + struct unicode_map *um = work->conn->um; int ret; resp = ksmbd_ipc_share_config_request(name); @@ -181,7 +183,14 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um, KSMBD_SHARE_CONFIG_VETO_LIST(resp), resp->veto_list_sz); if (!ret && share->path) { + if (__ksmbd_override_fsids(work, share)) { + kill_share(share); + share = NULL; + goto out; + } + ret = kern_path(share->path, 0, &share->vfs_path); + ksmbd_revert_fsids(work); if (ret) { ksmbd_debug(SMB, "failed to access '%s'\n", share->path); @@ -214,7 +223,7 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um, return share; } -struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, +struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work, const char *name) { struct ksmbd_share_config *share; @@ -227,7 +236,7 @@ struct ksmbd_share_config 
*ksmbd_share_config_get(struct unicode_map *um, if (share) return share; - return share_config_request(um, name); + return share_config_request(work, name); } bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, diff --git a/fs/smb/server/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h index 5f591751b923..d4ac2dd4de20 100644 --- a/fs/smb/server/mgmt/share_config.h +++ b/fs/smb/server/mgmt/share_config.h @@ -11,6 +11,8 @@ #include #include +struct ksmbd_work; + struct ksmbd_share_config { char *name; char *path; @@ -68,7 +70,7 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share) __ksmbd_share_config_put(share); } -struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, +struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work, const char *name); bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, const char *filename); diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c index d2c81a8a11dd..94a52a75014a 100644 --- a/fs/smb/server/mgmt/tree_connect.c +++ b/fs/smb/server/mgmt/tree_connect.c @@ -16,17 +16,18 @@ #include "user_session.h" struct ksmbd_tree_conn_status -ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - const char *share_name) +ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name) { struct ksmbd_tree_conn_status status = {-ENOENT, NULL}; struct ksmbd_tree_connect_response *resp = NULL; struct ksmbd_share_config *sc; struct ksmbd_tree_connect *tree_conn = NULL; struct sockaddr *peer_addr; + struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; int ret; - sc = ksmbd_share_config_get(conn->um, share_name); + sc = ksmbd_share_config_get(work, share_name); if (!sc) return status; @@ -61,7 +62,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct ksmbd_share_config *new_sc; ksmbd_share_config_del(sc); - new_sc = ksmbd_share_config_get(conn->um, share_name); + new_sc = ksmbd_share_config_get(work, share_name); if (!new_sc) { pr_err("Failed to update stale share config\n"); status.ret = -ESTALE; diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h index 6377a70b811c..a42cdd051041 100644 --- a/fs/smb/server/mgmt/tree_connect.h +++ b/fs/smb/server/mgmt/tree_connect.h @@ -13,6 +13,7 @@ struct ksmbd_share_config; struct ksmbd_user; struct ksmbd_conn; +struct ksmbd_work; enum { TREE_NEW = 0, @@ -50,8 +51,7 @@ static inline int test_tree_conn_flag(struct ksmbd_tree_connect *tree_conn, struct ksmbd_session; struct ksmbd_tree_conn_status -ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - const char *share_name); +ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name); void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 37a39ab4ee65..54154d36ea2f 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1955,7 +1955,7 @@ int smb2_tree_connect(struct ksmbd_work *work) ksmbd_debug(SMB, "tree connect request for tree %s treename %s\n", name, treename); - status = ksmbd_tree_conn_connect(conn, sess, name); + status = ksmbd_tree_conn_connect(work, name); if (status.ret == KSMBD_TREE_CONN_STATUS_OK) rsp->hdr.Id.SyncId.TreeId = cpu_to_le32(status.tree_conn->id); else diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 
474dadf6b7b8..13818ecb6e1b 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -732,10 +732,10 @@ bool is_asterisk(char *p) return p && p[0] == '*'; } -int ksmbd_override_fsids(struct ksmbd_work *work) +int __ksmbd_override_fsids(struct ksmbd_work *work, + struct ksmbd_share_config *share) { struct ksmbd_session *sess = work->sess; - struct ksmbd_share_config *share = work->tcon->share_conf; struct cred *cred; struct group_info *gi; unsigned int uid; @@ -775,6 +775,11 @@ int ksmbd_override_fsids(struct ksmbd_work *work) return 0; } +int ksmbd_override_fsids(struct ksmbd_work *work) +{ + return __ksmbd_override_fsids(work, work->tcon->share_conf); +} + void ksmbd_revert_fsids(struct ksmbd_work *work) { const struct cred *cred; diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index f1092519c0c2..4a3148b0167f 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -447,6 +447,8 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command); int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp); +int __ksmbd_override_fsids(struct ksmbd_work *work, + struct ksmbd_share_config *share); int ksmbd_override_fsids(struct ksmbd_work *work); void ksmbd_revert_fsids(struct ksmbd_work *work); From f6bd41280a44dcc2e0a25ed72617d25f586974a7 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 5 Aug 2024 08:57:03 +0900 Subject: [PATCH 0696/1052] ksmbd: override fsids for smb2_query_info() Sangsoo reported that a DAC denial error occurred when accessing files through the ksmbd thread. This patch override fsids for smb2_query_info(). Reported-by: Sangsoo Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 54154d36ea2f..2df1354288e6 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5596,6 +5596,11 @@ int smb2_query_info(struct ksmbd_work *work) ksmbd_debug(SMB, "GOT query info request\n"); + if (ksmbd_override_fsids(work)) { + rc = -ENOMEM; + goto err_out; + } + switch (req->InfoType) { case SMB2_O_INFO_FILE: ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n"); @@ -5614,6 +5619,7 @@ int smb2_query_info(struct ksmbd_work *work) req->InfoType); rc = -EOPNOTSUPP; } + ksmbd_revert_fsids(work); if (!rc) { rsp->StructureSize = cpu_to_le16(9); @@ -5623,6 +5629,7 @@ int smb2_query_info(struct ksmbd_work *work) le32_to_cpu(rsp->OutputBufferLength)); } +err_out: if (rc < 0) { if (rc == -EACCES) rsp->hdr.Status = STATUS_ACCESS_DENIED; From e5876b088ba03a62124266fa20d00e65533c7269 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 6 Aug 2024 19:28:05 +0200 Subject: [PATCH 0697/1052] usbnet: ipheth: race between ipheth_close and error handling ipheth_sndbulk_callback() can submit carrier_work as a part of its error handling. That means that the driver must make sure that the work is cancelled after it has made sure that no more URB can terminate with an error condition. Hence the order of actions in ipheth_close() needs to be inverted. Signed-off-by: Oliver Neukum Signed-off-by: Foster Snowhill Tested-by: Georgi Valkov Signed-off-by: David S. 
Miller --- drivers/net/usb/ipheth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 687d70cfc556..6eeef10edada 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -475,8 +475,8 @@ static int ipheth_close(struct net_device *net) { struct ipheth_device *dev = netdev_priv(net); - cancel_delayed_work_sync(&dev->carrier_work); netif_stop_queue(net); + cancel_delayed_work_sync(&dev->carrier_work); return 0; } From 655b46d7a39ac6f049698b27c1568c0f7ff85d1e Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Tue, 6 Aug 2024 19:28:06 +0200 Subject: [PATCH 0698/1052] usbnet: ipheth: remove extraneous rx URB length check Rx URB length was already checked in ipheth_rcvbulk_callback_legacy() and ipheth_rcvbulk_callback_ncm(), depending on the current mode. The check in ipheth_rcvbulk_callback() was thus mostly a duplicate. The only place in ipheth_rcvbulk_callback() where we care about the URB length is for the initial control frame. These frames are always 4 bytes long. This has been checked as far back as iOS 4.2.1 on iPhone 3G. Remove the extraneous URB length check. For control frames, check for the specific 4-byte length instead. Signed-off-by: Foster Snowhill Tested-by: Georgi Valkov Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 6eeef10edada..017255615508 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -286,11 +286,6 @@ static void ipheth_rcvbulk_callback(struct urb *urb) return; } - if (urb->actual_length <= IPHETH_IP_ALIGN) { - dev->net->stats.rx_length_errors++; - return; - } - /* RX URBs starting with 0x00 0x01 do not encapsulate Ethernet frames, * but rather are control frames. Their purpose is not documented, and * they don't affect driver functionality, okay to drop them. @@ -298,7 +293,8 @@ static void ipheth_rcvbulk_callback(struct urb *urb) * URB received from the bulk IN endpoint. */ if (unlikely - (((char *)urb->transfer_buffer)[0] == 0 && + (urb->actual_length == 4 && + ((char *)urb->transfer_buffer)[0] == 0 && ((char *)urb->transfer_buffer)[1] == 1)) goto rx_submit; From 94d7eeb6c0ef0310992944f0d0296929816a2cb0 Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Tue, 6 Aug 2024 19:28:07 +0200 Subject: [PATCH 0699/1052] usbnet: ipheth: drop RX URBs with no payload On iPhone 15 Pro Max one can observe periodic URBs with no payload on the "bulk in" (RX) endpoint. These don't seem to do anything meaningful. Reproduced on iOS 17.5.1 and 17.6. This behaviour isn't observed on iPhone 11 on the same iOS version. The nature of these zero-length URBs is so far unknown. Drop RX URBs with no payload. Signed-off-by: Foster Snowhill Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 017255615508..f04c7bf79665 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -286,6 +286,12 @@ static void ipheth_rcvbulk_callback(struct urb *urb) return; } + /* iPhone may periodically send URBs with no payload + * on the "bulk in" endpoint. It is safe to ignore them. + */ + if (urb->actual_length == 0) + goto rx_submit; + /* RX URBs starting with 0x00 0x01 do not encapsulate Ethernet frames, * but rather are control frames. 
Their purpose is not documented, and * they don't affect driver functionality, okay to drop them. From 74efed51e0a4d62f998f806c307778b47fc73395 Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Tue, 6 Aug 2024 19:28:08 +0200 Subject: [PATCH 0700/1052] usbnet: ipheth: do not stop RX on failing RX callback RX callbacks can fail for multiple reasons: * Payload too short * Payload formatted incorrecly (e.g. bad NCM framing) * Lack of memory None of these should cause the driver to seize up. Make such failures non-critical and continue processing further incoming URBs. Signed-off-by: Foster Snowhill Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index f04c7bf79665..cdc72559790a 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -308,7 +308,6 @@ static void ipheth_rcvbulk_callback(struct urb *urb) if (retval != 0) { dev_err(&dev->intf->dev, "%s: callback retval: %d\n", __func__, retval); - return; } rx_submit: From 67927a1b255d883881be9467508e0af9a5e0be9d Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Tue, 6 Aug 2024 19:28:09 +0200 Subject: [PATCH 0701/1052] usbnet: ipheth: fix carrier detection in modes 1 and 4 Apart from the standard "configurations", "interfaces" and "alternate interface settings" in USB, iOS devices also have a notion of "modes". In different modes, the device exposes a different set of available configurations. Depending on the iOS version, and depending on the current mode, the length and contents of the carrier state control message differs: * 1 byte (seen on iOS 4.2.1, 8.4): * 03: carrier off (mode 0) * 04: carrier on (mode 0) * 3 bytes (seen on iOS 10.3.4, 15.7.6): * 03 03 03: carrier off (mode 0) * 04 04 03: carrier on (mode 0) * 4 bytes (seen on iOS 16.5, 17.6): * 03 03 03 00: carrier off (mode 0) * 04 03 03 00: carrier off (mode 1) * 06 03 03 00: carrier off (mode 4) * 04 04 03 04: carrier on (mode 0 and 1) * 06 04 03 04: carrier on (mode 4) Before this change, the driver always used the first byte of the response to determine carrier state. From this larger sample, the first byte seems to indicate the number of available USB configurations in the current mode (with the exception of the default mode 0), and in some cases (namely mode 1 and 4) does not correlate with the carrier state. Previous logic erroneously counted `04 03 03 00` as "carrier on" and `06 04 03 04` as "carrier off" on iOS versions that support mode 1 and mode 4 respectively. Only modes 0, 1 and 4 expose the USB Ethernet interfaces necessary for the ipheth driver. Check the second byte of the control message where possible, and fall back to checking the first byte on older iOS versions. Signed-off-by: Foster Snowhill Tested-by: Georgi Valkov Signed-off-by: David S. 
Miller --- drivers/net/usb/ipheth.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index cdc72559790a..46afb95ffabe 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -355,13 +355,14 @@ static int ipheth_carrier_set(struct ipheth_device *dev) 0x02, /* index */ dev->ctrl_buf, IPHETH_CTRL_BUF_SIZE, IPHETH_CTRL_TIMEOUT); - if (retval < 0) { + if (retval <= 0) { dev_err(&dev->intf->dev, "%s: usb_control_msg: %d\n", __func__, retval); return retval; } - if (dev->ctrl_buf[0] == IPHETH_CARRIER_ON) { + if ((retval == 1 && dev->ctrl_buf[0] == IPHETH_CARRIER_ON) || + (retval >= 2 && dev->ctrl_buf[1] == IPHETH_CARRIER_ON)) { netif_carrier_on(dev->net); if (dev->tx_urb->status != -EINPROGRESS) netif_wake_queue(dev->net); From 2124d84db293ba164059077944e6b429ba530495 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 9 Aug 2024 08:33:28 -0700 Subject: [PATCH 0702/1052] module: make waiting for a concurrent module loader interruptible The recursive aes-arm-bs module load situation reported by Russell King is getting fixed in the crypto layer, but this in the meantime fixes the "recursive load hangs forever" by just making the waiting for the first module load be interruptible. This should now match the old behavior before commit 9b9879fc0327 ("modules: catch concurrent module loads, treat them as idempotent"), which used the different "wait for module to be ready" code in module_patient_check_exists(). End result: a recursive module load will still block, but now a signal will interrupt it and fail the second module load, at which point the first module will successfully complete loading. Fixes: 9b9879fc0327 ("modules: catch concurrent module loads, treat them as idempotent") Cc: Russell King Cc: Herbert Xu Signed-off-by: Linus Torvalds --- kernel/module/main.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index 6f4ec857bdef..71396e297499 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3104,7 +3104,7 @@ static bool idempotent(struct idempotent *u, const void *cookie) struct idempotent *existing; bool first; - u->ret = 0; + u->ret = -EINTR; u->cookie = cookie; init_completion(&u->complete); @@ -3140,7 +3140,7 @@ static int idempotent_complete(struct idempotent *u, int ret) hlist_for_each_entry_safe(pos, next, head, entry) { if (pos->cookie != cookie) continue; - hlist_del(&pos->entry); + hlist_del_init(&pos->entry); pos->ret = ret; complete(&pos->complete); } @@ -3148,6 +3148,28 @@ static int idempotent_complete(struct idempotent *u, int ret) return ret; } +/* + * Wait for the idempotent worker. + * + * If we get interrupted, we need to remove ourselves from the + * the idempotent list, and the completion may still come in. + * + * The 'idem_lock' protects against the race, and 'idem.ret' was + * initialized to -EINTR and is thus always the right return + * value even if the idempotent work then completes between + * the wait_for_completion and the cleanup. 
+ */ +static int idempotent_wait_for_completion(struct idempotent *u) +{ + if (wait_for_completion_interruptible(&u->complete)) { + spin_lock(&idem_lock); + if (!hlist_unhashed(&u->entry)) + hlist_del(&u->entry); + spin_unlock(&idem_lock); + } + return u->ret; +} + static int init_module_from_file(struct file *f, const char __user * uargs, int flags) { struct load_info info = { }; @@ -3191,20 +3213,8 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int /* * Somebody else won the race and is loading the module. - * - * We have to wait for it forever, since our 'idem' is - * on the stack and the list entry stays there until - * completed (but we could fix it under the idem_lock) - * - * It's also unclear what a real timeout might be, - * but we could maybe at least make this killable - * and remove the idem entry in that case? */ - for (;;) { - if (wait_for_completion_timeout(&idem.complete, 10*HZ)) - return idem.ret; - pr_warn_once("module '%pD' taking a long time to load", f); - } + return idempotent_wait_for_completion(&idem); } SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) From d0949cd44a62c4c41b30ea7ae94d8c887f586882 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 8 Aug 2024 23:57:30 -0400 Subject: [PATCH 0703/1052] tracing: Return from tracing_buffers_read() if the file has been closed When running the following: # cd /sys/kernel/tracing/ # echo 1 > events/sched/sched_waking/enable # echo 1 > events/sched/sched_switch/enable # echo 0 > tracing_on # dd if=per_cpu/cpu0/trace_pipe_raw of=/tmp/raw0.dat The dd task would get stuck in an infinite loop in the kernel. What would happen is the following: When ring_buffer_read_page() returns -1 (no data) then a check is made to see if the buffer is empty (as happens when the page is not full), it will call wait_on_pipe() to wait until the ring buffer has data. When it is it will try again to read data (unless O_NONBLOCK is set). The issue happens when there's a reader and the file descriptor is closed. The wait_on_pipe() will return when that is the case. But this loop will continue to try again and wait_on_pipe() will again return immediately and the loop will continue and never stop. Simply check if the file was closed before looping and exit out if it is. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240808235730.78bf63e5@rorschach.local.home Fixes: 2aa043a55b9a7 ("tracing/ring-buffer: Fix wait_on_pipe() race") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 10cd38bce2f1..ebe7ce2f5f4a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7956,7 +7956,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, trace_access_unlock(iter->cpu_file); if (ret < 0) { - if (trace_empty(iter)) { + if (trace_empty(iter) && !iter->closed) { if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; From 90574d2a675947858b47008df8d07f75ea50d0d0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 9 Aug 2024 15:34:30 +0300 Subject: [PATCH 0704/1052] rtla/osnoise: Prevent NULL dereference in error handling If the "tool->data" allocation fails then there is no need to call osnoise_free_top() and, in fact, doing so will lead to a NULL dereference. Cc: stable@vger.kernel.org Cc: John Kacur Cc: "Luis Claudio R. 
Goncalves" Cc: Clark Williams Fixes: 1eceb2fc2ca5 ("rtla/osnoise: Add osnoise top mode") Link: https://lore.kernel.org/f964ed1f-64d2-4fde-ad3e-708331f8f358@stanley.mountain Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/osnoise_top.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index f594a44df840..2f756628613d 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -651,8 +651,10 @@ struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params) return NULL; tool->data = osnoise_alloc_top(nr_cpus); - if (!tool->data) - goto out_err; + if (!tool->data) { + osnoise_destroy_tool(tool); + return NULL; + } tool->params = params; @@ -660,11 +662,6 @@ struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params) osnoise_top_handler, NULL); return tool; - -out_err: - osnoise_free_top(tool->data); - osnoise_destroy_tool(tool); - return NULL; } static int stop_tracing; From 077e47372309dcbe3a150754ea9c6f15cc838d6b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Aug 2024 23:19:59 -0400 Subject: [PATCH 0705/1052] bcachefs: bch2_accounting_invalid() Implement bch2_accounting_invalid(); check for junk at the end, and replicas accounting entries in particular need to be checked or we'll pop asserts later. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 65 +++++++++++++++++++++++++++- fs/bcachefs/disk_accounting_format.h | 9 ++-- fs/bcachefs/replicas.c | 1 - fs/bcachefs/sb-errors_format.h | 6 ++- 4 files changed, 73 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index dcdd59249c23..046ac92b6639 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -114,11 +114,74 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans, return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); } +static inline bool is_zero(char *start, char *end) +{ + BUG_ON(start > end); + + for (; start < end; start++) + if (*start) + return false; + return true; +} + +#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) + int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags, struct printbuf *err) { - return 0; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + void *end = &acc_k + 1; + int ret = 0; + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_nr_inodes: + end = field_end(acc_k, nr_inodes); + break; + case BCH_DISK_ACCOUNTING_persistent_reserved: + end = field_end(acc_k, persistent_reserved); + break; + case BCH_DISK_ACCOUNTING_replicas: + bkey_fsck_err_on(!acc_k.replicas.nr_devs, + c, err, accounting_key_replicas_nr_devs_0, + "accounting key replicas entry with nr_devs=0"); + + bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || + (acc_k.replicas.nr_required > 1 && + acc_k.replicas.nr_required == acc_k.replicas.nr_devs), + c, err, accounting_key_replicas_nr_required_bad, + "accounting key replicas entry with bad nr_required"); + + for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) + bkey_fsck_err_on(acc_k.replicas.devs[i] > acc_k.replicas.devs[i + 1], + c, err, accounting_key_replicas_devs_unsorted, + "accounting key replicas entry with unsorted devs"); + + end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); + break; + case 
BCH_DISK_ACCOUNTING_dev_data_type: + end = field_end(acc_k, dev_data_type); + break; + case BCH_DISK_ACCOUNTING_compression: + end = field_end(acc_k, compression); + break; + case BCH_DISK_ACCOUNTING_snapshot: + end = field_end(acc_k, snapshot); + break; + case BCH_DISK_ACCOUNTING_btree: + end = field_end(acc_k, btree); + break; + case BCH_DISK_ACCOUNTING_rebalance_work: + end = field_end(acc_k, rebalance_work); + break; + } + + bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), + c, err, accounting_key_junk_at_end, + "junk at end of accounting key"); +fsck_err: + return ret; } void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index cba417060b33..848f06cc809d 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -124,10 +124,6 @@ struct bch_dev_data_type { __u8 data_type; }; -struct bch_dev_stripe_buckets { - __u8 dev; -}; - struct bch_acct_compression { __u8 type; }; @@ -140,6 +136,9 @@ struct bch_acct_btree { __u32 id; }; +struct bch_acct_rebalance_work { +}; + struct disk_accounting_pos { union { struct { @@ -149,10 +148,10 @@ struct disk_accounting_pos { struct bch_persistent_reserved persistent_reserved; struct bch_replicas_entry_v1 replicas; struct bch_dev_data_type dev_data_type; - struct bch_dev_stripe_buckets dev_stripe_buckets; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; + struct bch_acct_rebalance_work rebalance_work; }; }; struct bpos _pad; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 10c96cb2047a..1223b710755d 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -24,7 +24,6 @@ static int bch2_memcmp(const void *l, const void *r, const void *priv) static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(e->data_type >= BCH_DATA_NR); BUG_ON(!e->nr_devs); BUG_ON(e->nr_required > 1 && e->nr_required >= e->nr_devs); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d1b2f2aa397a..d3a498617303 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -287,7 +287,11 @@ enum bch_fsck_flags { x(accounting_replicas_not_marked, 273, 0) \ x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ - x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) + x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ + x(accounting_key_junk_at_end, 277, 0) \ + x(accounting_key_replicas_nr_devs_0, 278, 0) \ + x(accounting_key_replicas_nr_required_bad, 279, 0) \ + x(accounting_key_replicas_devs_unsorted, 280, 0) \ enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, From 1a9e219db15e62760cfcc107ab6df3796d353605 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Aug 2024 23:44:00 -0400 Subject: [PATCH 0706/1052] bcachefs: improve bch2_dev_usage_to_text() Add a line for capacity Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/buckets.c | 12 ++++++++---- fs/bcachefs/buckets.h | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 02de5ad2be2c..8563c2d26847 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1740,7 +1740,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); - bch2_dev_usage_to_text(out, &stats); + bch2_dev_usage_to_text(out, ca, &stats); prt_newline(out); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2650a0d24663..9f7004e941ce 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -71,17 +71,21 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) +void bch2_dev_usage_to_text(struct printbuf *out, + struct bch_dev *ca, + struct bch_dev_usage *usage) { prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { bch2_prt_data_type(out, i); prt_printf(out, "\t%llu\r%llu\r%llu\r\n", - usage->d[i].buckets, - usage->d[i].sectors, - usage->d[i].fragmented); + usage->d[i].buckets, + usage->d[i].sectors, + usage->d[i].fragmented); } + + prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets); } static int bch2_check_fix_ptr(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2d35eeb24a2d..edbdffd508fc 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -212,7 +212,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) return ret; } -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *); +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { From d5240fa65db071909e9d1d5adcc5fd1abc8e96fe Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 9 Aug 2024 11:11:55 +0800 Subject: [PATCH 0707/1052] nvdimm/pmem: Set dax flag for all 'PFN_MAP' cases The dax is only supported on pfn type pmem devices since commit f467fee48da4 ("block: move the dax flag to queue_limits"). Trying to mount DAX filesystem fails with this error: mount: : wrong fs type, bad option, bad superblock on /dev/pmem7, missing codepage or helper program, or other error. dmesg(1) may have more information after failed mount system call. dmesg: EXT4-fs (pmem7): DAX unsupported by block device. Fix the problem by adding dax flag setting for the missed case. 
Fixes: f467fee48da4 ("block: move the dax flag to queue_limits") Signed-off-by: Zhihao Cheng Reviewed-by: Christoph Hellwig Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Tested-by: Ira Weiny Tested-by: Alison Schofield Link: https://patch.msgid.link/20240809031155.2837271-1-chengzhihao1@huawei.com Signed-off-by: Ira Weiny --- drivers/nvdimm/pmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 1ae8b2351654..210fb77f51ba 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -498,7 +498,7 @@ static int pmem_attach_disk(struct device *dev, } if (fua) lim.features |= BLK_FEAT_FUA; - if (is_nd_pfn(dev)) + if (is_nd_pfn(dev) || pmem_should_map_pages(dev)) lim.features |= BLK_FEAT_DAX; if (!devm_request_mem_region(dev, res->start, resource_size(res), From 869b5016e94eced02f2cf99bf53c69b49adcee32 Mon Sep 17 00:00:00 2001 From: Zehui Xu Date: Wed, 31 Jul 2024 16:43:46 +0300 Subject: [PATCH 0708/1052] kbuild: rust: skip -fmin-function-alignment in bindgen flags GCC 14 recently added -fmin-function-alignment option and the root Makefile uses it to replace -falign-functions when available. However, this flag can cause issues when passed to the Rust Makefile and affect the bindgen process. Bindgen relies on libclang to parse C code, and currently does not support the -fmin-function-alignment flag, leading to compilation failures when GCC 14 is used. This patch addresses the issue by adding -fmin-function-alignment to the bindgen_skip_c_flags in rust/Makefile. This prevents the flag from causing compilation issues. [ Matthew and Gary confirm function alignment should not change the ABI in a way that bindgen would care about, thus we did not need the extra logic for bindgen from v2. - Miguel ] Link: https://lore.kernel.org/linux-kbuild/20240222133500.16991-1-petr.pavlu@suse.com/ Signed-off-by: Zehui Xu Reviewed-by: Alice Ryhl Reviewed-by: Neal Gompa Reviewed-by: Gary Guo Link: https://lore.kernel.org/r/20240731134346.10630-1-zehuixu@whu.edu.cn [ Reworded title. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Makefile b/rust/Makefile index 6c0644b6090c..c41bae7ca8a3 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -227,7 +227,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ -fno-reorder-blocks -fno-allow-store-data-races -fasan-shadow-offset=% \ -fzero-call-used-regs=% -fno-stack-clash-protection \ -fno-inline-functions-called-once -fsanitize=bounds-strict \ - -fstrict-flex-arrays=% \ + -fstrict-flex-arrays=% -fmin-function-alignment=% \ --param=% --param asan-% # Derived from `scripts/Makefile.clang`. From 02dfd63afe65f7bacad543ba2b10f77083ae7929 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 6 Aug 2024 17:06:19 +0200 Subject: [PATCH 0709/1052] rust: add intrinsics to fix `-Os` builds Alice reported [1] that an arm64 build failed with: ld.lld: error: undefined symbol: __extendsfdf2 >>> referenced by core.a6f5fc5794e7b7b3-cgu.0 >>> rust/core.o:(::midpoint) in archive vmlinux.a >>> referenced by core.a6f5fc5794e7b7b3-cgu.0 >>> rust/core.o:(::midpoint) in archive vmlinux.a ld.lld: error: undefined symbol: __truncdfsf2 >>> referenced by core.a6f5fc5794e7b7b3-cgu.0 >>> rust/core.o:(::midpoint) in archive vmlinux.a Rust 1.80.0 or later together with `CONFIG_CC_OPTIMIZE_FOR_SIZE=y` is what triggers it. In addition, x86_64 builds also fail the same way. 
Similarly, compiling with Rust 1.82.0 (currently in nightly) makes another one appear, possibly due to the LLVM 19 upgrade there: ld.lld: error: undefined symbol: __eqdf2 >>> referenced by core.20495ea57a9f069d-cgu.0 >>> rust/core.o:(::next_up) in archive vmlinux.a >>> referenced by core.20495ea57a9f069d-cgu.0 >>> rust/core.o:(::next_down) in archive vmlinux.a Gary adds [1]: > Usually the fix on rustc side is to mark those functions as `#[inline]` > > All of {midpoint,next_up,next_down} are indeed unstable functions not > marked as inline... Fix all those by adding those intrinsics to our usual workaround. [ Trevor quickly submitted a fix to upstream Rust [2] that has already been merged, to be released in Rust 1.82.0 (2024-10-17). - Miguel ] Cc: Gary Guo Reported-by: Alice Ryhl Closes: https://rust-for-linux.zulipchat.com/#narrow/stream/x/topic/x/near/455637364 [1] Reviewed-by: Trevor Gross Tested-by: Alice Ryhl Tested-by: Boqun Feng Reviewed-by: Gary Guo Link: https://github.com/rust-lang/rust/pull/128749 [2] Link: https://lore.kernel.org/r/20240806150619.192882-1-ojeda@kernel.org [ Shortened Zulip link. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 4 ++-- rust/compiler_builtins.rs | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rust/Makefile b/rust/Makefile index c41bae7ca8a3..8de3ebba9551 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -354,8 +354,8 @@ rust-analyzer: $(if $(KBUILD_EXTMOD),$(extmod_prefix),$(objtree))/rust-project.json redirect-intrinsics = \ - __addsf3 __eqsf2 __gesf2 __lesf2 __ltsf2 __mulsf3 __nesf2 __unordsf2 \ - __adddf3 __ledf2 __ltdf2 __muldf3 __unorddf2 \ + __addsf3 __eqsf2 __extendsfdf2 __gesf2 __lesf2 __ltsf2 __mulsf3 __nesf2 __truncdfsf2 __unordsf2 \ + __adddf3 __eqdf2 __ledf2 __ltdf2 __muldf3 __unorddf2 \ __muloti4 __multi3 \ __udivmodti4 __udivti3 __umodti3 diff --git a/rust/compiler_builtins.rs b/rust/compiler_builtins.rs index bba2922c6ef7..f14b8d7caf89 100644 --- a/rust/compiler_builtins.rs +++ b/rust/compiler_builtins.rs @@ -40,16 +40,19 @@ pub extern "C" fn $ident() { define_panicking_intrinsics!("`f32` should not be used", { __addsf3, __eqsf2, + __extendsfdf2, __gesf2, __lesf2, __ltsf2, __mulsf3, __nesf2, + __truncdfsf2, __unordsf2, }); define_panicking_intrinsics!("`f64` should not be used", { __adddf3, + __eqdf2, __ledf2, __ltdf2, __muldf3, From d734422b7dd7d033fc02e421924e70dabf665b4c Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 7 Aug 2024 01:35:59 +0200 Subject: [PATCH 0710/1052] kbuild: rust-analyzer: mark `rust_is_available.sh` invocation as recursive When calling the `rust_is_available.sh` script, we need to make the jobserver available to it, as commit ecab4115c44c ("kbuild: mark `rustc` (and others) invocations as recursive") explains and did for the others. Otherwise, we get a warning from `rustc` when calling `make rust-analyzer` with parallel jobs, e.g. `-j8`. Using several jobs for that target does not really matter, but developers may call `make` with jobs enabled in all cases. Thus fix it. Fixes: 6dc9d9ca9a72 ("kbuild: rust-analyzer: better error handling") Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20240806233559.246705-1-ojeda@kernel.org [ Reworded to add a couple more details mentioned in the list. - Miguel ] Signed-off-by: Miguel Ojeda --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8ad55d6e7b60..8ed76eb9d438 100644 --- a/Makefile +++ b/Makefile @@ -1963,7 +1963,7 @@ tags TAGS cscope gtags: FORCE # Protocol). 
PHONY += rust-analyzer rust-analyzer: - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh + +$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh $(Q)$(MAKE) $(build)=rust $@ # Script to generate missing namespace dependencies From 0eba65f0310d3c7d5516c7fd4c172d0bfa8b285b Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 6 Aug 2024 16:45:58 +0200 Subject: [PATCH 0711/1052] rust: x86: remove `-3dnow{,a}` from target features LLVM 19 is dropping support for 3DNow! in commit f0eb5587ceeb ("Remove support for 3DNow!, both intrinsics and builtins. (#96246)"): Remove support for 3DNow!, both intrinsics and builtins. (#96246) This set of instructions was only supported by AMD chips starting in the K6-2 (introduced 1998), and before the "Bulldozer" family (2011). They were never much used, as they were effectively superseded by the more-widely-implemented SSE (first implemented on the AMD side in Athlon XP in 2001). This is being done as a predecessor towards general removal of MMX register usage. Since there is almost no usage of the 3DNow! intrinsics, and no modern hardware even implements them, simple removal seems like the best option. Thus we should avoid passing these to the backend, since otherwise we get a diagnostic about it: '-3dnow' is not a recognized feature for this target (ignoring feature) '-3dnowa' is not a recognized feature for this target (ignoring feature) We could try to disable them only up to LLVM 19 (not the C side one, but the one used by `rustc`, which may be built with a range of LLVMs). However, to avoid more complexity, we can likely just remove them altogether. According to Nikita [2]: > I don't think it's needed because LLVM should not generate 3dnow > instructions unless specifically asked to, using intrinsics that > Rust does not provide in the first place. Thus do so, like Rust did for one of their builtin targets [3]. For those curious: Clang will warn only about trying to enable them (`-m3dnow{,a}`), but not about disabling them (`-mno-3dnow{,a}`), so there is no change needed there. 
Cc: Nikita Popov Cc: Nathan Chancellor Cc: x86@kernel.org Link: https://github.com/llvm/llvm-project/commit/f0eb5587ceeb641445b64cb264c822b4751de04a [1] Link: https://github.com/rust-lang/rust/pull/127864#issuecomment-2235898760 [2] Link: https://github.com/rust-lang/rust/pull/127864 [3] Closes: https://github.com/Rust-for-Linux/linux/issues/1094 Tested-by: Benno Lossin Tested-by: Alice Ryhl Link: https://lore.kernel.org/r/20240806144558.114461-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- scripts/generate_rust_target.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs index 87f34925eb7b..404edf7587e0 100644 --- a/scripts/generate_rust_target.rs +++ b/scripts/generate_rust_target.rs @@ -162,7 +162,7 @@ fn main() { "data-layout", "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", ); - let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string(); + let mut features = "-mmx,+soft-float".to_string(); if cfg.has("MITIGATION_RETPOLINE") { features += ",+retpoline-external-thunk"; } @@ -179,7 +179,7 @@ fn main() { "data-layout", "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128", ); - let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string(); + let mut features = "-mmx,+soft-float".to_string(); if cfg.has("MITIGATION_RETPOLINE") { features += ",+retpoline-external-thunk"; } From 8c251c5ab1b7cd204231e4ee936bfe078a33f234 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 9 Aug 2024 08:27:49 +0000 Subject: [PATCH 0712/1052] cxl/pci: Get AER capability address from RCRB only for RCH dport cxl_setup_parent_dport() needs to get RCH dport AER capability address from RCRB to disable AER interrupt. The function does not check if dport is RCH dport, it will get a wrong pci_host_bridge structure by dport_dev in VH case because dport_dev points to a pci device(RP or switch DSP) rather than a pci host bridge device. 
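To make the hazard concrete, here is a small, hedged userspace sketch: to_pci_host_bridge() is essentially a container_of()-style rewind from a struct device to its presumed enclosing object, and in the VH case the device is embedded in a different structure, so the rewound pointer does not refer to a host bridge at all. The structures and macro below are simplified stand-ins, not the kernel definitions; the real fix is the dport->rch guard in the diff that follows.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct device { const char *name; };

struct pci_host_bridge { unsigned int native_aer; struct device dev; };
struct pci_dev         { unsigned int devfn;      struct device dev; };

int main(void)
{
        struct pci_host_bridge bridge = { .native_aer = 1 };
        struct pci_dev port = { .devfn = 0x08 };
        struct pci_host_bridge *ok, *bogus;

        /* Valid: the device really is embedded in a pci_host_bridge. */
        ok = container_of(&bridge.dev, struct pci_host_bridge, dev);
        printf("valid rewind: native_aer=%u\n", ok->native_aer);

        /* The same rewind on a device embedded in a pci_dev still yields a
         * pointer, but it does not point at a pci_host_bridge; reading
         * native_aer through it would read unrelated memory, which is the
         * situation the dport->rch check avoids.
         */
        bogus = container_of(&port.dev, struct pci_host_bridge, dev);
        printf("bogus rewind: %p (actually &port %p)\n",
               (void *)bogus, (void *)&port);
        return 0;
}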
Fixes: f05fd10d138d ("cxl/pci: Add RCH downstream port AER register discovery") Signed-off-by: Li Ming Reviewed-by: Dan Williams Reviewed-by: Ira Weiny Tested-by: Ira Weiny Tested-by: Alison Schofield Link: https://patch.msgid.link/20240809082750.3015641-2-ming4.li@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index a663e7566c48..51132a575b27 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -834,11 +834,13 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport) void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) { struct device *dport_dev = dport->dport_dev; - struct pci_host_bridge *host_bridge; - host_bridge = to_pci_host_bridge(dport_dev); - if (host_bridge->native_aer) - dport->rcrb.aer_cap = cxl_rcrb_to_aer(dport_dev, dport->rcrb.base); + if (dport->rch) { + struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev); + + if (host_bridge->native_aer) + dport->rcrb.aer_cap = cxl_rcrb_to_aer(dport_dev, dport->rcrb.base); + } dport->reg_map.host = host; cxl_dport_map_regs(dport); From 2c402bd2e85b44dc00ef85b5c0e217de684b5372 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 9 Aug 2024 08:27:50 +0000 Subject: [PATCH 0713/1052] cxl/test: Skip cxl_setup_parent_dport() for emulated dports The cxl_test unit test environment on qemu always hits the call trace below with KASAN enabled: BUG: KASAN: slab-out-of-bounds in cxl_setup_parent_dport+0x480/0x530 [cxl_core] Read of size 1 at addr ff110000676014f8 by task (udev-worker)/676[ 24.424403] CPU: 2 PID: 676 Comm: (udev-worker) Tainted: G O N 6.10.0-qemucxl #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20240214-2.el9 02/14/2024 Call Trace: dump_stack_lvl+0xea/0x150 print_report+0xce/0x610 ? kasan_complete_mode_report_info+0x40/0x200 kasan_report+0xcc/0x110 __asan_report_load1_noabort+0x18/0x20 cxl_setup_parent_dport+0x480/0x530 [cxl_core] cxl_mem_probe+0x49b/0xaa0 [cxl_mem] The cxl_test module models a CXL topology for testing; it creates some emulated dports with platform devices in the CXL topology, so the dport_dev of an emulated dport points to a platform device rather than a pci device or a pci host bridge in this case. Currently, cxl_setup_parent_dport() is used to set up RAS and AER capability on the dport connected to the CXL memory device, but cxl_test does not support RAS or AER functionality yet, so the fix is to implement a __wrap_cxl_setup_parent_dport() that filters out all emulated dports, guaranteeing that only real dports are handled by cxl_setup_parent_dport().
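For readers unfamiliar with the technique, here is a generic illustration of the ld --wrap mechanism that cxl_test uses via its ldflags-y entries; the function names and the two-file layout are invented for the example, and the real wrapper in the diff below simply calls the exported symbol for non-mock dports. With GNU ld, linking with --wrap=symbol redirects undefined references to symbol into __wrap_symbol, while __real_symbol resolves back to the original.

/* wrap_demo.c: generic sketch, not the cxl_test mock itself.
 * Build as two objects so the cross-object call can be intercepted:
 *   cc -c real.c           (real.c only contains:
 *                              int measure(int input) { return input * 2; })
 *   cc -c wrap_demo.c
 *   cc wrap_demo.o real.o -Wl,--wrap=measure
 */
#include <stdio.h>

int measure(int input);         /* real implementation, lives in real.c   */
int __real_measure(int input);  /* alias the linker provides under --wrap */

int __wrap_measure(int input)
{
        if (input < 0) {                /* filter what the harness can't handle */
                printf("mocked: skipping %d\n", input);
                return 0;
        }
        return __real_measure(input);   /* defer to the real implementation */
}

int main(void)
{
        printf("%d\n", measure(-1));    /* routed to __wrap_measure(): prints 0 */
        printf("%d\n", measure(21));    /* wrapped, then forwarded: prints 42   */
        return 0;
}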
Fixes: f05fd10d138d ("cxl/pci: Add RCH downstream port AER register discovery") Reported-by: Pengfei Xu Closes: https://lore.kernel.org/linux-cxl/ZrHTBp2O+HtUe6kt@xpf.sh.intel.com/T/#t Signed-off-by: Li Ming Reviewed-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Tested-by: Ira Weiny Tested-by: Alison Schofield Link: https://patch.msgid.link/20240809082750.3015641-3-ming4.li@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/Kbuild | 1 + tools/testing/cxl/test/mock.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 030b388800f0..3d1ca9e38b1f 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -14,6 +14,7 @@ ldflags-y += --wrap=cxl_dvsec_rr_decode ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat +ldflags-y += --wrap=cxl_setup_parent_dport DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 6f737941dc0e..d619672faa49 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -299,6 +299,18 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, CXL); +void __wrap_cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) +{ + int index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (!ops || !ops->is_mock_port(dport->dport_dev)) + cxl_setup_parent_dport(host, dport); + + put_cxl_mock_ops(index); +} +EXPORT_SYMBOL_NS_GPL(__wrap_cxl_setup_parent_dport, CXL); + MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(ACPI); MODULE_IMPORT_NS(CXL); From 8a2491db7bea6ad88ec568731eafd583501f1c96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Aug 2024 00:25:25 -0400 Subject: [PATCH 0714/1052] bcachefs: bcachefs_metadata_version_disk_accounting_v3 bcachefs_metadata_version_disk_accounting_v2 erroneously had padding bytes in disk_accounting_key, which is a problem because we have to guarantee that all unused bytes in disk_accounting_key are zeroed. Fortunately 6.11 isn't out yet, so it's cheap to fix this by spinning a new version. 
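As a hedged illustration of where such padding comes from (this is not the actual bcachefs layout), a 32-bit member that follows a single byte picks up three alignment bytes unless the structure is packed, and nothing guarantees those bytes are zero when the key is built:

/* Illustrative only; stand-in structures, not disk_accounting_pos. */
#include <stdint.h>
#include <stdio.h>

struct acct_key_padded {
        uint8_t  type;
        uint32_t id;            /* compiler inserts 3 padding bytes before this */
};

struct acct_key_packed {
        uint8_t  type;
        uint32_t id;
} __attribute__((packed));

int main(void)
{
        printf("padded: %zu bytes\n", sizeof(struct acct_key_padded)); /* typically 8 */
        printf("packed: %zu bytes\n", sizeof(struct acct_key_packed)); /* 5 */
        return 0;
}

This is why the new version marks the nested structures __packed, as the diff below shows.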
Reported-by: Gabriel de Perthuis Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/disk_accounting_format.h | 8 ++++---- fs/bcachefs/sb-downgrade.c | 27 ++++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ad893684db52..b25f86356728 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -675,7 +675,8 @@ struct bch_sb_field_ext { x(btree_subvolume_children, BCH_VERSION(1, 6)) \ x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ - x(disk_accounting_v2, BCH_VERSION(1, 9)) + x(disk_accounting_v2, BCH_VERSION(1, 9)) \ + x(disk_accounting_v3, BCH_VERSION(1, 10)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 848f06cc809d..a93cf26ff4a9 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -130,11 +130,11 @@ struct bch_acct_compression { struct bch_acct_snapshot { __u32 id; -}; +} __packed; struct bch_acct_btree { __u32 id; -}; +} __packed; struct bch_acct_rebalance_work { }; @@ -152,8 +152,8 @@ struct disk_accounting_pos { struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; - }; - }; + } __packed; + } __packed; struct bpos _pad; }; }; diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index dfbbd33c8731..6c4469f53313 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -61,12 +61,37 @@ BCH_FSCK_ERR_dev_usage_buckets_wrong, \ BCH_FSCK_ERR_dev_usage_sectors_wrong, \ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(disk_accounting_v3, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_bkey_version_in_future, \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ + BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ 0) \ x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_fs_usage_hidden_wrong, \ + BCH_FSCK_ERR_fs_usage_btree_wrong, \ + BCH_FSCK_ERR_fs_usage_data_wrong, \ + BCH_FSCK_ERR_fs_usage_cached_wrong, \ + BCH_FSCK_ERR_fs_usage_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ + BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_bkey_version_in_future) \ + x(disk_accounting_v3, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_dev_usage_buckets_wrong, \ BCH_FSCK_ERR_dev_usage_sectors_wrong, \ From 4bbe6002931954bbe82b25f25990b987b0392e18 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 18 Jul 2024 16:38:07 -0300 Subject: [PATCH 0715/1052] perf daemon: Fix the build on 32-bit architectures Noticed with: 1 6.22 debian:experimental-x-mipsel : FAIL gcc version 13.2.0 (Debian 13.2.0-25) builtin-daemon.c: In function 'cmd_session_list': builtin-daemon.c:691:35: error: format '%lu' 
expects argument of type 'long unsigned int', but argument 4 has type 'time_t' {aka 'long long int'} [-Werror=format=] Use inttypes.h's PRIu64 to deal with that. Signed-off-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/ZplvH21aQ8pzmza_@x1 Signed-off-by: Namhyung Kim --- tools/perf/builtin-daemon.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index de76bbc50bfb..5c9335fff2d3 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -688,7 +689,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) /* lock */ csv_sep, daemon->base, "lock"); - fprintf(out, "%c%lu", + fprintf(out, "%c%" PRIu64, /* session up time */ csv_sep, (curr - daemon->start) / 60); @@ -700,7 +701,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) daemon->base, SESSION_OUTPUT); fprintf(out, " lock: %s/lock\n", daemon->base); - fprintf(out, " up: %lu minutes\n", + fprintf(out, " up: %" PRIu64 " minutes\n", (curr - daemon->start) / 60); } } @@ -727,7 +728,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) /* session ack */ csv_sep, session->base, SESSION_ACK); - fprintf(out, "%c%lu", + fprintf(out, "%c%" PRIu64, /* session up time */ csv_sep, (curr - session->start) / 60); @@ -745,7 +746,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) session->base, SESSION_CONTROL); fprintf(out, " ack: %s/%s\n", session->base, SESSION_ACK); - fprintf(out, " up: %lu minutes\n", + fprintf(out, " up: %" PRIu64 " minutes\n", (curr - session->start) / 60); } } From 3eb3cd5992f7a0c37edc8d05b4c38c98758d8671 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Aug 2024 12:51:23 -0700 Subject: [PATCH 0716/1052] binfmt_flat: Fix corruption when not offsetting data start Commit 04d82a6d0881 ("binfmt_flat: allow not offsetting data start") introduced a RISC-V specific variant of the FLAT format which does not allocate any space for the (obsolete) array of shared library pointers. However, it did not disable the code which initializes the array, resulting in the corruption of sizeof(long) bytes before the DATA segment, generally the end of the TEXT segment. Introduce MAX_SHARED_LIBS_UPDATE which depends on the state of CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET to guard the initialization of the shared library pointer region so that it will only be initialized if space is reserved for it. 
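A compact sketch of the invariant the fix restores may help; the macro names follow the patch below, while the MAX_SHARED_LIBS value is illustrative only. The words reserved in front of the data segment and the entries the loader later initializes must come from the same config switch, otherwise the update loop writes into space that was never reserved.

/* Sketch only: pairing the reservation and the update count. */
#include <stdio.h>

#define MAX_SHARED_LIBS                 (1)     /* illustrative value          */
/* #define CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET */

#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
#define DATA_START_OFFSET_WORDS         (0)     /* nothing reserved ...        */
#define MAX_SHARED_LIBS_UPDATE          (0)     /* ... so nothing gets written */
#else
#define DATA_START_OFFSET_WORDS         (MAX_SHARED_LIBS)
#define MAX_SHARED_LIBS_UPDATE          (MAX_SHARED_LIBS)
#endif

int main(void)
{
        printf("reserved=%d written=%d\n",
               DATA_START_OFFSET_WORDS, MAX_SHARED_LIBS_UPDATE);
        return 0;
}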
Fixes: 04d82a6d0881 ("binfmt_flat: allow not offsetting data start") Co-developed-by: Stefan O'Rear Signed-off-by: Stefan O'Rear Reviewed-by: Damien Le Moal Acked-by: Greg Ungerer Link: https://lore.kernel.org/r/20240807195119.it.782-kees@kernel.org Signed-off-by: Kees Cook --- fs/binfmt_flat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index c26545d71d39..cd6d5bbb4b9d 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -72,8 +72,10 @@ #ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET #define DATA_START_OFFSET_WORDS (0) +#define MAX_SHARED_LIBS_UPDATE (0) #else #define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#define MAX_SHARED_LIBS_UPDATE (MAX_SHARED_LIBS) #endif struct lib_info { @@ -880,7 +882,7 @@ static int load_flat_binary(struct linux_binprm *bprm) return res; /* Update data segment pointers for all libraries */ - for (i = 0; i < MAX_SHARED_LIBS; i++) { + for (i = 0; i < MAX_SHARED_LIBS_UPDATE; i++) { if (!libinfo.lib_list[i].loaded) continue; for (j = 0; j < MAX_SHARED_LIBS; j++) { From 3a3be7ff9224f424e485287b54be00d2c6bd9c40 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Aug 2024 13:24:55 +0000 Subject: [PATCH 0717/1052] gtp: pull network headers in gtp_dev_xmit() syzbot/KMSAN reported use of uninit-value in get_dev_xmit() [1] We must make sure the IPv4 or Ipv6 header is pulled in skb->head before accessing fields in them. Use pskb_inet_may_pull() to fix this issue. [1] BUG: KMSAN: uninit-value in ipv6_pdp_find drivers/net/gtp.c:220 [inline] BUG: KMSAN: uninit-value in gtp_build_skb_ip6 drivers/net/gtp.c:1229 [inline] BUG: KMSAN: uninit-value in gtp_dev_xmit+0x1424/0x2540 drivers/net/gtp.c:1281 ipv6_pdp_find drivers/net/gtp.c:220 [inline] gtp_build_skb_ip6 drivers/net/gtp.c:1229 [inline] gtp_dev_xmit+0x1424/0x2540 drivers/net/gtp.c:1281 __netdev_start_xmit include/linux/netdevice.h:4913 [inline] netdev_start_xmit include/linux/netdevice.h:4922 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x247/0xa20 net/core/dev.c:3596 __dev_queue_xmit+0x358c/0x5610 net/core/dev.c:4423 dev_queue_xmit include/linux/netdevice.h:3105 [inline] packet_xmit+0x9c/0x6c0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3145 [inline] packet_sendmsg+0x90e3/0xa3a0 net/packet/af_packet.c:3177 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2204 __do_sys_sendto net/socket.c:2216 [inline] __se_sys_sendto net/socket.c:2212 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2212 x64_sys_call+0x3799/0x3c10 arch/x86/include/generated/asm/syscalls_64.h:45 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: slab_post_alloc_hook mm/slub.c:3994 [inline] slab_alloc_node mm/slub.c:4037 [inline] kmem_cache_alloc_node_noprof+0x6bf/0xb80 mm/slub.c:4080 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:583 __alloc_skb+0x363/0x7b0 net/core/skbuff.c:674 alloc_skb include/linux/skbuff.h:1320 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6526 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2815 packet_alloc_skb net/packet/af_packet.c:2994 [inline] packet_snd net/packet/af_packet.c:3088 [inline] packet_sendmsg+0x749c/0xa3a0 net/packet/af_packet.c:3177 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2204 __do_sys_sendto net/socket.c:2216 [inline] 
__se_sys_sendto net/socket.c:2212 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2212 x64_sys_call+0x3799/0x3c10 arch/x86/include/generated/asm/syscalls_64.h:45 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f CPU: 0 UID: 0 PID: 7115 Comm: syz.1.515 Not tainted 6.11.0-rc1-syzkaller-00043-g94ede2a3e913 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/27/2024 Fixes: 999cb275c807 ("gtp: add IPv6 support") Fixes: 459aa660eb1d ("gtp: add initial driver for datapath of GPRS Tunneling Protocol (GTP-U)") Signed-off-by: Eric Dumazet Cc: Harald Welte Reviewed-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20240808132455.3413916-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 427b91aca50d..0696faf60013 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1269,6 +1269,9 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) if (skb_cow_head(skb, dev->needed_headroom)) goto tx_err; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + skb_reset_inner_headers(skb); /* PDP context lookups in gtp_build_skb_*() need rcu read-side lock. */ From 2b2bc3bab158b7e036508742b16cd8a3c2f59a12 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 8 Aug 2024 11:56:21 +0200 Subject: [PATCH 0718/1052] net: Make USO depend on CSUM offload UDP segmentation offload inherently depends on checksum offload. It should not be possible to disable checksum offload while leaving USO enabled. Enforce this dependency in code. There is a single tx-udp-segmentation feature flag to indicate support for both IPv4/6, hence the devices wishing to support USO must offer checksum offload for both IP versions. 
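The dependency can be restated as a small standalone check; the flag values below are stand-ins rather than the kernel's NETIF_F_* bit positions, and the helper mirrors the one added in the diff that follows:

/* Userspace restatement of the rule: USO stays only if the device can
 * checksum both IP versions (or offers protocol-agnostic HW_CSUM).
 */
#include <stdbool.h>
#include <stdio.h>

#define F_IP_CSUM       (1u << 0)       /* IPv4 checksum offload      */
#define F_IPV6_CSUM     (1u << 1)       /* IPv6 checksum offload      */
#define F_HW_CSUM       (1u << 2)       /* protocol-agnostic checksum */
#define F_GSO_UDP_L4    (1u << 3)       /* USO (tx-udp-segmentation)  */

static unsigned int fix_features(unsigned int features)
{
        unsigned int ip_mask = F_IP_CSUM | F_IPV6_CSUM;
        bool can_csum = (features & ip_mask) == ip_mask ||
                        (features & F_HW_CSUM);

        if ((features & F_GSO_UDP_L4) && !can_csum) {
                printf("dropping USO: no usable csum offload\n");
                features &= ~F_GSO_UDP_L4;
        }
        return features;
}

int main(void)
{
        /* USO requested but only IPv4 csum available: USO is dropped. */
        printf("0x%x\n", fix_features(F_GSO_UDP_L4 | F_IP_CSUM));
        return 0;
}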
Fixes: 10154dbded6d ("udp: Allow GSO transmit from devices with no checksum offload") Suggested-by: Willem de Bruijn Signed-off-by: Jakub Sitnicki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20240808-udp-gso-egress-from-tunnel-v4-1-f5c5b4149ab9@cloudflare.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 751d9b70e6ad..f66e61407883 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9912,6 +9912,15 @@ static void netdev_sync_lower_features(struct net_device *upper, } } +static bool netdev_has_ip_or_hw_csum(netdev_features_t features) +{ + netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; + bool hw_csum = features & NETIF_F_HW_CSUM; + + return ip_csum || hw_csum; +} + static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { @@ -9993,15 +10002,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_LRO; } - if (features & NETIF_F_HW_TLS_TX) { - bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == - (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - bool hw_csum = features & NETIF_F_HW_CSUM; - - if (!ip_csum && !hw_csum) { - netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); - features &= ~NETIF_F_HW_TLS_TX; - } + if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { @@ -10009,6 +10012,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_HW_TLS_RX; } + if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); + features &= ~NETIF_F_GSO_UDP_L4; + } + return features; } From 30b03f2a0592eee1267298298eac9dd655f55ab2 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 8 Aug 2024 11:56:22 +0200 Subject: [PATCH 0719/1052] udp: Fall back to software USO if IPv6 extension headers are present In commit 10154dbded6d ("udp: Allow GSO transmit from devices with no checksum offload") we have intentionally allowed UDP GSO packets marked CHECKSUM_NONE to pass to the GSO stack, so that they can be segmented and checksummed by a software fallback when the egress device lacks these features. What was not taken into consideration is that a CHECKSUM_NONE skb can be handed over to the GSO stack also when the egress device advertises the tx-udp-segmentation / NETIF_F_GSO_UDP_L4 feature. This will happen when there are IPv6 extension headers present, which we check for in __ip6_append_data(). Syzbot has discovered this scenario, producing a warning as below: ip6tnl0: caps=(0x00000006401d7869, 0x00000006401d7869) WARNING: CPU: 0 PID: 5112 at net/core/dev.c:3293 skb_warn_bad_offload+0x166/0x1a0 net/core/dev.c:3291 Modules linked in: CPU: 0 PID: 5112 Comm: syz-executor391 Not tainted 6.10.0-rc7-syzkaller-01603-g80ab5445da62 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/07/2024 RIP: 0010:skb_warn_bad_offload+0x166/0x1a0 net/core/dev.c:3291 [...] 
Call Trace: __skb_gso_segment+0x3be/0x4c0 net/core/gso.c:127 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x585/0x1120 net/core/dev.c:3661 __dev_queue_xmit+0x17a4/0x3e90 net/core/dev.c:4415 neigh_output include/net/neighbour.h:542 [inline] ip6_finish_output2+0xffa/0x1680 net/ipv6/ip6_output.c:137 ip6_finish_output+0x41e/0x810 net/ipv6/ip6_output.c:222 ip6_send_skb+0x112/0x230 net/ipv6/ip6_output.c:1958 udp_v6_send_skb+0xbf5/0x1870 net/ipv6/udp.c:1292 udpv6_sendmsg+0x23b3/0x3270 net/ipv6/udp.c:1588 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xef/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2585 ___sys_sendmsg net/socket.c:2639 [inline] __sys_sendmmsg+0x3b2/0x740 net/socket.c:2725 __do_sys_sendmmsg net/socket.c:2754 [inline] __se_sys_sendmmsg net/socket.c:2751 [inline] __x64_sys_sendmmsg+0xa0/0xb0 net/socket.c:2751 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f [...] We are hitting the bad offload warning because when an egress device is capable of handling segmentation offload requested by skb_shinfo(skb)->gso_type, the chain of gso_segment callbacks won't produce any segment skbs and return NULL. See the skb_gso_ok() branch in {__udp,tcp,sctp}_gso_segment helpers. To fix it, force a fallback to software USO when processing a packet with IPv6 extension headers, since we don't know if these can be checksummed by all devices which offer USO. Fixes: 10154dbded6d ("udp: Allow GSO transmit from devices with no checksum offload") Reported-by: syzbot+e15b7e15b8a751a91d9a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000e1609a061d5330ce@google.com/ Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Sitnicki Link: https://patch.msgid.link/20240808-udp-gso-egress-from-tunnel-v4-2-f5c5b4149ab9@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index bc8a9da750fe..b254a5dadfcf 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -282,6 +282,12 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, skb_transport_header(gso_skb))) return ERR_PTR(-EINVAL); + /* We don't know if egress device can segment and checksum the packet + * when IPv6 extension headers are present. Fall back to software GSO. + */ + if (gso_skb->ip_summed != CHECKSUM_PARTIAL) + features &= ~(NETIF_F_GSO_UDP_L4 | NETIF_F_CSUM_MASK); + if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh), From 1d2c46c1bc5680335f20f64089c161fdfcd3e8ab Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 8 Aug 2024 11:56:23 +0200 Subject: [PATCH 0720/1052] selftests/net: Add coverage for UDP GSO with IPv6 extension headers After enabling UDP GSO for devices not offering checksum offload, we have hit a regression where a bad offload warning can be triggered when sending a datagram with IPv6 extension headers. Extend the UDP GSO IPv6 tests to cover this scenario.
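For context, a minimal repro-style sketch of the triggering combination looks roughly like the following. This is not the kernel selftest itself (that diff follows below); the destination, port and sizes are placeholders, UDP_SEGMENT is the socket option value from linux/udp.h, and attaching hop-by-hop options typically needs CAP_NET_RAW.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103                 /* value from linux/udp.h */
#endif

int main(void)
{
        /* All-zero 8-byte buffer: a hop-by-hop header made of Pad1 options,
         * mirroring the selftest's pad1 buffer. Requires CAP_NET_RAW.
         */
        static const char hopopts_pad1[8];
        char payload[3000];                     /* larger than one segment   */
        struct sockaddr_in6 dst = {
                .sin6_family = AF_INET6,
                .sin6_port = htons(9),          /* placeholder destination   */
                .sin6_addr = IN6ADDR_LOOPBACK_INIT,
        };
        int gso_size = 1400;
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);

        if (fd < 0 ||
            setsockopt(fd, IPPROTO_IPV6, IPV6_HOPOPTS,
                       hopopts_pad1, sizeof(hopopts_pad1)) ||
            setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT,
                       &gso_size, sizeof(gso_size))) {
                perror("setup");
                return 1;
        }

        memset(payload, 'x', sizeof(payload));
        if (sendto(fd, payload, sizeof(payload), 0,
                   (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("sendto");

        close(fd);
        return 0;
}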
Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Sitnicki Link: https://patch.msgid.link/20240808-udp-gso-egress-from-tunnel-v4-3-f5c5b4149ab9@cloudflare.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/udpgso.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 3e74cfa1a2bf..3f2fca02fec5 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -67,6 +67,7 @@ struct testcase { int gso_len; /* mss after applying gso */ int r_num_mss; /* recv(): number of calls of full mss */ int r_len_last; /* recv(): size of last non-mss dgram, if any */ + bool v6_ext_hdr; /* send() dgrams with IPv6 extension headers */ }; const struct in6_addr addr6 = { @@ -77,6 +78,8 @@ const struct in_addr addr4 = { __constant_htonl(0x0a000001), /* 10.0.0.1 */ }; +static const char ipv6_hopopts_pad1[8] = { 0 }; + struct testcase testcases_v4[] = { { /* no GSO: send a single byte */ @@ -255,6 +258,13 @@ struct testcase testcases_v6[] = { .gso_len = 1, .r_num_mss = 2, }, + { + /* send 2 1B segments with extension headers */ + .tlen = 2, + .gso_len = 1, + .r_num_mss = 2, + .v6_ext_hdr = true, + }, { /* send 2B + 2B + 1B segments */ .tlen = 5, @@ -396,11 +406,18 @@ static void run_one(struct testcase *test, int fdt, int fdr, int i, ret, val, mss; bool sent; - fprintf(stderr, "ipv%d tx:%d gso:%d %s\n", + fprintf(stderr, "ipv%d tx:%d gso:%d %s%s\n", addr->sa_family == AF_INET ? 4 : 6, test->tlen, test->gso_len, + test->v6_ext_hdr ? "ext-hdr " : "", test->tfail ? "(fail)" : ""); + if (test->v6_ext_hdr) { + if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS, + ipv6_hopopts_pad1, sizeof(ipv6_hopopts_pad1))) + error(1, errno, "setsockopt ipv6 hopopts"); + } + val = test->gso_len; if (cfg_do_setsockopt) { if (setsockopt(fdt, SOL_UDP, UDP_SEGMENT, &val, sizeof(val))) @@ -412,6 +429,12 @@ static void run_one(struct testcase *test, int fdt, int fdr, error(1, 0, "send succeeded while expecting failure"); if (!sent && !test->tfail) error(1, 0, "send failed while expecting success"); + + if (test->v6_ext_hdr) { + if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS, NULL, 0)) + error(1, errno, "setsockopt ipv6 hopopts clear"); + } + if (!sent) return; From c31fe2b5095d8c84562ce90db07600f7e9f318df Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 8 Aug 2024 17:41:02 +0300 Subject: [PATCH 0721/1052] net/mlx5: SD, Do not query MPIR register if no sd_group Unconditionally calling the MPIR query on BF separate mode yields the FW syndrome below [1]. Do not call it unless admin clearly specified the SD group, i.e. expressing the intention of using the multi-PF netdev feature. This fix covers cases not covered in commit fca3b4791850 ("net/mlx5: Do not query MPIR on embedded CPU function"). 
[1] mlx5_cmd_out_err:808:(pid 8267): ACCESS_REG(0x805) op_mod(0x1) failed, status bad system state(0x4), syndrome (0x685f19), err(-5) Fixes: 678eb448055a ("net/mlx5: SD, Implement basic query and instantiation") Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Link: https://patch.msgid.link/20240808144107.2095424-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/lib/sd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index f6deb5a3f820..eeb0b7ea05f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -126,7 +126,7 @@ static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses) } static int mlx5_query_sd(struct mlx5_core_dev *dev, bool *sdm, - u8 *host_buses, u8 *sd_group) + u8 *host_buses) { u32 out[MLX5_ST_SZ_DW(mpir_reg)]; int err; @@ -135,10 +135,6 @@ static int mlx5_query_sd(struct mlx5_core_dev *dev, bool *sdm, if (err) return err; - err = mlx5_query_nic_vport_sd_group(dev, sd_group); - if (err) - return err; - *sdm = MLX5_GET(mpir_reg, out, sdm); *host_buses = MLX5_GET(mpir_reg, out, host_buses); @@ -166,19 +162,23 @@ static int sd_init(struct mlx5_core_dev *dev) if (mlx5_core_is_ecpf(dev)) return 0; + err = mlx5_query_nic_vport_sd_group(dev, &sd_group); + if (err) + return err; + + if (!sd_group) + return 0; + if (!MLX5_CAP_MCAM_REG(dev, mpir)) return 0; - err = mlx5_query_sd(dev, &sdm, &host_buses, &sd_group); + err = mlx5_query_sd(dev, &sdm, &host_buses); if (err) return err; if (!sdm) return 0; - if (!sd_group) - return 0; - group_id = mlx5_sd_group_id(dev, sd_group); if (!mlx5_sd_is_supported(dev, host_buses)) { From ab6013a59b4d0947fda409c29426dc904959e632 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 8 Aug 2024 17:41:03 +0300 Subject: [PATCH 0722/1052] net/mlx5e: SHAMPO, Increase timeout to improve latency During latency tests (netperf TCP_RR) a 30% degradation of HW GRO vs SW GRO was observed. This is due to SHAMPO triggering timeout filler CQEs instead of delivering the CQE for the packet. Having a short timeout for SHAMPO doesn't bring any benefits as it is the driver that does the merging, not the hardware. On the contrary, it can have a negative impact: additional filler CQEs are generated due to the timeout. As there is no way to disable this timeout, this change sets it to the maximum value. Instead of using the packet_merge.timeout parameter which is also used for LRO, set the value directly when filling in the rest of the SHAMPO parameters in mlx5e_build_rq_param(). 
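The selection logic being reused is easy to state in isolation; the sketch below mirrors mlx5e_choose_lro_timeout() with hypothetical supported periods, showing why asking for 1024 effectively picks the largest period the device advertises (the loop falls through to the last, largest entry when nothing smaller qualifies).

/* Standalone sketch; the period values are hypothetical, not read from HW. */
#include <stdio.h>

static unsigned int choose_timeout(const unsigned int *supported, int n,
                                   unsigned int wanted)
{
        int i;

        /* supported[] is assumed to be in ascending order */
        for (i = 0; i < n - 1; i++)
                if (supported[i] >= wanted)
                        break;
        return supported[i];
}

int main(void)
{
        unsigned int periods[4] = { 8, 16, 32, 1024 };  /* illustrative */

        printf("%u\n", choose_timeout(periods, 4, 1024));       /* -> 1024 */
        return 0;
}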
Fixes: 99be56171fa9 ("net/mlx5e: SHAMPO, Re-enable HW-GRO") Signed-off-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240808144107.2095424-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en/params.c | 16 +++++++++++++++- .../net/ethernet/mellanox/mlx5/core/en/params.h | 1 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 12 ------------ 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 5fd82c67b6ab..bb5da42edc23 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -130,7 +130,7 @@ struct page_pool; #define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW 0x2 #define MLX5E_DEFAULT_LRO_TIMEOUT 32 -#define MLX5E_LRO_TIMEOUT_ARR_SIZE 4 +#define MLX5E_DEFAULT_SHAMPO_TIMEOUT 1024 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC 0x10 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 6c9ccccca81e..64b62ed17b07 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -928,7 +928,7 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, MLX5_SET(wq, wq, log_headers_entry_size, mlx5e_shampo_get_log_hd_entry_size(mdev, params)); MLX5_SET(rqc, rqc, reservation_timeout, - params->packet_merge.timeout); + mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_SHAMPO_TIMEOUT)); MLX5_SET(rqc, rqc, shampo_match_criteria_type, params->packet_merge.shampo.match_criteria_type); MLX5_SET(rqc, rqc, shampo_no_match_alignment_granularity, @@ -1087,6 +1087,20 @@ static u32 mlx5e_shampo_icosq_sz(struct mlx5_core_dev *mdev, return wqebbs; } +#define MLX5E_LRO_TIMEOUT_ARR_SIZE 4 + +u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout) +{ + int i; + + /* The supported periods are organized in ascending order */ + for (i = 0; i < MLX5E_LRO_TIMEOUT_ARR_SIZE - 1; i++) + if (MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]) >= wanted_timeout) + break; + + return MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]); +} + static u32 mlx5e_mpwrq_total_umr_wqebbs(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h index 749b2ec0436e..3f8986f9d862 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -108,6 +108,7 @@ u32 mlx5e_shampo_hd_per_wqe(struct mlx5_core_dev *mdev, u32 mlx5e_shampo_hd_per_wq(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_rq_param *rq_param); +u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout); u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 6f686fabed44..f04decca39f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5167,18 +5167,6 @@ const struct net_device_ops mlx5e_netdev_ops = { #endif }; -static u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 
wanted_timeout) -{ - int i; - - /* The supported periods are organized in ascending order */ - for (i = 0; i < MLX5E_LRO_TIMEOUT_ARR_SIZE - 1; i++) - if (MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]) >= wanted_timeout) - break; - - return MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]); -} - void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu) { struct mlx5e_params *params = &priv->channels.params; From d5240fa65db071909e9d1d5adcc5fd1abc8e96fe Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 8 Aug 2024 17:41:03 +0300 Subject: [PATCH 0723/1052] net/mlx5e: Take state lock during tx timeout reporter mlx5e_safe_reopen_channels() requires the state lock to be taken. The referenced change in the Fixes tag removed the lock to fix another issue. This patch adds it back but at a later point (when calling mlx5e_safe_reopen_channels()) to avoid the deadlock referenced in the Fixes tag. Fixes: eab0da38912e ("net/mlx5e: Fix possible deadlock on mlx5e_tx_timeout_work") Signed-off-by: Dragos Tatulea Link: https://lore.kernel.org/all/ZplpKq8FKi3vwfxv@gmail.com/T/ Reviewed-by: Breno Leitao Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240808144107.2095424-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 22918b2ef7f1..09433b91be17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -146,7 +146,9 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx) return err; } + mutex_lock(&priv->state_lock); err = mlx5e_safe_reopen_channels(priv); + mutex_unlock(&priv->state_lock); if (!err) { to_ctx->status = 1; /* all channels recovered */ return err; From cbc796be1779c4dbc9a482c7233995e2a8b6bfb3 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 8 Aug 2024 17:41:05 +0300 Subject: [PATCH 0724/1052] net/mlx5e: Correctly report errors for ethtool rx flows Previously, an ethtool rx flow with no attrs would not be added to the NIC as it has no rules to configure the hw with, but it would be reported as successful to the caller (return code 0). This is confusing for the user as ethtool then reports "Added rule $num", but no rule was actually added. This change corrects that by instead reporting these wrong rules as -EINVAL. Fixes: b29c61dac3a2 ("net/mlx5e: Ethtool steering flow validation refactoring") Signed-off-by: Cosmin Ratiu Reviewed-by: Saeed Mahameed Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240808144107.2095424-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index 3eccdadc0357..773624bb2c5d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -734,7 +734,7 @@ mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv, if (num_tuples <= 0) { netdev_warn(priv->netdev, "%s: flow is not valid %d\n", __func__, num_tuples); - return num_tuples; + return num_tuples < 0 ? 
num_tuples : -EINVAL; } eth_ft = get_flow_table(priv, fs, num_tuples); From 0b4a4534d083e055831b3bc29c5eafc918ed4d86 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 8 Aug 2024 17:41:06 +0300 Subject: [PATCH 0725/1052] net/mlx5e: Fix queue stats access to non-existing channels splat The queue stats API queries the queues according to the real_num_[tr]x_queues, in case the device is down and channels were not yet created, don't try to query their statistics. To trigger the panic, run this command before the interface is brought up: ./cli.py --spec ../../../Documentation/netlink/specs/netdev.yaml --dump qstats-get --json '{"ifindex": 4}' BUG: kernel NULL pointer dereference, address: 0000000000000c00 PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 3 UID: 0 PID: 977 Comm: python3 Not tainted 6.10.0+ #40 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx5e_get_queue_stats_rx+0x3c/0xb0 [mlx5_core] Code: fc 55 48 63 ee 53 48 89 d3 e8 40 3d 70 e1 85 c0 74 58 4c 89 ef e8 d4 07 04 00 84 c0 75 41 49 8b 84 24 f8 39 00 00 48 8b 04 e8 <48> 8b 90 00 0c 00 00 48 03 90 40 0a 00 00 48 89 53 08 48 8b 90 08 RSP: 0018:ffff888116be37d0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888116be3868 RCX: 0000000000000004 RDX: ffff88810ada4000 RSI: 0000000000000000 RDI: ffff888109df09c0 RBP: 0000000000000000 R08: 0000000000000004 R09: 0000000000000004 R10: ffff88813461901c R11: ffffffffffffffff R12: ffff888109df0000 R13: ffff888109df09c0 R14: ffff888116be38d0 R15: 0000000000000000 FS: 00007f4375d5c740(0000) GS:ffff88852c980000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000c00 CR3: 0000000106ada006 CR4: 0000000000370eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __die+0x1f/0x60 ? page_fault_oops+0x14e/0x3d0 ? exc_page_fault+0x73/0x130 ? asm_exc_page_fault+0x22/0x30 ? mlx5e_get_queue_stats_rx+0x3c/0xb0 [mlx5_core] netdev_nl_stats_by_netdev+0x2a6/0x4c0 ? __rmqueue_pcplist+0x351/0x6f0 netdev_nl_qstats_get_dumpit+0xc4/0x1b0 genl_dumpit+0x2d/0x80 netlink_dump+0x199/0x410 __netlink_dump_start+0x1aa/0x2c0 genl_family_rcv_msg_dumpit+0x94/0xf0 ? __pfx_genl_start+0x10/0x10 ? __pfx_genl_dumpit+0x10/0x10 ? __pfx_genl_done+0x10/0x10 genl_rcv_msg+0x116/0x2b0 ? __pfx_netdev_nl_qstats_get_dumpit+0x10/0x10 ? __pfx_genl_rcv_msg+0x10/0x10 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x21a/0x340 netlink_sendmsg+0x1f4/0x440 __sys_sendto+0x1b6/0x1c0 ? do_sock_setsockopt+0xc3/0x180 ? 
__sys_setsockopt+0x60/0xb0 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x50/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f43757132b0 Code: c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 1d 45 31 c9 45 31 c0 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 68 c3 0f 1f 80 00 00 00 00 41 54 48 83 ec 20 RSP: 002b:00007ffd258da048 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007ffd258da0f8 RCX: 00007f43757132b0 RDX: 000000000000001c RSI: 00007f437464b850 RDI: 0000000000000003 RBP: 00007f4375085de0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: ffffffffc4653600 R14: 0000000000000001 R15: 00007f43751a6147 Modules linked in: netconsole xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core zram zsmalloc mlx5_core fuse [last unloaded: netconsole] CR2: 0000000000000c00 ---[ end trace 0000000000000000 ]--- RIP: 0010:mlx5e_get_queue_stats_rx+0x3c/0xb0 [mlx5_core] Code: fc 55 48 63 ee 53 48 89 d3 e8 40 3d 70 e1 85 c0 74 58 4c 89 ef e8 d4 07 04 00 84 c0 75 41 49 8b 84 24 f8 39 00 00 48 8b 04 e8 <48> 8b 90 00 0c 00 00 48 03 90 40 0a 00 00 48 89 53 08 48 8b 90 08 RSP: 0018:ffff888116be37d0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888116be3868 RCX: 0000000000000004 RDX: ffff88810ada4000 RSI: 0000000000000000 RDI: ffff888109df09c0 RBP: 0000000000000000 R08: 0000000000000004 R09: 0000000000000004 R10: ffff88813461901c R11: ffffffffffffffff R12: ffff888109df0000 R13: ffff888109df09c0 R14: ffff888116be38d0 R15: 0000000000000000 FS: 00007f4375d5c740(0000) GS:ffff88852c980000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000c00 CR3: 0000000106ada006 CR4: 0000000000370eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 7b66ae536a78 ("net/mlx5e: Add per queue netdev-genl stats") Signed-off-by: Gal Pressman Signed-off-by: Tariq Toukan Reviewed-by: Joe Damato Link: https://patch.msgid.link/20240808144107.2095424-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f04decca39f2..5df904639b0c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5296,7 +5296,7 @@ static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i, struct mlx5e_rq_stats *rq_stats; ASSERT_RTNL(); - if (mlx5e_is_uplink_rep(priv)) + if (mlx5e_is_uplink_rep(priv) || !priv->stats_nch) return; channel_stats = priv->channel_stats[i]; @@ -5316,6 +5316,9 @@ static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i, struct mlx5e_sq_stats *sq_stats; ASSERT_RTNL(); + if (!priv->stats_nch) + return; + /* no special case needed for ptp htb etc since txq2sq_stats is kept up * to date for active sq_stats, otherwise get_base_stats takes care of * inactive sqs. 
From d73f0f49daa84176c3beee1606e73c7ffb6af8b2 Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Fri, 9 Aug 2024 12:32:24 +0530 Subject: [PATCH 0726/1052] irqchip/xilinx: Fix shift out of bounds The device tree property 'xlnx,kind-of-intr' is sanity-checked to ensure that the bitmask contains set bits only within the range of the number of interrupts supported by the controller. The check is done by shifting the mask right by the number of supported interrupts and checking the result for zero. The data type of the mask is u32 and the number of supported interrupts is up to 32. In the case of 32 interrupts, the shift is out of bounds, resulting in a mismatch warning. The out of bounds condition is also reported by UBSAN: UBSAN: shift-out-of-bounds in irq-xilinx-intc.c:332:22 shift exponent 32 is too large for 32-bit type 'unsigned int' Fix it by promoting the mask to u64 for the test. Fixes: d50466c90724 ("microblaze: intc: Refactor DT sanity check") Signed-off-by: Radhey Shyam Pandey Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/1723186944-3571957-1-git-send-email-radhey.shyam.pandey@amd.com --- drivers/irqchip/irq-xilinx-intc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-xilinx-intc.c b/drivers/irqchip/irq-xilinx-intc.c index 238d3d344949..7e08714d507f 100644 --- a/drivers/irqchip/irq-xilinx-intc.c +++ b/drivers/irqchip/irq-xilinx-intc.c @@ -189,7 +189,7 @@ static int __init xilinx_intc_of_init(struct device_node *intc, irqc->intr_mask = 0; } - if (irqc->intr_mask >> irqc->nr_irq) + if ((u64)irqc->intr_mask >> irqc->nr_irq) pr_warn("irq-xilinx: mismatch in kind-of-intr param\n"); pr_info("irq-xilinx: %pOF: num_irq=%d, edge=0x%x\n", From 03f9885c60adf73488fe32aab628ee3d4a39598e Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 9 Aug 2024 15:10:47 +0800 Subject: [PATCH 0727/1052] irqchip/riscv-aplic: Retrigger MSI interrupt on source configuration Section 4.5.2 of the RISC-V AIA specification says that "any write to a sourcecfg register of an APLIC might (or might not) cause the corresponding interrupt-pending bit to be set to one if the rectified input value is high (= 1) under the new source mode." When the interrupt type is changed in the sourcecfg register, the APLIC device might not set the corresponding pending bit, so the interrupt might never become pending. To handle sourcecfg register changes for level-triggered interrupts in MSI mode, manually set the pending bit so the interrupt is retriggered if it was already asserted.
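For illustration of the retrigger step described above, a minimal, hedged sketch follows. It is not the driver code added below: the register offset macro and the helper name are assumptions made for the example; only the behaviour quoted from the AIA specification is taken from the text above.

#include <linux/io.h>
#include <linux/irq.h>

/* Assumed offset of the APLIC setipnum_le register; check the AIA spec. */
#define EXAMPLE_APLIC_SETIPNUM_LE	0x2000

/*
 * After a sourcecfg update, a level-triggered source that is still asserted
 * may need its pending bit set again by hand, since the APLIC is allowed to
 * leave it clear. Edge-triggered sources are left alone.
 */
static void example_aplic_retrigger_level(void __iomem *regs, u32 source,
					  unsigned int trigger_type)
{
	if (trigger_type != IRQ_TYPE_LEVEL_LOW &&
	    trigger_type != IRQ_TYPE_LEVEL_HIGH)
		return;

	/* Re-assert the pending bit so an already-asserted input is not lost. */
	writel(source, regs + EXAMPLE_APLIC_SETIPNUM_LE);
}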
Fixes: ca8df97fe679 ("irqchip/riscv-aplic: Add support for MSI-mode") Signed-off-by: Yong-Xuan Wang Signed-off-by: Thomas Gleixner Reviewed-by: Vincent Chen Reviewed-by: Anup Patel Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240809071049.2454-1-yongxuan.wang@sifive.com --- drivers/irqchip/irq-riscv-aplic-msi.c | 32 +++++++++++++++++++++------ 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/irqchip/irq-riscv-aplic-msi.c b/drivers/irqchip/irq-riscv-aplic-msi.c index 028444af48bd..d7773f76e5d0 100644 --- a/drivers/irqchip/irq-riscv-aplic-msi.c +++ b/drivers/irqchip/irq-riscv-aplic-msi.c @@ -32,15 +32,10 @@ static void aplic_msi_irq_unmask(struct irq_data *d) aplic_irq_unmask(d); } -static void aplic_msi_irq_eoi(struct irq_data *d) +static void aplic_msi_irq_retrigger_level(struct irq_data *d) { struct aplic_priv *priv = irq_data_get_irq_chip_data(d); - /* - * EOI handling is required only for level-triggered interrupts - * when APLIC is in MSI mode. - */ - switch (irqd_get_trigger_type(d)) { case IRQ_TYPE_LEVEL_LOW: case IRQ_TYPE_LEVEL_HIGH: @@ -59,6 +54,29 @@ static void aplic_msi_irq_eoi(struct irq_data *d) } } +static void aplic_msi_irq_eoi(struct irq_data *d) +{ + /* + * EOI handling is required only for level-triggered interrupts + * when APLIC is in MSI mode. + */ + aplic_msi_irq_retrigger_level(d); +} + +static int aplic_msi_irq_set_type(struct irq_data *d, unsigned int type) +{ + int rc = aplic_irq_set_type(d, type); + + if (rc) + return rc; + /* + * Updating sourcecfg register for level-triggered interrupts + * requires interrupt retriggering when APLIC is in MSI mode. + */ + aplic_msi_irq_retrigger_level(d); + return 0; +} + static void aplic_msi_write_msg(struct irq_data *d, struct msi_msg *msg) { unsigned int group_index, hart_index, guest_index, val; @@ -130,7 +148,7 @@ static const struct msi_domain_template aplic_msi_template = { .name = "APLIC-MSI", .irq_mask = aplic_msi_irq_mask, .irq_unmask = aplic_msi_irq_unmask, - .irq_set_type = aplic_irq_set_type, + .irq_set_type = aplic_msi_irq_set_type, .irq_eoi = aplic_msi_irq_eoi, #ifdef CONFIG_SMP .irq_set_affinity = irq_chip_set_affinity_parent, From ccbfcac05866ebe6eb3bc6d07b51d4ed4fcde436 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sat, 10 Aug 2024 10:48:32 +0200 Subject: [PATCH 0728/1052] ALSA: timer: Relax start tick time check for slave timer elements The recent addition of a sanity check for a too low start tick time seems breaking some applications that uses aloop with a certain slave timer setup. They may have the initial resolution 0, hence it's treated as if it were a too low value. Relax and skip the check for the slave timer instance for addressing the regression. 
Fixes: 4a63bd179fa8 ("ALSA: timer: Set lower bound of start tick time") Cc: Link: https://github.com/raspberrypi/linux/issues/6294 Link: https://patch.msgid.link/20240810084833.10939-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index d104adc75a8b..71a07c1662f5 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -547,7 +547,7 @@ static int snd_timer_start1(struct snd_timer_instance *timeri, /* check the actual time for the start tick; * bail out as error if it's way too low (< 100us) */ - if (start) { + if (start && !(timer->hw.flags & SNDRV_TIMER_HW_SLAVE)) { if ((u64)snd_timer_hw_resolution(timer) * ticks < 100000) return -EINVAL; } From 2ad4e1ada8eebafa2d75a4b75eeeca882de6ada1 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Sat, 3 Aug 2024 21:52:55 +0200 Subject: [PATCH 0729/1052] wifi: brcmfmac: cfg80211: Handle SSID based pmksa deletion wpa_supplicant 2.11 sends since 1efdba5fdc2c ("Handle PMKSA flush in the driver for SAE/OWE offload cases") SSID based PMKSA del commands. brcmfmac is not prepared and tries to dereference the NULL bssid and pmkid pointers in cfg80211_pmksa. PMKID_V3 operations support SSID based updates so copy the SSID. Fixes: a96202acaea4 ("wifi: brcmfmac: cfg80211: Add support for PMKID_V3 operations") Cc: stable@vger.kernel.org # 6.4.x Signed-off-by: Janne Grunau Reviewed-by: Neal Gompa Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240803-brcmfmac_pmksa_del_ssid-v1-1-4e85f19135e1@jannau.net --- .../wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 1585a5653ee4..d4cc5fa92341 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -4320,9 +4320,16 @@ brcmf_pmksa_v3_op(struct brcmf_if *ifp, struct cfg80211_pmksa *pmksa, /* Single PMK operation */ pmk_op->count = cpu_to_le16(1); length += sizeof(struct brcmf_pmksa_v3); - memcpy(pmk_op->pmk[0].bssid, pmksa->bssid, ETH_ALEN); - memcpy(pmk_op->pmk[0].pmkid, pmksa->pmkid, WLAN_PMKID_LEN); - pmk_op->pmk[0].pmkid_len = WLAN_PMKID_LEN; + if (pmksa->bssid) + memcpy(pmk_op->pmk[0].bssid, pmksa->bssid, ETH_ALEN); + if (pmksa->pmkid) { + memcpy(pmk_op->pmk[0].pmkid, pmksa->pmkid, WLAN_PMKID_LEN); + pmk_op->pmk[0].pmkid_len = WLAN_PMKID_LEN; + } + if (pmksa->ssid && pmksa->ssid_len) { + memcpy(pmk_op->pmk[0].ssid.SSID, pmksa->ssid, pmksa->ssid_len); + pmk_op->pmk[0].ssid.SSID_len = pmksa->ssid_len; + } pmk_op->pmk[0].time_left = cpu_to_le32(alive ? BRCMF_PMKSA_NO_EXPIRY : 0); } From aad41832326723627ad8ac9ee8a543b6dca4454d Mon Sep 17 00:00:00 2001 From: Asmaa Mnebhi Date: Tue, 11 Jun 2024 13:15:09 -0400 Subject: [PATCH 0730/1052] gpio: mlxbf3: Support shutdown() function During Linux graceful reboot, the GPIO interrupts are not disabled. Since the drivers are not removed during graceful reboot, the logic to call mlxbf3_gpio_irq_disable() is not triggered. Interrupts that remain enabled can cause issues on subsequent boots. For example, the mlxbf-gige driver contains PHY logic to bring up the link. If the gpio-mlxbf3 driver loads first, the mlxbf-gige driver will use a GPIO interrupt to bring up the link. Otherwise, it will use polling. 
The next time Linux boots and loads the drivers in this order, we encounter the issue: - mlxbf-gige loads first and uses polling while the GPIO10 interrupt is still enabled from the previous boot. So if the interrupt triggers, there is nothing to clear it. - gpio-mlxbf3 loads. - i2c-mlxbf loads. The interrupt doesn't trigger for I2C because it is shared with the GPIO interrupt line which was not cleared. The solution is to add a shutdown function to the GPIO driver to clear and disable all interrupts. Also clear the interrupt after disabling it in mlxbf3_gpio_irq_disable(). Fixes: 38a700efc510 ("gpio: mlxbf3: Add gpio driver support") Signed-off-by: Asmaa Mnebhi Reviewed-by: David Thompson Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240611171509.22151-1-asmaa@nvidia.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf3.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c index d5906d419b0a..10ea71273c89 100644 --- a/drivers/gpio/gpio-mlxbf3.c +++ b/drivers/gpio/gpio-mlxbf3.c @@ -39,6 +39,8 @@ #define MLXBF_GPIO_CAUSE_OR_EVTEN0 0x14 #define MLXBF_GPIO_CAUSE_OR_CLRCAUSE 0x18 +#define MLXBF_GPIO_CLR_ALL_INTS GENMASK(31, 0) + struct mlxbf3_gpio_context { struct gpio_chip gc; @@ -82,6 +84,8 @@ static void mlxbf3_gpio_irq_disable(struct irq_data *irqd) val = readl(gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); val &= ~BIT(offset); writel(val, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); + + writel(BIT(offset), gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); gpiochip_disable_irq(gc, offset); @@ -253,6 +257,15 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) return 0; } +static void mlxbf3_gpio_shutdown(struct platform_device *pdev) +{ + struct mlxbf3_gpio_context *gs = platform_get_drvdata(pdev); + + /* Disable and clear all interrupts */ + writel(0, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); + writel(MLXBF_GPIO_CLR_ALL_INTS, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); +} + static const struct acpi_device_id mlxbf3_gpio_acpi_match[] = { { "MLNXBF33", 0 }, {} @@ -265,6 +278,7 @@ static struct platform_driver mlxbf3_gpio_driver = { .acpi_match_table = mlxbf3_gpio_acpi_match, }, .probe = mlxbf3_gpio_probe, + .shutdown = mlxbf3_gpio_shutdown, }; module_platform_driver(mlxbf3_gpio_driver); From 9a039eeb71a42c8b13408a1976e300f3898e1be0 Mon Sep 17 00:00:00 2001 From: Moon Yeounsu Date: Wed, 7 Aug 2024 19:07:21 +0900 Subject: [PATCH 0731/1052] net: ethernet: use ip_hdrlen() instead of bit shift `ip_hdr(skb)->ihl << 2` is the same as `ip_hdrlen(skb)` Therefore, we should use a well-defined function not a bit shift to find the header length. It also compresses two lines to a single line. Signed-off-by: Moon Yeounsu Reviewed-by: Christophe JAILLET Signed-off-by: David S. 
Miller --- drivers/net/ethernet/jme.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index b06e24562973..d8be0e4dcb07 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -946,15 +946,13 @@ jme_udpsum(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IP)) return csum; skb_set_network_header(skb, ETH_HLEN); - if ((ip_hdr(skb)->protocol != IPPROTO_UDP) || - (skb->len < (ETH_HLEN + - (ip_hdr(skb)->ihl << 2) + - sizeof(struct udphdr)))) { + + if (ip_hdr(skb)->protocol != IPPROTO_UDP || + skb->len < (ETH_HLEN + ip_hdrlen(skb) + sizeof(struct udphdr))) { skb_reset_network_header(skb); return csum; } - skb_set_transport_header(skb, - ETH_HLEN + (ip_hdr(skb)->ihl << 2)); + skb_set_transport_header(skb, ETH_HLEN + ip_hdrlen(skb)); csum = udp_hdr(skb)->check; skb_reset_transport_header(skb); skb_reset_network_header(skb); From ef9718b3d54e822de294351251f3a574f8a082ce Mon Sep 17 00:00:00 2001 From: Parsa Poorshikhian Date: Sat, 10 Aug 2024 18:39:06 +0330 Subject: [PATCH 0732/1052] ALSA: hda/realtek: Fix noise from speakers on Lenovo IdeaPad 3 15IAU7 Fix noise from speakers connected to AUX port when no sound is playing. The problem occurs because the `alc_shutup_pins` function includes a 0x10ec0257 vendor ID, which causes noise on Lenovo IdeaPad 3 15IAU7 with Realtek ALC257 codec when no sound is playing. Removing this vendor ID from the function fixes the bug. Fixes: 70794b9563fe ("ALSA: hda/realtek: Add more codec ID to no shutup pins list") Signed-off-by: Parsa Poorshikhian Link: https://patch.msgid.link/20240810150939.330693-1-parsa.poorsh@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 480e82df7a4c..6e19598e23b7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -583,7 +583,6 @@ static void alc_shutup_pins(struct hda_codec *codec) switch (codec->core.vendor_id) { case 0x10ec0236: case 0x10ec0256: - case 0x10ec0257: case 0x19e58326: case 0x10ec0283: case 0x10ec0285: From 7c626ce4bae1ac14f60076d00eafe71af30450ba Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 11 Aug 2024 14:27:14 -0700 Subject: [PATCH 0733/1052] Linux 6.11-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 44c02a6f60a1..0a364e34f50b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* From 03c5c350e38d9346b69357d0e52c3c40495c14a0 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 2 Aug 2024 16:22:15 +0100 Subject: [PATCH 0734/1052] ALSA: hda/realtek: Add support for new HP G12 laptops Some of these laptop models have quirk IDs that are identical but have different amplifier parts fitted, this difference is described in the ACPI information. The solution introduced for this product family can derive the required component binding information from ACPI instead of hardcoding it, supports the new variants of the CS35L56 being used and has generalized naming that makes it applicable to other ALC+amp combinations. 
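The fixup added in the diff below is fairly long, so as a reading aid here is a condensed sketch of the autodetection flow it implements. Apart from the example_ function name, every call and string is taken from the diff that follows (no separate API is assumed); error handling, the SPI-count fallback details and the cirrus,dev-index refinement are omitted, and the match string is simplified to a single amp type.

/* Condensed sketch of the ACPI-based amp autodetection; see the full fixup below. */
static void example_cs35lxx_autodetect(struct hda_codec *cdc, int action)
{
	static const char * const hids[] = { "CSC3554", "CSC3556", "CSC3557" };
	struct acpi_device *adev = NULL;
	const char *bus;
	int i, count;

	/* 1. Find which Cirrus amp HID is present in the ACPI tables. */
	for (i = 0; i < ARRAY_SIZE(hids); i++) {
		adev = acpi_dev_get_first_match_dev(hids[i], NULL, -1);
		if (adev)
			break;
	}
	if (!adev)
		return;

	/* 2. Count the amps and work out whether they sit on I2C or SPI. */
	count = i2c_acpi_client_count(adev);
	if (count > 0) {
		bus = "i2c";
	} else {
		bus = "spi";
		count = acpi_spi_count_resources(adev);
	}
	acpi_dev_put(adev);

	/* 3. Hand the derived bus, HID and amp count to the generic component
	 *    binder (match string simplified to the CS35L56 case here).
	 */
	comp_generic_fixup(cdc, action, bus, hids[i], "-%s:00-cs35l56-hda.%d", count);
}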
Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20240802152215.20831-4-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 99 +++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 480e82df7a4c..24eb71efac6c 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11,15 +11,18 @@ */ #include +#include #include #include #include #include #include #include +#include #include #include #include +#include #include #include #include @@ -6856,6 +6859,86 @@ static void comp_generic_fixup(struct hda_codec *cdc, int action, const char *bu } } +static void cs35lxx_autodet_fixup(struct hda_codec *cdc, + const struct hda_fixup *fix, + int action) +{ + struct device *dev = hda_codec_dev(cdc); + struct acpi_device *adev; + struct fwnode_handle *fwnode __free(fwnode_handle) = NULL; + const char *bus = NULL; + static const struct { + const char *hid; + const char *name; + } acpi_ids[] = {{ "CSC3554", "cs35l54-hda" }, + { "CSC3556", "cs35l56-hda" }, + { "CSC3557", "cs35l57-hda" }}; + char *match; + int i, count = 0, count_devindex = 0; + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + for (i = 0; i < ARRAY_SIZE(acpi_ids); ++i) { + adev = acpi_dev_get_first_match_dev(acpi_ids[i].hid, NULL, -1); + if (adev) + break; + } + if (!adev) { + dev_err(dev, "Failed to find ACPI entry for a Cirrus Amp\n"); + return; + } + + count = i2c_acpi_client_count(adev); + if (count > 0) { + bus = "i2c"; + } else { + count = acpi_spi_count_resources(adev); + if (count > 0) + bus = "spi"; + } + + fwnode = fwnode_handle_get(acpi_fwnode_handle(adev)); + acpi_dev_put(adev); + + if (!bus) { + dev_err(dev, "Did not find any buses for %s\n", acpi_ids[i].hid); + return; + } + + if (!fwnode) { + dev_err(dev, "Could not get fwnode for %s\n", acpi_ids[i].hid); + return; + } + + /* + * When available the cirrus,dev-index property is an accurate + * count of the amps in a system and is used in preference to + * the count of bus devices that can contain additional address + * alias entries. + */ + count_devindex = fwnode_property_count_u32(fwnode, "cirrus,dev-index"); + if (count_devindex > 0) + count = count_devindex; + + match = devm_kasprintf(dev, GFP_KERNEL, "-%%s:00-%s.%%d", acpi_ids[i].name); + if (!match) + return; + dev_info(dev, "Found %d %s on %s (%s)\n", count, acpi_ids[i].hid, bus, match); + comp_generic_fixup(cdc, action, bus, acpi_ids[i].hid, match, count); + + break; + case HDA_FIXUP_ACT_FREE: + /* + * Pass the action on to comp_generic_fixup() so that + * hda_component_manager functions can be called in just once + * place. In this context the bus, hid, match_str or count + * values do not need to be calculated. 
+ */ + comp_generic_fixup(cdc, action, NULL, NULL, NULL, 0); + break; + } +} + static void cs35l41_fixup_i2c_two(struct hda_codec *cdc, const struct hda_fixup *fix, int action) { comp_generic_fixup(cdc, action, "i2c", "CSC3551", "-%s:00-cs35l41-hda.%d", 2); @@ -7528,6 +7611,7 @@ enum { ALC256_FIXUP_CHROME_BOOK, ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7, ALC287_FIXUP_LENOVO_SSID_17AA3820, + ALCXXX_FIXUP_CS35LXX, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9857,6 +9941,10 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc287_fixup_lenovo_ssid_17aa3820, }, + [ALCXXX_FIXUP_CS35LXX] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35lxx_autodet_fixup, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -10271,6 +10359,17 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8cdf, "HP SnowWhite", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ce0, "HP SnowWhite", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8d01, "HP ZBook Power 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d08, "HP EliteBook 1045 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d85, "HP EliteBook 1040 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d86, "HP Elite x360 1040 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d8c, "HP EliteBook 830 13 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d8d, "HP Elite x360 830 13 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d8e, "HP EliteBook 840 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d8f, "HP EliteBook 840 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d90, "HP EliteBook 860 16 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d91, "HP ZBook Firefly 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d92, "HP ZBook Firefly 16 G12", ALCXXX_FIXUP_CS35LXX), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), From 004eb8ba776ccd3e296ea6f78f7ae7985b12824e Mon Sep 17 00:00:00 2001 From: Lianqin Hu Date: Sun, 11 Aug 2024 08:30:11 +0000 Subject: [PATCH 0735/1052] ALSA: usb-audio: Add delay quirk for VIVO USB-C-XE710 HEADSET Audio control requests that sets sampling frequency sometimes fail on this card. Adding delay between control messages eliminates that problem. 
Signed-off-by: Lianqin Hu Cc: Link: https://patch.msgid.link/TYUPR06MB6217FF67076AF3E49E12C877D2842@TYUPR06MB6217.apcprd06.prod.outlook.com Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index ea063a14cdd8..e7b68c67852e 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2221,6 +2221,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x2b53, 0x0031, /* Fiero SC-01 (firmware v1.1.0) */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), + DEVICE_FLG(0x2d95, 0x8021, /* VIVO USB-C-XE710 HEADSET */ + QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x30be, 0x0101, /* Schiit Hel */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x413c, 0xa506, /* Dell AE515 sound bar */ From b86aa4140f6a8f01f35bfb05af60e01a55b48803 Mon Sep 17 00:00:00 2001 From: Bouke Sybren Haarsma Date: Sun, 28 Jul 2024 14:47:30 +0200 Subject: [PATCH 0736/1052] drm: panel-orientation-quirks: Add quirk for Ayn Loki Zero Add quirk orientation for the Ayn Loki Zero. This also has been tested/used by the JELOS team. Signed-off-by: Bouke Sybren Haarsma Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20240728124731.168452-2-boukehaarsma23@gmail.com --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index c16c7678237e..a1dfeaae644d 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -208,6 +208,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_MATCH(DMI_BOARD_NAME, "KUN"), }, .driver_data = (void *)&lcd1600x2560_rightside_up, + }, { /* AYN Loki Zero */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ayn"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Loki Zero"), + }, + .driver_data = (void *)&lcd1080x1920_leftside_up, }, { /* Chuwi HiBook (CWI514) */ .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Hampoo"), From 2c71c8459c8ca66bd8f597effaac892ee8448a9f Mon Sep 17 00:00:00 2001 From: Bouke Sybren Haarsma Date: Sun, 28 Jul 2024 14:47:31 +0200 Subject: [PATCH 0737/1052] drm: panel-orientation-quirks: Add quirk for Ayn Loki Max Add quirk orientation for Ayn Loki Max model. This has been tested by JELOS team that uses their own patched kernel for a while now and confirmed by users in the ChimeraOS discord servers. 
Signed-off-by: Bouke Sybren Haarsma Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20240728124731.168452-3-boukehaarsma23@gmail.com --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index a1dfeaae644d..0830cae9a4d0 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -208,6 +208,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_MATCH(DMI_BOARD_NAME, "KUN"), }, .driver_data = (void *)&lcd1600x2560_rightside_up, + }, { /* AYN Loki Max */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ayn"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Loki Max"), + }, + .driver_data = (void *)&lcd1080x1920_leftside_up, }, { /* AYN Loki Zero */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ayn"), From a9a18e8f770c9b0703dab93580d0b02e199a4c79 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 9 Aug 2024 15:28:19 +0300 Subject: [PATCH 0738/1052] atm: idt77252: prevent use after free in dequeue_rx() We can't dereference "skb" after calling vcc->push() because the skb is released. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/atm/idt77252.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index e7f713cd70d3..a876024d8a05 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -1118,8 +1118,8 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) rpp->len += skb->len; if (stat & SAR_RSQE_EPDU) { + unsigned int len, truesize; unsigned char *l1l2; - unsigned int len; l1l2 = (unsigned char *) ((unsigned long) skb->data + skb->len - 6); @@ -1189,14 +1189,15 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) ATM_SKB(skb)->vcc = vcc; __net_timestamp(skb); + truesize = skb->truesize; vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); - if (skb->truesize > SAR_FB_SIZE_3) + if (truesize > SAR_FB_SIZE_3) add_rx_skb(card, 3, SAR_FB_SIZE_3, 1); - else if (skb->truesize > SAR_FB_SIZE_2) + else if (truesize > SAR_FB_SIZE_2) add_rx_skb(card, 2, SAR_FB_SIZE_2, 1); - else if (skb->truesize > SAR_FB_SIZE_1) + else if (truesize > SAR_FB_SIZE_1) add_rx_skb(card, 1, SAR_FB_SIZE_1, 1); else add_rx_skb(card, 0, SAR_FB_SIZE_0, 1); From 9ff2f816e2aa65ca9a1cdf0954842f8173c0f48d Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Fri, 9 Aug 2024 11:56:09 +0530 Subject: [PATCH 0739/1052] net: axienet: Fix register defines comment description In axiethernet header fix register defines comment description to be inline with IP documentation. It updates MAC configuration register, MDIO configuration register and frame filter control description. Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Radhey Shyam Pandey Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/ethernet/xilinx/xilinx_axienet.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index fa5500decc96..c7d9221fafdc 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -160,16 +160,16 @@ #define XAE_RCW1_OFFSET 0x00000404 /* Rx Configuration Word 1 */ #define XAE_TC_OFFSET 0x00000408 /* Tx Configuration */ #define XAE_FCC_OFFSET 0x0000040C /* Flow Control Configuration */ -#define XAE_EMMC_OFFSET 0x00000410 /* EMAC mode configuration */ -#define XAE_PHYC_OFFSET 0x00000414 /* RGMII/SGMII configuration */ +#define XAE_EMMC_OFFSET 0x00000410 /* MAC speed configuration */ +#define XAE_PHYC_OFFSET 0x00000414 /* RX Max Frame Configuration */ #define XAE_ID_OFFSET 0x000004F8 /* Identification register */ -#define XAE_MDIO_MC_OFFSET 0x00000500 /* MII Management Config */ -#define XAE_MDIO_MCR_OFFSET 0x00000504 /* MII Management Control */ -#define XAE_MDIO_MWD_OFFSET 0x00000508 /* MII Management Write Data */ -#define XAE_MDIO_MRD_OFFSET 0x0000050C /* MII Management Read Data */ +#define XAE_MDIO_MC_OFFSET 0x00000500 /* MDIO Setup */ +#define XAE_MDIO_MCR_OFFSET 0x00000504 /* MDIO Control */ +#define XAE_MDIO_MWD_OFFSET 0x00000508 /* MDIO Write Data */ +#define XAE_MDIO_MRD_OFFSET 0x0000050C /* MDIO Read Data */ #define XAE_UAW0_OFFSET 0x00000700 /* Unicast address word 0 */ #define XAE_UAW1_OFFSET 0x00000704 /* Unicast address word 1 */ -#define XAE_FMI_OFFSET 0x00000708 /* Filter Mask Index */ +#define XAE_FMI_OFFSET 0x00000708 /* Frame Filter Control */ #define XAE_AF0_OFFSET 0x00000710 /* Address Filter 0 */ #define XAE_AF1_OFFSET 0x00000714 /* Address Filter 1 */ @@ -308,7 +308,7 @@ */ #define XAE_UAW1_UNICASTADDR_MASK 0x0000FFFF -/* Bit masks for Axi Ethernet FMI register */ +/* Bit masks for Axi Ethernet FMC register */ #define XAE_FMI_PM_MASK 0x80000000 /* Promis. mode enable */ #define XAE_FMI_IND_MASK 0x00000003 /* Index Mask */ From 63796bc2e97cd5ebcef60bad4953259d4ad11cb4 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:02 +0200 Subject: [PATCH 0740/1052] net: dsa: vsc73xx: fix port MAC configuration in full duplex mode According to the datasheet description ("Port Mode Procedure" in 5.6.2), the VSC73XX_MAC_CFG_WEXC_DIS bit is configured only for half duplex mode. The WEXC_DIS bit is responsible for MAC behavior after an excessive collision. Let's set it as described in the datasheet. Fixes: 05bd97fc559d ("net: dsa: Add Vitesse VSC73xx DSA router driver") Reviewed-by: Linus Walleij Reviewed-by: Florian Fainelli Signed-off-by: Pawel Dembicki Signed-off-by: David S. Miller --- drivers/net/dsa/vitesse-vsc73xx-core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index d9d3e30fd47a..f548ed4cb23f 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -957,6 +957,11 @@ static void vsc73xx_mac_link_up(struct phylink_config *config, if (duplex == DUPLEX_FULL) val |= VSC73XX_MAC_CFG_FDX; + else + /* In datasheet description ("Port Mode Procedure" in 5.6.2) + * this bit is configured only for half duplex. 
+ */ + val |= VSC73XX_MAC_CFG_WEXC_DIS; /* This routine is described in the datasheet (below ARBDISC register * description) @@ -967,7 +972,6 @@ static void vsc73xx_mac_link_up(struct phylink_config *config, get_random_bytes(&seed, 1); val |= seed << VSC73XX_MAC_CFG_SEED_OFFSET; val |= VSC73XX_MAC_CFG_SEED_LOAD; - val |= VSC73XX_MAC_CFG_WEXC_DIS; /* Those bits are responsible for MTU only. Kernel takes care about MTU, * let's enable +8 bytes frame length unconditionally. From 5b9eebc2c7a5f0cc7950d918c1e8a4ad4bed5010 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:03 +0200 Subject: [PATCH 0741/1052] net: dsa: vsc73xx: pass value in phy_write operation In the 'vsc73xx_phy_write' function, the register value is missing, and the phy write operation always sends zeros. This commit passes the value variable into the proper register. Fixes: 05bd97fc559d ("net: dsa: Add Vitesse VSC73xx DSA router driver") Reviewed-by: Linus Walleij Reviewed-by: Florian Fainelli Signed-off-by: Pawel Dembicki Signed-off-by: David S. Miller --- drivers/net/dsa/vitesse-vsc73xx-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index f548ed4cb23f..4b300c293dec 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -574,7 +574,7 @@ static int vsc73xx_phy_write(struct dsa_switch *ds, int phy, int regnum, return 0; } - cmd = (phy << 21) | (regnum << 16); + cmd = (phy << 21) | (regnum << 16) | val; ret = vsc73xx_write(vsc, VSC73XX_BLOCK_MII, 0, 1, cmd); if (ret) return ret; From fa63c6434b6f6aaf9d8d599dc899bc0a074cc0ad Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:04 +0200 Subject: [PATCH 0742/1052] net: dsa: vsc73xx: check busy flag in MDIO operations The VSC73xx has a busy flag used during MDIO operations. It is raised when MDIO read/write operations are in progress. Without it, PHYs are misconfigured and bus operations do not work as expected. Fixes: 05bd97fc559d ("net: dsa: Add Vitesse VSC73xx DSA router driver") Reviewed-by: Linus Walleij Reviewed-by: Florian Fainelli Signed-off-by: Pawel Dembicki Signed-off-by: David S. 
Miller --- drivers/net/dsa/vitesse-vsc73xx-core.c | 37 +++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index 4b300c293dec..a789b2da9b7d 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -40,6 +40,10 @@ #define VSC73XX_BLOCK_ARBITER 0x5 /* Only subblock 0 */ #define VSC73XX_BLOCK_SYSTEM 0x7 /* Only subblock 0 */ +/* MII Block subblock */ +#define VSC73XX_BLOCK_MII_INTERNAL 0x0 /* Internal MDIO subblock */ +#define VSC73XX_BLOCK_MII_EXTERNAL 0x1 /* External MDIO subblock */ + #define CPU_PORT 6 /* CPU port */ /* MAC Block registers */ @@ -225,6 +229,8 @@ #define VSC73XX_MII_CMD 0x1 #define VSC73XX_MII_DATA 0x2 +#define VSC73XX_MII_STAT_BUSY BIT(3) + /* Arbiter block 5 registers */ #define VSC73XX_ARBEMPTY 0x0c #define VSC73XX_ARBDISC 0x0e @@ -299,6 +305,7 @@ #define IS_739X(a) (IS_7395(a) || IS_7398(a)) #define VSC73XX_POLL_SLEEP_US 1000 +#define VSC73XX_MDIO_POLL_SLEEP_US 5 #define VSC73XX_POLL_TIMEOUT_US 10000 struct vsc73xx_counter { @@ -527,6 +534,22 @@ static int vsc73xx_detect(struct vsc73xx *vsc) return 0; } +static int vsc73xx_mdio_busy_check(struct vsc73xx *vsc) +{ + int ret, err; + u32 val; + + ret = read_poll_timeout(vsc73xx_read, err, + err < 0 || !(val & VSC73XX_MII_STAT_BUSY), + VSC73XX_MDIO_POLL_SLEEP_US, + VSC73XX_POLL_TIMEOUT_US, false, vsc, + VSC73XX_BLOCK_MII, VSC73XX_BLOCK_MII_INTERNAL, + VSC73XX_MII_STAT, &val); + if (ret) + return ret; + return err; +} + static int vsc73xx_phy_read(struct dsa_switch *ds, int phy, int regnum) { struct vsc73xx *vsc = ds->priv; @@ -534,12 +557,20 @@ static int vsc73xx_phy_read(struct dsa_switch *ds, int phy, int regnum) u32 val; int ret; + ret = vsc73xx_mdio_busy_check(vsc); + if (ret) + return ret; + /* Setting bit 26 means "read" */ cmd = BIT(26) | (phy << 21) | (regnum << 16); ret = vsc73xx_write(vsc, VSC73XX_BLOCK_MII, 0, 1, cmd); if (ret) return ret; - msleep(2); + + ret = vsc73xx_mdio_busy_check(vsc); + if (ret) + return ret; + ret = vsc73xx_read(vsc, VSC73XX_BLOCK_MII, 0, 2, &val); if (ret) return ret; @@ -563,6 +594,10 @@ static int vsc73xx_phy_write(struct dsa_switch *ds, int phy, int regnum, u32 cmd; int ret; + ret = vsc73xx_mdio_busy_check(vsc); + if (ret) + return ret; + /* It was found through tedious experiments that this router * chip really hates to have it's PHYs reset. They * never recover if that happens: autonegotiation stops From 9f9a72654622bae75adb1e1923d709e96ede3042 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:05 +0200 Subject: [PATCH 0743/1052] net: dsa: vsc73xx: allow phy resetting Resetting the VSC73xx PHY was problematic because the MDIO bus, without a busy check, read and wrote incorrect register values. My investigation indicates that resetting the PHY only triggers changes in configuration. However, improper register values written earlier were only exposed after a soft reset. The reset itself wasn't the issue; rather, the problem stemmed from incorrect read and write operations. A 'soft_reset' can now proceed normally. There are no reasons to keep the VSC73xx from being reset. This commit removes the reset blockade in the 'vsc73xx_phy_write' function. Reviewed-by: Linus Walleij Reviewed-by: Florian Fainelli Signed-off-by: Pawel Dembicki Signed-off-by: David S. 
Miller --- drivers/net/dsa/vitesse-vsc73xx-core.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index a789b2da9b7d..e3f95d2cc2c1 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -598,17 +598,6 @@ static int vsc73xx_phy_write(struct dsa_switch *ds, int phy, int regnum, if (ret) return ret; - /* It was found through tedious experiments that this router - * chip really hates to have it's PHYs reset. They - * never recover if that happens: autonegotiation stops - * working after a reset. Just filter out this command. - * (Resetting the whole chip is OK.) - */ - if (regnum == 0 && (val & BIT(15))) { - dev_info(vsc->dev, "reset PHY - disallowed\n"); - return 0; - } - cmd = (phy << 21) | (regnum << 16) | val; ret = vsc73xx_write(vsc, VSC73XX_BLOCK_MII, 0, 1, cmd); if (ret) From de7a670f8defe4ed2115552ad23dea0f432f7be4 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:06 +0200 Subject: [PATCH 0744/1052] net: phy: vitesse: repair vsc73xx autonegotiation When the vsc73xx mdio bus work properly, the generic autonegotiation configuration works well. Reviewed-by: Linus Walleij Signed-off-by: Pawel Dembicki Signed-off-by: David S. Miller --- drivers/net/phy/vitesse.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/net/phy/vitesse.c b/drivers/net/phy/vitesse.c index 897b979ec03c..3b5fcaf0dd36 100644 --- a/drivers/net/phy/vitesse.c +++ b/drivers/net/phy/vitesse.c @@ -237,16 +237,6 @@ static int vsc739x_config_init(struct phy_device *phydev) return 0; } -static int vsc73xx_config_aneg(struct phy_device *phydev) -{ - /* The VSC73xx switches does not like to be instructed to - * do autonegotiation in any way, it prefers that you just go - * with the power-on/reset defaults. Writing some registers will - * just make autonegotiation permanently fail. - */ - return 0; -} - /* This adds a skew for both TX and RX clocks, so the skew should only be * applied to "rgmii-id" interfaces. It may not work as expected * on "rgmii-txid", "rgmii-rxid" or "rgmii" interfaces. 
@@ -444,7 +434,6 @@ static struct phy_driver vsc82xx_driver[] = { .phy_id_mask = 0x000ffff0, /* PHY_GBIT_FEATURES */ .config_init = vsc738x_config_init, - .config_aneg = vsc73xx_config_aneg, .read_page = vsc73xx_read_page, .write_page = vsc73xx_write_page, }, { @@ -453,7 +442,6 @@ static struct phy_driver vsc82xx_driver[] = { .phy_id_mask = 0x000ffff0, /* PHY_GBIT_FEATURES */ .config_init = vsc738x_config_init, - .config_aneg = vsc73xx_config_aneg, .read_page = vsc73xx_read_page, .write_page = vsc73xx_write_page, }, { @@ -462,7 +450,6 @@ static struct phy_driver vsc82xx_driver[] = { .phy_id_mask = 0x000ffff0, /* PHY_GBIT_FEATURES */ .config_init = vsc739x_config_init, - .config_aneg = vsc73xx_config_aneg, .read_page = vsc73xx_read_page, .write_page = vsc73xx_write_page, }, { @@ -471,7 +458,6 @@ static struct phy_driver vsc82xx_driver[] = { .phy_id_mask = 0x000ffff0, /* PHY_GBIT_FEATURES */ .config_init = vsc739x_config_init, - .config_aneg = vsc73xx_config_aneg, .read_page = vsc73xx_read_page, .write_page = vsc73xx_write_page, }, { From e7a9af8c93aa9f408f9972809b642faeec5287e1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Aug 2024 11:32:47 +0200 Subject: [PATCH 0745/1052] powerpc/mm: Fix size of allocated PGDIR Commit 6b0e82791bd0 ("powerpc/e500: switch to 64 bits PGD on 85xx (32 bits)") increased the size of PGD entries but failed to increase the PGD directory. Use the size of pgd_t instead of the size of pointers to calculate the allocated size. Reported-by: Guenter Roeck Fixes: 6b0e82791bd0 ("powerpc/e500: switch to 64 bits PGD on 85xx (32 bits)") Signed-off-by: Christophe Leroy Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://msgid.link/1cdaacb391cbd3e0240f0e0faf691202874e9422.1723109462.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/init-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 9b4a675eb8f8..2978fcbe307e 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -73,7 +73,7 @@ void setup_kup(void) #define CTOR(shift) static void ctor_##shift(void *addr) \ { \ - memset(addr, 0, sizeof(void *) << (shift)); \ + memset(addr, 0, sizeof(pgd_t) << (shift)); \ } CTOR(0); CTOR(1); CTOR(2); CTOR(3); CTOR(4); CTOR(5); CTOR(6); CTOR(7); @@ -117,7 +117,7 @@ EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */ void pgtable_cache_add(unsigned int shift) { char *name; - unsigned long table_size = sizeof(void *) << shift; + unsigned long table_size = sizeof(pgd_t) << shift; unsigned long align = table_size; /* When batching pgtable pointers for RCU freeing, we store From e7e846dc6c73fbc94ae8b4ec20d05627646416f2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Aug 2024 09:05:08 +0200 Subject: [PATCH 0746/1052] powerpc/mm: Fix boot warning with hugepages and CONFIG_DEBUG_VIRTUAL Booting with CONFIG_DEBUG_VIRTUAL leads to following warning when passing hugepage reservation on command line: Kernel command line: hugepagesz=1g hugepages=1 hugepagesz=64m hugepages=1 hugepagesz=256m hugepages=1 noreboot HugeTLB: allocating 1 of page size 1.00 GiB failed. Only allocated 0 hugepages. 
------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at arch/powerpc/include/asm/io.h:948 __alloc_bootmem_huge_page+0xd4/0x284 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 6.10.0-rc6-00396-g6b0e82791bd0-dirty #936 Hardware name: MPC8544DS e500v2 0x80210030 MPC8544 DS NIP: c1020240 LR: c10201d0 CTR: 00000000 REGS: c13fdd30 TRAP: 0700 Not tainted (6.10.0-rc6-00396-g6b0e82791bd0-dirty) MSR: 00021000 CR: 44084288 XER: 20000000 GPR00: c10201d0 c13fde20 c130b560 e8000000 e8001000 00000000 00000000 c1420000 GPR08: 00000000 00028001 00000000 00000004 44084282 01066ac0 c0eb7c9c efffe149 GPR16: c0fc4228 0000005f ffffffff c0eb7d0c c0eb7cc0 c0eb7ce0 ffffffff 00000000 GPR24: c1441cec efffe153 e8001000 c14240c0 00000000 c1441d64 00000000 e8000000 NIP [c1020240] __alloc_bootmem_huge_page+0xd4/0x284 LR [c10201d0] __alloc_bootmem_huge_page+0x64/0x284 Call Trace: [c13fde20] [c10201d0] __alloc_bootmem_huge_page+0x64/0x284 (unreliable) [c13fde50] [c10207b8] hugetlb_hstate_alloc_pages+0x8c/0x3e8 [c13fdeb0] [c1021384] hugepages_setup+0x240/0x2cc [c13fdef0] [c1000574] unknown_bootoption+0xfc/0x280 [c13fdf30] [c0078904] parse_args+0x200/0x4c4 [c13fdfa0] [c1000d9c] start_kernel+0x238/0x7d0 [c13fdff0] [c0000434] set_ivor+0x12c/0x168 Code: 554aa33e 7c042840 3ce0c142 80a7427c 5109a016 50caa016 7c9a2378 7fdcf378 4180000c 7c052040 41810160 7c095040 <0fe00000> 38c00000 40800108 3c60c0eb ---[ end trace 0000000000000000 ]--- This is due to virt_addr_valid() using high_memory before it is set. high_memory is set in mem_init() using max_low_pfn, but max_low_pfn is available long before, it is set in mem_topology_setup(). So just like commit daa9ada2093e ("powerpc/mm: Fix boot crash with FLATMEM") moved the setting of max_mapnr immediately after the call to mem_topology_setup(), the same can be done for high_memory. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/62b69c4baad067093f39e7e60df0fe27a86b8d2a.1723100702.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/setup-common.c | 1 + arch/powerpc/mm/mem.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 4bd2f87616ba..943430077375 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -959,6 +959,7 @@ void __init setup_arch(char **cmdline_p) mem_topology_setup(); /* Set max_mapnr before paging_init() */ set_max_mapnr(max_pfn); + high_memory = (void *)__va(max_low_pfn * PAGE_SIZE); /* * Release secondary cpus out of their spinloops at 0x60 now that diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index d325217ab201..da21cb018984 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -290,8 +290,6 @@ void __init mem_init(void) swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags); #endif - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - kasan_late_init(); memblock_free_all(); From c25504a0ba36968f919aa30caff172ef23346299 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 9 Aug 2024 16:06:53 -0400 Subject: [PATCH 0747/1052] dt-bindings: net: fsl,qoriq-mc-dpmac: add missed property phys Add missed property phys, which indicate how connect to serdes phy. Fix below warning: arch/arm64/boot/dts/freescale/fsl-lx2160a-honeycomb.dtb: fsl-mc@80c000000: dpmacs:ethernet@7: Unevaluated properties are not allowed ('phys' was unexpected) Signed-off-by: Frank Li Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml b/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml index a1b71b35319e..42f9843d1868 100644 --- a/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml +++ b/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml @@ -38,6 +38,10 @@ properties: managed: true + phys: + description: A reference to the SerDes lane(s) + maxItems: 1 + required: - reg From 32316f676b4ee87c0404d333d248ccf777f739bc Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 9 Aug 2024 14:01:24 -0700 Subject: [PATCH 0748/1052] net: mana: Fix RX buf alloc_size alignment and atomic op panic The MANA driver's RX buffer alloc_size is passed into napi_build_skb() to create SKB. skb_shinfo(skb) is located at the end of skb, and its alignment is affected by the alloc_size passed into napi_build_skb(). The size needs to be aligned properly for better performance and atomic operations. Otherwise, on ARM64 CPU, for certain MTU settings like 4000, atomic operations may panic on the skb_shinfo(skb)->dataref due to alignment fault. To fix this bug, add proper alignment to the alloc_size calculation. Sample panic info: [ 253.298819] Unable to handle kernel paging request at virtual address ffff000129ba5cce [ 253.300900] Mem abort info: [ 253.301760] ESR = 0x0000000096000021 [ 253.302825] EC = 0x25: DABT (current EL), IL = 32 bits [ 253.304268] SET = 0, FnV = 0 [ 253.305172] EA = 0, S1PTW = 0 [ 253.306103] FSC = 0x21: alignment fault Call trace: __skb_clone+0xfc/0x198 skb_clone+0x78/0xe0 raw6_local_deliver+0xfc/0x228 ip6_protocol_deliver_rcu+0x80/0x500 ip6_input_finish+0x48/0x80 ip6_input+0x48/0xc0 ip6_sublist_rcv_finish+0x50/0x78 ip6_sublist_rcv+0x1cc/0x2b8 ipv6_list_rcv+0x100/0x150 __netif_receive_skb_list_core+0x180/0x220 netif_receive_skb_list_internal+0x198/0x2a8 __napi_poll+0x138/0x250 net_rx_action+0x148/0x330 handle_softirqs+0x12c/0x3a0 Cc: stable@vger.kernel.org Fixes: 80f6215b450e ("net: mana: Add support for jumbo frame") Signed-off-by: Haiyang Zhang Reviewed-by: Long Li Signed-off-by: David S. Miller --- drivers/net/ethernet/microsoft/mana/mana_en.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index d2f07e179e86..ae717d06e66f 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -599,7 +599,11 @@ static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, else *headroom = XDP_PACKET_HEADROOM; - *alloc_size = mtu + MANA_RXBUF_PAD + *headroom; + *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + + /* Using page pool in this case, so alloc_size is PAGE_SIZE */ + if (*alloc_size < PAGE_SIZE) + *alloc_size = PAGE_SIZE; *datasize = mtu + ETH_HLEN; } From db1b4bedb9b97c6d34b03d03815147c04fffe8b4 Mon Sep 17 00:00:00 2001 From: Zheng Zhang Date: Sat, 10 Aug 2024 13:26:51 +0800 Subject: [PATCH 0749/1052] net: ethernet: mtk_wed: fix use-after-free panic in mtk_wed_setup_tc_block_cb() When there are multiple ap interfaces on one band and with WED on, turning the interface down will cause a kernel panic on MT798X. Previously, cb_priv was freed in mtk_wed_setup_tc_block() without marking NULL,and mtk_wed_setup_tc_block_cb() didn't check the value, too. 
Assign NULL after free cb_priv in mtk_wed_setup_tc_block() and check NULL in mtk_wed_setup_tc_block_cb(). ---------- Unable to handle kernel paging request at virtual address 0072460bca32b4f5 Call trace: mtk_wed_setup_tc_block_cb+0x4/0x38 0xffffffc0794084bc tcf_block_playback_offloads+0x70/0x1e8 tcf_block_unbind+0x6c/0xc8 ... --------- Fixes: 799684448e3e ("net: ethernet: mtk_wed: introduce wed wo support") Signed-off-by: Zheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_wed.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 61334a71058c..e212a4ba9275 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -2666,14 +2666,15 @@ mtk_wed_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_pri { struct mtk_wed_flow_block_priv *priv = cb_priv; struct flow_cls_offload *cls = type_data; - struct mtk_wed_hw *hw = priv->hw; + struct mtk_wed_hw *hw = NULL; - if (!tc_can_offload(priv->dev)) + if (!priv || !tc_can_offload(priv->dev)) return -EOPNOTSUPP; if (type != TC_SETUP_CLSFLOWER) return -EOPNOTSUPP; + hw = priv->hw; return mtk_flow_offload_cmd(hw->eth, cls, hw->index); } @@ -2729,6 +2730,7 @@ mtk_wed_setup_tc_block(struct mtk_wed_hw *hw, struct net_device *dev, flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); kfree(block_cb->cb_priv); + block_cb->cb_priv = NULL; } return 0; default: From 497d370a644d95a9f04271aa92cb96d32e84c770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Fri, 9 Aug 2024 12:18:45 -0300 Subject: [PATCH 0750/1052] drm/v3d: Fix out-of-bounds read in `v3d_csd_job_run()` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When enabling UBSAN on Raspberry Pi 5, we get the following warning: [ 387.894977] UBSAN: array-index-out-of-bounds in drivers/gpu/drm/v3d/v3d_sched.c:320:3 [ 387.903868] index 7 is out of range for type '__u32 [7]' [ 387.909692] CPU: 0 PID: 1207 Comm: kworker/u16:2 Tainted: G WC 6.10.3-v8-16k-numa #151 [ 387.919166] Hardware name: Raspberry Pi 5 Model B Rev 1.0 (DT) [ 387.925961] Workqueue: v3d_csd drm_sched_run_job_work [gpu_sched] [ 387.932525] Call trace: [ 387.935296] dump_backtrace+0x170/0x1b8 [ 387.939403] show_stack+0x20/0x38 [ 387.942907] dump_stack_lvl+0x90/0xd0 [ 387.946785] dump_stack+0x18/0x28 [ 387.950301] __ubsan_handle_out_of_bounds+0x98/0xd0 [ 387.955383] v3d_csd_job_run+0x3a8/0x438 [v3d] [ 387.960707] drm_sched_run_job_work+0x520/0x6d0 [gpu_sched] [ 387.966862] process_one_work+0x62c/0xb48 [ 387.971296] worker_thread+0x468/0x5b0 [ 387.975317] kthread+0x1c4/0x1e0 [ 387.978818] ret_from_fork+0x10/0x20 [ 387.983014] ---[ end trace ]--- This happens because the UAPI provides only seven configuration registers and we are reading the eighth position of this u32 array. Therefore, fix the out-of-bounds read in `v3d_csd_job_run()` by accessing only seven positions on the '__u32 [7]' array. The eighth register exists indeed on V3D 7.1, but it isn't currently used. That being so, let's guarantee that it remains unused and add a note that it could be set in a future patch. 
Fixes: 0ad5bc1ce463 ("drm/v3d: fix up register addresses for V3D 7.x") Reported-by: Tvrtko Ursulin Signed-off-by: Maíra Canal Reviewed-by: Iago Toral Quiroga Link: https://patchwork.freedesktop.org/patch/msgid/20240809152001.668314-1-mcanal@igalia.com --- drivers/gpu/drm/v3d/v3d_sched.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 9bd7453b25ad..b8682818bafa 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -315,7 +315,7 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) struct v3d_dev *v3d = job->base.v3d; struct drm_device *dev = &v3d->drm; struct dma_fence *fence; - int i, csd_cfg0_reg, csd_cfg_reg_count; + int i, csd_cfg0_reg; v3d->csd_job = job; @@ -335,9 +335,17 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) v3d_switch_perfmon(v3d, &job->base); csd_cfg0_reg = V3D_CSD_QUEUED_CFG0(v3d->ver); - csd_cfg_reg_count = v3d->ver < 71 ? 6 : 7; - for (i = 1; i <= csd_cfg_reg_count; i++) + for (i = 1; i <= 6; i++) V3D_CORE_WRITE(0, csd_cfg0_reg + 4 * i, job->args.cfg[i]); + + /* Although V3D 7.1 has an eighth configuration register, we are not + * using it. Therefore, make sure it remains unused. + * + * XXX: Set the CFG7 register + */ + if (v3d->ver >= 71) + V3D_CORE_WRITE(0, V3D_V7_CSD_QUEUED_CFG7, 0); + /* CFG0 write kicks off the job. */ V3D_CORE_WRITE(0, csd_cfg0_reg, job->args.cfg[0]); From 2a07bb64d80152701d507b1498237ed1b8d83866 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Mon, 12 Aug 2024 14:57:32 +0200 Subject: [PATCH 0751/1052] s390/dasd: Remove DMA alignment This reverts commit bc792884b76f ("s390/dasd: Establish DMA alignment"). Quoting the original commit: linux-next commit bf8d08532bc1 ("iomap: add support for dma aligned direct-io") changes the alignment requirement to come from the block device rather than the block size, and the default alignment requirement is 512-byte boundaries. Since DASD I/O has page alignments for IDAW/TIDAW requests, let's override this value to restore the expected behavior. I mentioned TIDAW, but that was wrong. TIDAWs have no distinct alignment requirement (per p. 15-70 of POPS SA22-7832-13): Unless otherwise specified, TIDAWs may designate a block of main storage on any boundary and length up to 4K bytes, provided the specified block does not cross a 4 K-byte boundary. IDAWs do, but the original commit neglected that while ECKD DASD are typically formatted in 4096-byte blocks, they don't HAVE to be. Formatting an ECKD volume with smaller blocks is permitted (dasdfmt -b xxx), and the problematic commit enforces alignment properties to such a device that will result in errors, such as: [test@host ~]# lsdasd -l a367 | grep blksz blksz: 512 [test@host ~]# mkfs.xfs -f /dev/disk/by-path/ccw-0.0.a367-part1 meta-data=/dev/dasdc1 isize=512 agcount=4, agsize=230075 blks = sectsz=512 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=1 = reflink=1 bigtime=1 inobtcount=1 nrext64=1 data = bsize=4096 blocks=920299, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0, ftype=1 log =internal log bsize=4096 blocks=16384, version=2 = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 error reading existing superblock: Invalid argument mkfs.xfs: pwrite failed: Invalid argument libxfs_bwrite: write failed on (unknown) bno 0x70565c/0x100, err=22 mkfs.xfs: Releasing dirty buffer to free list! found dirty buffer (bulk) on free list! 
mkfs.xfs: pwrite failed: Invalid argument ...snipped... The original commit omitted the FBA discipline for just this reason, but the formatted block size of the other disciplines was overlooked. The solution to all of this is to revert to the original behavior, such that the block size can be respected. There were two commits [1] that moved this code in the interim, so a straight git-revert is not possible, but the change is straightforward. But what of the original problem? That was manifested with a direct-io QEMU guest, where QEMU itself was changed a month or two later with commit 25474d90aa ("block: use the request length for iov alignment") such that the blamed kernel commit is unnecessary. [1] commit 0127a47f58c6 ("dasd: move queue setup to common code") commit fde07a4d74e3 ("dasd: use the atomic queue limits API") Fixes: bc792884b76f ("s390/dasd: Establish DMA alignment") Reviewed-by: Stefan Haberland Signed-off-by: Eric Farman Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20240812125733.126431-2-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_genhd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 1aa426b1dedd..6da47a65af61 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -41,7 +41,6 @@ int dasd_gendisk_alloc(struct dasd_block *block) */ .max_segment_size = PAGE_SIZE, .seg_boundary_mask = PAGE_SIZE - 1, - .dma_alignment = PAGE_SIZE - 1, .max_segments = USHRT_MAX, }; struct gendisk *gdp; From 7db4042336580dfd75cb5faa82c12cd51098c90b Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Mon, 12 Aug 2024 14:57:33 +0200 Subject: [PATCH 0752/1052] s390/dasd: fix error recovery leading to data corruption on ESE devices Extent Space Efficient (ESE) or thin provisioned volumes need to be formatted on demand during usual IO processing. The dasd_ese_needs_format function checks for error codes that signal the non existence of a proper track format. The check for incorrect length is to imprecise since other error cases leading to transport of insufficient data also have this flag set. This might lead to data corruption in certain error cases for example during a storage server warmstart. Fix by removing the check for incorrect length and replacing by explicitly checking for invalid track format in transport mode. Also remove the check for file protected since this is not a valid ESE handling case. 
Cc: stable@vger.kernel.org # 5.3+ Fixes: 5e2b17e712cf ("s390/dasd: Add dynamic formatting support for ESE volumes") Reviewed-by: Jan Hoeppner Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20240812125733.126431-3-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 36 ++++++++++++------- drivers/s390/block/dasd_3990_erp.c | 10 ++---- drivers/s390/block/dasd_eckd.c | 57 +++++++++++++----------------- drivers/s390/block/dasd_int.h | 2 +- 4 files changed, 51 insertions(+), 54 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 0a97cfedd706..42a4a996defb 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1601,9 +1601,15 @@ static int dasd_ese_needs_format(struct dasd_block *block, struct irb *irb) if (!sense) return 0; - return !!(sense[1] & SNS1_NO_REC_FOUND) || - !!(sense[1] & SNS1_FILE_PROTECTED) || - scsw_cstat(&irb->scsw) == SCHN_STAT_INCORR_LEN; + if (sense[1] & SNS1_NO_REC_FOUND) + return 1; + + if ((sense[1] & SNS1_INV_TRACK_FORMAT) && + scsw_is_tm(&irb->scsw) && + !(sense[2] & SNS2_ENV_DATA_PRESENT)) + return 1; + + return 0; } static int dasd_ese_oos_cond(u8 *sense) @@ -1624,7 +1630,7 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, struct dasd_device *device; unsigned long now; int nrf_suppressed = 0; - int fp_suppressed = 0; + int it_suppressed = 0; struct request *req; u8 *sense = NULL; int expires; @@ -1679,8 +1685,9 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, */ sense = dasd_get_sense(irb); if (sense) { - fp_suppressed = (sense[1] & SNS1_FILE_PROTECTED) && - test_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); + it_suppressed = (sense[1] & SNS1_INV_TRACK_FORMAT) && + !(sense[2] & SNS2_ENV_DATA_PRESENT) && + test_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags); nrf_suppressed = (sense[1] & SNS1_NO_REC_FOUND) && test_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); @@ -1695,7 +1702,7 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, return; } } - if (!(fp_suppressed || nrf_suppressed)) + if (!(it_suppressed || nrf_suppressed)) device->discipline->dump_sense_dbf(device, irb, "int"); if (device->features & DASD_FEATURE_ERPLOG) @@ -2459,14 +2466,17 @@ static int _dasd_sleep_on_queue(struct list_head *ccw_queue, int interruptible) rc = 0; list_for_each_entry_safe(cqr, n, ccw_queue, blocklist) { /* - * In some cases the 'File Protected' or 'Incorrect Length' - * error might be expected and error recovery would be - * unnecessary in these cases. Check if the according suppress - * bit is set. + * In some cases certain errors might be expected and + * error recovery would be unnecessary in these cases. + * Check if the according suppress bit is set. 
*/ sense = dasd_get_sense(&cqr->irb); - if (sense && sense[1] & SNS1_FILE_PROTECTED && - test_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags)) + if (sense && (sense[1] & SNS1_INV_TRACK_FORMAT) && + !(sense[2] & SNS2_ENV_DATA_PRESENT) && + test_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags)) + continue; + if (sense && (sense[1] & SNS1_NO_REC_FOUND) && + test_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags)) continue; if (scsw_cstat(&cqr->irb.scsw) == 0x40 && test_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags)) diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c index bbbacfc386f2..d0aa267462c5 100644 --- a/drivers/s390/block/dasd_3990_erp.c +++ b/drivers/s390/block/dasd_3990_erp.c @@ -1386,14 +1386,8 @@ dasd_3990_erp_file_prot(struct dasd_ccw_req * erp) struct dasd_device *device = erp->startdev; - /* - * In some cases the 'File Protected' error might be expected and - * log messages shouldn't be written then. - * Check if the according suppress bit is set. - */ - if (!test_bit(DASD_CQR_SUPPRESS_FP, &erp->flags)) - dev_err(&device->cdev->dev, - "Accessing the DASD failed because of a hardware error\n"); + dev_err(&device->cdev->dev, + "Accessing the DASD failed because of a hardware error\n"); return dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 9388b5c383ca..90b106408992 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -2275,6 +2275,7 @@ dasd_eckd_analysis_ccw(struct dasd_device *device) cqr->status = DASD_CQR_FILLED; /* Set flags to suppress output for expected errors */ set_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); + set_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags); return cqr; } @@ -2556,7 +2557,6 @@ dasd_eckd_build_check_tcw(struct dasd_device *base, struct format_data_t *fdata, cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; /* Set flags to suppress output for expected errors */ - set_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); set_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags); return cqr; @@ -4130,8 +4130,6 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( /* Set flags to suppress output for expected errors */ if (dasd_eckd_is_ese(basedev)) { - set_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); - set_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags); set_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); } @@ -4633,9 +4631,8 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( /* Set flags to suppress output for expected errors */ if (dasd_eckd_is_ese(basedev)) { - set_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); - set_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags); set_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); + set_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags); } return cqr; @@ -5780,36 +5777,32 @@ static void dasd_eckd_dump_sense(struct dasd_device *device, { u8 *sense = dasd_get_sense(irb); - if (scsw_is_tm(&irb->scsw)) { - /* - * In some cases the 'File Protected' or 'Incorrect Length' - * error might be expected and log messages shouldn't be written - * then. Check if the according suppress bit is set. - */ - if (sense && (sense[1] & SNS1_FILE_PROTECTED) && - test_bit(DASD_CQR_SUPPRESS_FP, &req->flags)) - return; - if (scsw_cstat(&irb->scsw) == 0x40 && - test_bit(DASD_CQR_SUPPRESS_IL, &req->flags)) - return; + /* + * In some cases certain errors might be expected and + * log messages shouldn't be written then. + * Check if the according suppress bit is set. 
+ */ + if (sense && (sense[1] & SNS1_INV_TRACK_FORMAT) && + !(sense[2] & SNS2_ENV_DATA_PRESENT) && + test_bit(DASD_CQR_SUPPRESS_IT, &req->flags)) + return; + if (sense && sense[0] & SNS0_CMD_REJECT && + test_bit(DASD_CQR_SUPPRESS_CR, &req->flags)) + return; + + if (sense && sense[1] & SNS1_NO_REC_FOUND && + test_bit(DASD_CQR_SUPPRESS_NRF, &req->flags)) + return; + + if (scsw_cstat(&irb->scsw) == 0x40 && + test_bit(DASD_CQR_SUPPRESS_IL, &req->flags)) + return; + + if (scsw_is_tm(&irb->scsw)) dasd_eckd_dump_sense_tcw(device, req, irb); - } else { - /* - * In some cases the 'Command Reject' or 'No Record Found' - * error might be expected and log messages shouldn't be - * written then. Check if the according suppress bit is set. - */ - if (sense && sense[0] & SNS0_CMD_REJECT && - test_bit(DASD_CQR_SUPPRESS_CR, &req->flags)) - return; - - if (sense && sense[1] & SNS1_NO_REC_FOUND && - test_bit(DASD_CQR_SUPPRESS_NRF, &req->flags)) - return; - + else dasd_eckd_dump_sense_ccw(device, req, irb); - } } static int dasd_eckd_reload_device(struct dasd_device *device) diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index e5f40536b425..81cfb5c89681 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -196,7 +196,7 @@ struct dasd_ccw_req { * The following flags are used to suppress output of certain errors. */ #define DASD_CQR_SUPPRESS_NRF 4 /* Suppress 'No Record Found' error */ -#define DASD_CQR_SUPPRESS_FP 5 /* Suppress 'File Protected' error*/ +#define DASD_CQR_SUPPRESS_IT 5 /* Suppress 'Invalid Track' error*/ #define DASD_CQR_SUPPRESS_IL 6 /* Suppress 'Incorrect Length' error */ #define DASD_CQR_SUPPRESS_CR 7 /* Suppress 'Command Reject' error */ From 84f2eecf95018386c145ada19bb45b03bdb80d9e Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 11 Aug 2024 14:07:11 -0400 Subject: [PATCH 0753/1052] io_uring/napi: check napi_enabled in io_napi_add() before proceeding doing so avoids the overhead of adding napi ids to all the rings that do not enable napi. if no id is added to napi_list because napi is disabled, __io_napi_busy_loop() will not be called. 
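For context, only rings that explicitly opt in via IORING_REGISTER_NAPI are affected. A minimal userspace sketch of that opt-in, assuming liburing's io_uring_register_napi() helper and its io_uring_napi structure (the timeout value is illustrative):

        #include <liburing.h>

        /* Rings that never register NAPI settings keep napi_enabled false,
         * so with this patch io_napi_add() bails out early for them. */
        static int enable_napi_busy_poll(struct io_uring *ring)
        {
                struct io_uring_napi napi = {
                        .busy_poll_to = 100,    /* busy-poll timeout, in microseconds */
                        .prefer_busy_poll = 1,
                };

                return io_uring_register_napi(ring, &napi);
        }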
Signed-off-by: Olivier Langlois Fixes: b4ccc4dd1330 ("io_uring/napi: enable even with a timeout of 0") Link: https://lore.kernel.org/r/bd989ccef5fda14f5fd9888faf4fefcf66bd0369.1723400131.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 2 +- io_uring/napi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index a3dc3762008f..73c4159e8405 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -302,7 +302,7 @@ void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); - if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled) + if (!(ctx->flags & IORING_SETUP_SQPOLL)) io_napi_blocking_busy_loop(ctx, iowq); } diff --git a/io_uring/napi.h b/io_uring/napi.h index 88f1c21d5548..27b88c3eb428 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -55,7 +55,7 @@ static inline void io_napi_add(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct socket *sock; - if (!READ_ONCE(ctx->napi_busy_poll_dt)) + if (!READ_ONCE(ctx->napi_enabled)) return; sock = sock_from_file(req->file); From 48cc7ecd3a68e0fbfa281ef1ed6f6b6cb7638390 Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 11 Aug 2024 20:34:46 -0400 Subject: [PATCH 0754/1052] io_uring/napi: remove duplicate io_napi_entry timeout assignation io_napi_entry() has 2 calling sites. One of them is unlikely to find an entry and if it does, the timeout should arguable not be updated. The other io_napi_entry() calling site is overwriting the update made by io_napi_entry() so the io_napi_entry() timeout value update has no or little value and therefore is removed. Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/145b54ff179f87609e20dffaf5563c07cdbcad1a.1723423275.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index 73c4159e8405..1de1d4d62925 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -26,7 +26,6 @@ static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, hlist_for_each_entry_rcu(e, hash_list, node) { if (e->napi_id != napi_id) continue; - e->timeout = jiffies + NAPI_TIMEOUT; return e; } From 98055bc3595500bcf2126b93b1595354bdb86a66 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 May 2024 21:17:32 +0100 Subject: [PATCH 0755/1052] netfs: Fault in smaller chunks for non-large folio mappings As in commit 4e527d5841e2 ("iomap: fault in smaller chunks for non-large folio mappings"), we can see a performance loss for filesystems which have not yet been converted to large folios. Fixes: c38f4e96e605 ("netfs: Provide func to copy data to pagecache for buffered write") Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240527201735.1898381-1-willy@infradead.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 4726c315453c..ca53c5d1622e 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -184,7 +184,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? 
BDP_ASYNC : 0; ssize_t written = 0, ret, ret2; loff_t i_size, pos = iocb->ki_pos, from, to; - size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; + size_t max_chunk = mapping_max_folio_size(mapping); bool maybe_trouble = false; if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || From 3f65f3c099bcb27949e712f39ba836f21785924a Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 29 Jul 2024 15:48:12 -0700 Subject: [PATCH 0756/1052] filelock: fix name of file_lease slab cache When struct file_lease was split out from struct file_lock, the name of the file_lock slab cache was copied to the new slab cache for file_lease. This name conflict causes confusion in /proc/slabinfo and /sys/kernel/slab. In particular, it caused failures in drgn's test case for slab cache merging. Link: https://github.com/osandov/drgn/blob/9ad29fd86499eb32847473e928b6540872d3d59a/tests/linux_kernel/helpers/test_slab.py#L81 Fixes: c69ff4071935 ("filelock: split leases out of struct file_lock") Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/2d1d053da1cafb3e7940c4f25952da4f0af34e38.1722293276.git.osandov@fb.com Reviewed-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/locks.c b/fs/locks.c index 9afb16e0683f..e45cad40f8b6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2984,7 +2984,7 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - filelease_cache = kmem_cache_create("file_lock_cache", + filelease_cache = kmem_cache_create("file_lease_cache", sizeof(struct file_lease), 0, SLAB_PANIC, NULL); for_each_possible_cpu(i) { From f71aa06398aabc2e3eaac25acdf3d62e0094ba70 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 29 Jul 2024 17:19:30 +0100 Subject: [PATCH 0757/1052] fs/netfs/fscache_cookie: add missing "n_accesses" check This fixes a NULL pointer dereference bug due to a data race which looks like this: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 33 PID: 16573 Comm: kworker/u97:799 Not tainted 6.8.7-cm4all1-hp+ #43 Hardware name: HP ProLiant DL380 Gen9/ProLiant DL380 Gen9, BIOS P89 10/17/2018 Workqueue: events_unbound netfs_rreq_write_to_cache_work RIP: 0010:cachefiles_prepare_write+0x30/0xa0 Code: 57 41 56 45 89 ce 41 55 49 89 cd 41 54 49 89 d4 55 53 48 89 fb 48 83 ec 08 48 8b 47 08 48 83 7f 10 00 48 89 34 24 48 8b 68 20 <48> 8b 45 08 4c 8b 38 74 45 49 8b 7f 50 e8 4e a9 b0 ff 48 8b 73 10 RSP: 0018:ffffb4e78113bde0 EFLAGS: 00010286 RAX: ffff976126be6d10 RBX: ffff97615cdb8438 RCX: 0000000000020000 RDX: ffff97605e6c4c68 RSI: ffff97605e6c4c60 RDI: ffff97615cdb8438 RBP: 0000000000000000 R08: 0000000000278333 R09: 0000000000000001 R10: ffff97605e6c4600 R11: 0000000000000001 R12: ffff97605e6c4c68 R13: 0000000000020000 R14: 0000000000000001 R15: ffff976064fe2c00 FS: 0000000000000000(0000) GS:ffff9776dfd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000005942c002 CR4: 00000000001706f0 Call Trace: ? __die+0x1f/0x70 ? page_fault_oops+0x15d/0x440 ? search_module_extables+0xe/0x40 ? fixup_exception+0x22/0x2f0 ? exc_page_fault+0x5f/0x100 ? asm_exc_page_fault+0x22/0x30 ? 
cachefiles_prepare_write+0x30/0xa0 netfs_rreq_write_to_cache_work+0x135/0x2e0 process_one_work+0x137/0x2c0 worker_thread+0x2e9/0x400 ? __pfx_worker_thread+0x10/0x10 kthread+0xcc/0x100 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Modules linked in: CR2: 0000000000000008 ---[ end trace 0000000000000000 ]--- This happened because fscache_cookie_state_machine() was slow and was still running while another process invoked fscache_unuse_cookie(); this led to a fscache_cookie_lru_do_one() call, setting the FSCACHE_COOKIE_DO_LRU_DISCARD flag, which was picked up by fscache_cookie_state_machine(), withdrawing the cookie via cachefiles_withdraw_cookie(), clearing cookie->cache_priv. At the same time, yet another process invoked cachefiles_prepare_write(), which found a NULL pointer in this code line: struct cachefiles_object *object = cachefiles_cres_object(cres); The next line crashes, obviously: struct cachefiles_cache *cache = object->volume->cache; During cachefiles_prepare_write(), the "n_accesses" counter is non-zero (via fscache_begin_operation()). The cookie must not be withdrawn until it drops to zero. The counter is checked by fscache_cookie_state_machine() before switching to FSCACHE_COOKIE_STATE_RELINQUISHING and FSCACHE_COOKIE_STATE_WITHDRAWING (in "case FSCACHE_COOKIE_STATE_FAILED"), but not for FSCACHE_COOKIE_STATE_LRU_DISCARDING ("case FSCACHE_COOKIE_STATE_ACTIVE"). This patch adds the missing check. With a non-zero access counter, the function returns and the next fscache_end_cookie_access() call will queue another fscache_cookie_state_machine() call to handle the still-pending FSCACHE_COOKIE_DO_LRU_DISCARD. Fixes: 12bb21a29c19 ("fscache: Implement cookie user counting and resource pinning") Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240729162002.3436763-2-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/fscache_cookie.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index bce2492186d0..d4d4b3a8b106 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -741,6 +741,10 @@ static void fscache_cookie_state_machine(struct fscache_cookie *cookie) spin_lock(&cookie->lock); } if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) { + if (atomic_read(&cookie->n_accesses) != 0) + /* still being accessed: postpone it */ + break; + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_LRU_DISCARDING); wake = true; From 42b0f8da3acc87953161baeb24f756936eb4d4b2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 31 Jul 2024 07:47:27 +0200 Subject: [PATCH 0758/1052] nsfs: fix ioctl declaration The kernel is writing an object of type __u64, so the ioctl has to be defined to _IOR(NSIO, 0x5, __u64) instead of _IO(NSIO, 0x5). Reported-by: Dmitry V. 
Levin Link: https://lore.kernel.org/r/20240730164554.GA18486@altlinux.org Signed-off-by: Christian Brauner --- include/uapi/linux/nsfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index b133211331f6..5fad3d0fcd70 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -3,6 +3,7 @@ #define __LINUX_NSFS_H #include +#include #define NSIO 0xb7 @@ -16,7 +17,7 @@ /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) /* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IO(NSIO, 0x5) +#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ From 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a Mon Sep 17 00:00:00 2001 From: yangerkun Date: Wed, 31 Jul 2024 12:38:35 +0800 Subject: [PATCH 0759/1052] libfs: fix infinite directory reads for offset dir After we switch tmpfs dir operations from simple_dir_operations to simple_offset_dir_operations, every rename happened will fill new dentry to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free key starting with octx->newx_offset, and then set newx_offset equals to free key + 1. This will lead to infinite readdir combine with rename happened at the same time, which fail generic/736 in xfstests(detail show as below). 1. create 5000 files(1 2 3...) under one dir 2. call readdir(man 3 readdir) once, and get one entry 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry) 4. loop 2~3, until readdir return nothing or we loop too many times(tmpfs break test with the second condition) We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite directory reads") to fix it, record the last_index when we open dir, and do not emit the entry which index >= last_index. The file->private_data now used in offset dir can use directly to do this, and we also update the last_index when we llseek the dir file. 
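A rough userspace sketch of the readdir/rename loop from steps 2-4 above (a simplified stand-in for generic/736, not the actual xfstests code; the path handling and iteration cap are illustrative):

        #include <dirent.h>
        #include <stdio.h>
        #include <string.h>

        /* Without the fix, each rename gives the entry a new, higher offset in
         * the maple tree, so this readdir stream on tmpfs never terminates. */
        static long spin_readdir(const char *dirpath)
        {
                char oldpath[4096], tmppath[4096];
                struct dirent *de;
                long iterations = 0;
                DIR *dir = opendir(dirpath);

                if (!dir)
                        return -1;

                while ((de = readdir(dir)) && iterations < 100000) {
                        if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."))
                                continue;
                        snprintf(oldpath, sizeof(oldpath), "%s/%s", dirpath, de->d_name);
                        snprintf(tmppath, sizeof(tmppath), "%s/TEMPFILE", dirpath);
                        rename(oldpath, tmppath);
                        rename(tmppath, oldpath);
                        iterations++;
                }

                closedir(dir);
                return iterations;
        }

With the fix, entries renamed to offsets at or beyond the last_index recorded at open time are no longer emitted, so the loop reaches end-of-directory.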
Fixes: a2e459555c5f ("shmem: stable directory offsets") Signed-off-by: yangerkun Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com Reviewed-by: Chuck Lever [brauner: only update last_index after seek when offset is zero like Jan suggested] Signed-off-by: Christian Brauner --- fs/libfs.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 8aa34870449f..02602d00939e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -450,6 +450,14 @@ void simple_offset_destroy(struct offset_ctx *octx) mtree_destroy(&octx->mt); } +static int offset_dir_open(struct inode *inode, struct file *file) +{ + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + + file->private_data = (void *)ctx->next_offset; + return 0; +} + /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated @@ -463,6 +471,9 @@ void simple_offset_destroy(struct offset_ctx *octx) */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { + struct inode *inode = file->f_inode; + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -476,7 +487,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) } /* In this case, ->private_data is protected by f_pos_lock */ - file->private_data = NULL; + if (!offset) + file->private_data = (void *)ctx->next_offset; return vfs_setpos(file, offset, LONG_MAX); } @@ -507,7 +519,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index) { struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; @@ -515,17 +527,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) while (true) { dentry = offset_find_next(octx, ctx->pos); if (!dentry) - return ERR_PTR(-ENOENT); + return; + + if (dentry2offset(dentry) >= last_index) { + dput(dentry); + return; + } if (!offset_dir_emit(ctx, dentry)) { dput(dentry); - break; + return; } ctx->pos = dentry2offset(dentry) + 1; dput(dentry); } - return NULL; } /** @@ -552,22 +568,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; + long last_index = (long)file->private_data; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; - /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == DIR_OFFSET_MIN) - file->private_data = NULL; - else if (file->private_data == ERR_PTR(-ENOENT)) - return 0; - file->private_data = offset_iterate_dir(d_inode(dir), ctx); + offset_iterate_dir(d_inode(dir), ctx, last_index); return 0; } const struct file_operations simple_offset_dir_operations = { + .open = offset_dir_open, .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, From 889ced4c9388785952d78d20d338bda2df209bb5 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 31 Jul 2024 09:39:02 +0200 Subject: [PATCH 0760/1052] netfs: clean up after renaming FSCACHE_DEBUG config Commit 6b8e61472529 ("netfs: Rename CONFIG_FSCACHE_DEBUG to CONFIG_NETFS_DEBUG") renames the 
config, but introduces two issues: First, NETFS_DEBUG mistakenly depends on the non-existing config NETFS, whereas the actual intended config is called NETFS_SUPPORT. Second, the config renaming misses to adjust the documentation of the functionality of this config. Clean up those two points. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20240731073902.69262-1-lukas.bulwahn@redhat.com Signed-off-by: Christian Brauner --- Documentation/filesystems/caching/fscache.rst | 8 ++++---- fs/netfs/Kconfig | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/caching/fscache.rst b/Documentation/filesystems/caching/fscache.rst index a74d7b052dc1..de1f32526cc1 100644 --- a/Documentation/filesystems/caching/fscache.rst +++ b/Documentation/filesystems/caching/fscache.rst @@ -318,10 +318,10 @@ where the columns are: Debugging ========= -If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime -debugging enabled by adjusting the value in:: +If CONFIG_NETFS_DEBUG is enabled, the FS-Cache facility and NETFS support can +have runtime debugging enabled by adjusting the value in:: - /sys/module/fscache/parameters/debug + /sys/module/netfs/parameters/debug This is a bitmask of debugging streams to enable: @@ -343,6 +343,6 @@ This is a bitmask of debugging streams to enable: The appropriate set of values should be OR'd together and the result written to the control file. For example:: - echo $((1|8|512)) >/sys/module/fscache/parameters/debug + echo $((1|8|512)) >/sys/module/netfs/parameters/debug will turn on all function entry debugging. diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index 1b78e8b65ebc..7701c037c328 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -24,7 +24,7 @@ config NETFS_STATS config NETFS_DEBUG bool "Enable dynamic debugging netfslib and FS-Cache" - depends on NETFS + depends on NETFS_SUPPORT help This permits debugging to be dynamically enabled in the local caching management module. If this is set, the debugging output may be From 3b5bbe798b2451820e74243b738268f51901e7d0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 31 Jul 2024 12:01:12 +0200 Subject: [PATCH 0761/1052] pidfd: prevent creation of pidfds for kthreads It's currently possible to create pidfds for kthreads but it is unclear what that is supposed to mean. Until we have use-cases for it and we figured out what behavior we want block the creation of pidfds for kthreads. Link: https://lore.kernel.org/r/20240731-gleis-mehreinnahmen-6bbadd128383@brauner Fixes: 32fcb426ec00 ("pid: add pidfd_open()") Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- kernel/fork.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index cc760491f201..18bdc87209d0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2053,11 +2053,24 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - bool thread = flags & PIDFD_THREAD; - - if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) + if (!pid) return -EINVAL; + scoped_guard(rcu) { + struct task_struct *tsk; + + if (flags & PIDFD_THREAD) + tsk = pid_task(pid, PIDTYPE_PID); + else + tsk = pid_task(pid, PIDTYPE_TGID); + if (!tsk) + return -EINVAL; + + /* Don't create pidfds for kernel threads for now. 
*/ + if (tsk->flags & PF_KTHREAD) + return -EINVAL; + } + return __pidfd_prepare(pid, flags, ret); } @@ -2403,6 +2416,12 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + /* Don't create pidfds for kernel threads for now. */ + if (args->kthread) { + retval = -EINVAL; + goto bad_fork_free_pid; + } + /* Note that no task has been attached to @pid yet. */ retval = __pidfd_prepare(pid, flags, &pidfile); if (retval < 0) From 86509e38a80da34d7800985fa2be183475242c8c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 9 Aug 2024 15:50:35 +0200 Subject: [PATCH 0762/1052] file: fix typo in take_fd() comment The explanatory comment above take_fd() contains a typo, fix that to not confuse readers. Signed-off-by: Mathias Krause Link: https://lore.kernel.org/r/20240809135035.748109-1-minipli@grsecurity.net Signed-off-by: Christian Brauner --- include/linux/file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/file.h b/include/linux/file.h index 237931f20739..59b146a14dca 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -110,7 +110,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), * * f = dentry_open(&path, O_RDONLY, current_cred()); * if (IS_ERR(f)) - * return PTR_ERR(fd); + * return PTR_ERR(f); * * fd_install(fd, f); * return take_fd(fd); From 8e5ced7804cb9184c4a23f8054551240562a8eda Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 30 Jul 2024 17:01:40 +0100 Subject: [PATCH 0763/1052] netfs, ceph: Revert "netfs: Remove deprecated use of PG_private_2 as a second writeback flag" This reverts commit ae678317b95e760607c7b20b97c9cd4ca9ed6e1a. Revert the patch that removes the deprecated use of PG_private_2 in netfslib for the moment as Ceph is actually still using this to track data copied to the cache. 
Fixes: ae678317b95e ("netfs: Remove deprecated use of PG_private_2 as a second writeback flag") Reported-by: Max Kellermann Signed-off-by: David Howells cc: Ilya Dryomov cc: Xiubo Li cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org https: //lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk Signed-off-by: Christian Brauner --- fs/ceph/addr.c | 19 ++++- fs/netfs/buffered_read.c | 8 +- fs/netfs/io.c | 144 +++++++++++++++++++++++++++++++++++ include/trace/events/netfs.h | 1 + 4 files changed, 170 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c16bc5250ef..73b5a07bf94d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -498,6 +498,11 @@ const struct netfs_request_ops ceph_netfs_ops = { }; #ifdef CONFIG_CEPH_FSCACHE +static void ceph_set_page_fscache(struct page *page) +{ + folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ +} + static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) { struct inode *inode = priv; @@ -515,6 +520,10 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b ceph_fscache_write_terminated, inode, true, caching); } #else +static inline void ceph_set_page_fscache(struct page *page) +{ +} + static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) { } @@ -706,6 +715,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) len = wlen; set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { @@ -789,6 +800,8 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return AOP_WRITEPAGE_ACTIVATE; } + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ + err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. 
return 0 @@ -1062,7 +1075,8 @@ static int ceph_writepages_start(struct address_space *mapping, unlock_page(page); break; } - if (PageWriteback(page)) { + if (PageWriteback(page) || + PagePrivate2(page) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { doutc(cl, "%p under writeback\n", page); unlock_page(page); @@ -1070,6 +1084,7 @@ static int ceph_writepages_start(struct address_space *mapping, } doutc(cl, "waiting on writeback %p\n", page); wait_on_page_writeback(page); + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ } if (!clear_page_dirty_for_io(page)) { @@ -1254,6 +1269,8 @@ static int ceph_writepages_start(struct address_space *mapping, } set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a688d4c75d99..424048f9ed1f 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -466,7 +466,7 @@ int netfs_write_begin(struct netfs_inode *ctx, if (!netfs_is_cache_enabled(ctx) && netfs_skip_folio_read(folio, pos, len, false)) { netfs_stat(&netfs_n_rh_write_zskip); - goto have_folio; + goto have_folio_no_wait; } rreq = netfs_alloc_request(mapping, file, @@ -507,6 +507,12 @@ int netfs_write_begin(struct netfs_inode *ctx, netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: + if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; + } +have_folio_no_wait: *_folio = folio; _leave(" = 0"); return 0; diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c93851b98368..c179a1c73fa7 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -98,6 +98,146 @@ static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete); } +/* + * [DEPRECATED] Deal with the completion of writing the data to the cache. We + * have to clear the PG_fscache bits on the folios involved and release the + * caller's ref. + * + * May be called in softirq mode and we inherit a ref from the caller. + */ +static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, + bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t unlocked = 0; + bool have_unlocked = false; + + rcu_read_lock(); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); + + xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + if (xas_retry(&xas, folio)) + continue; + + /* We might have multiple writes from the same huge + * folio, but we mustn't unlock a folio more than once. 
+ */ + if (have_unlocked && folio->index <= unlocked) + continue; + unlocked = folio_next_index(folio) - 1; + trace_netfs_folio(folio, netfs_folio_trace_end_copy); + folio_end_private_2(folio); + have_unlocked = true; + } + } + + rcu_read_unlock(); + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) /* [DEPRECATED] */ +{ + struct netfs_io_subrequest *subreq = priv; + struct netfs_io_request *rreq = subreq->rreq; + + if (IS_ERR_VALUE(transferred_or_error)) { + netfs_stat(&netfs_n_rh_write_failed); + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_copy_to_cache); + } else { + netfs_stat(&netfs_n_rh_write_done); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); + + /* If we decrement nr_copy_ops to 0, the ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, was_async); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +} + +/* + * [DEPRECATED] Perform any outstanding writes to the cache. We inherit a ref + * from the caller. + */ +static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct netfs_io_subrequest *subreq, *next, *p; + struct iov_iter iter; + int ret; + + trace_netfs_rreq(rreq, netfs_rreq_trace_copy); + + /* We don't want terminating writes trying to wake us up whilst we're + * still going through the list. + */ + atomic_inc(&rreq->nr_copy_ops); + + list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { + if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + list_del_init(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_no_copy); + } + } + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + /* Amalgamate adjacent writes */ + while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + next = list_next_entry(subreq, rreq_link); + if (next->start != subreq->start + subreq->len) + break; + subreq->len += next->len; + list_del_init(&next->rreq_link); + netfs_put_subrequest(next, false, + netfs_sreq_trace_put_merged); + } + + ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, + subreq->len, rreq->i_size, true); + if (ret < 0) { + trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); + trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); + continue; + } + + iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, + subreq->start, subreq->len); + + atomic_inc(&rreq->nr_copy_ops); + netfs_stat(&netfs_n_rh_write); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); + trace_netfs_sreq(subreq, netfs_sreq_trace_write); + cres->ops->write(cres, subreq->start, &iter, + netfs_rreq_copy_terminated, subreq); + } + + /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, false); +} + +static void netfs_rreq_write_to_cache_work(struct work_struct *work) /* [DEPRECATED] */ +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + + netfs_rreq_do_write_to_cache(rreq); +} + +static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) /* [DEPRECATED] */ +{ + rreq->work.func = netfs_rreq_write_to_cache_work; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); +} + /* * Handle a short read. 
*/ @@ -275,6 +415,10 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async) clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags) && + test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) + return netfs_rreq_write_to_cache(rreq); + netfs_rreq_completed(rreq, was_async); } diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index da23484268df..24ec3434d32e 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -145,6 +145,7 @@ EM(netfs_folio_trace_clear_g, "clear-g") \ EM(netfs_folio_trace_clear_s, "clear-s") \ EM(netfs_folio_trace_copy_to_cache, "mark-copy") \ + EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ EM(netfs_folio_trace_kill, "kill") \ EM(netfs_folio_trace_kill_cc, "kill-cc") \ From 7b589a9b45ae32aa9d7bece597490e141198d7a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Aug 2024 19:38:46 +0100 Subject: [PATCH 0764/1052] netfs: Fix handling of USE_PGPRIV2 and WRITE_TO_CACHE flags The NETFS_RREQ_USE_PGPRIV2 and NETFS_RREQ_WRITE_TO_CACHE flags aren't used correctly. The problem is that we try to set them up in the request initialisation, but we the cache may be in the process of setting up still, and so the state may not be correct. Further, we secondarily sample the cache state and make contradictory decisions later. The issue arises because we set up the cache resources, which allows the cache's ->prepare_read() to switch on NETFS_SREQ_COPY_TO_CACHE - which triggers cache writing even if we didn't set the flags when allocating. Fix this in the following way: (1) Drop NETFS_ICTX_USE_PGPRIV2 and instead set NETFS_RREQ_USE_PGPRIV2 in ->init_request() rather than trying to juggle that in netfs_alloc_request(). (2) Repurpose NETFS_RREQ_USE_PGPRIV2 to merely indicate that if caching is to be done, then PG_private_2 is to be used rather than only setting it if we decide to cache and then having netfs_rreq_unlock_folios() set the non-PG_private_2 writeback-to-cache if it wasn't set. (3) Split netfs_rreq_unlock_folios() into two functions, one of which contains the deprecated code for using PG_private_2 to avoid accidentally doing the writeback path - and always use it if USE_PGPRIV2 is set. (4) As NETFS_ICTX_USE_PGPRIV2 is removed, make netfs_write_begin() always wait for PG_private_2. This function is deprecated and only used by ceph anyway, and so label it so. (5) Drop the NETFS_RREQ_WRITE_TO_CACHE flag and use fscache_operation_valid() on the cache_resources instead. This has the advantage of picking up the result of netfs_begin_cache_read() and fscache_begin_write_operation() - which are called after the object is initialised and will wait for the cache to come to a usable state. Just reverting ae678317b95e[1] isn't a sufficient fix, so this need to be applied on top of that. Without this as well, things like: rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { and: WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386 may happen, along with some UAFs due to PG_private_2 not getting used to wait on writeback completion. 
Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Reported-by: Max Kellermann Signed-off-by: David Howells cc: Ilya Dryomov cc: Xiubo Li cc: Hristo Venev cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk/ [1] Link: https://lore.kernel.org/r/1173209.1723152682@warthog.procyon.org.uk Signed-off-by: Christian Brauner --- fs/ceph/addr.c | 3 + fs/ceph/inode.c | 2 - fs/netfs/buffered_read.c | 125 ++++++++++++++++++++++++++++++----- fs/netfs/objects.c | 10 --- fs/netfs/write_issue.c | 4 +- fs/nfs/fscache.c | 2 + fs/nfs/fscache.h | 2 - include/linux/netfs.h | 3 - include/trace/events/netfs.h | 1 + 9 files changed, 116 insertions(+), 36 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 73b5a07bf94d..cc0a2240de98 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -424,6 +424,9 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) struct ceph_netfs_request_data *priv; int ret = 0; + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + if (rreq->origin != NETFS_READAHEAD) return 0; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f8de8f33abb..71cd70514efa 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -577,8 +577,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) /* Set parameters for the netfs library */ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags); spin_lock_init(&ci->i_ceph_lock); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 424048f9ed1f..27c750d39476 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -9,6 +9,97 @@ #include #include "internal.h" +/* + * [DEPRECATED] Unlock the folios in a read operation for when the filesystem + * is using PG_private_2 and direct writing to the cache from here rather than + * marking the page for writeback. + * + * Note that we don't touch folio->private in this code. + */ +static void netfs_rreq_unlock_folios_pgpriv2(struct netfs_io_request *rreq, + size_t *account) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + bool subreq_failed = false; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + /* Walk through the pagecache and the I/O request lists simultaneously. + * We may have a mixture of cached and uncached sections and we only + * really want to write out the uncached sections. This is slightly + * complicated by the possibility that we might have huge pages with a + * mixture inside. 
+ */ + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + subreq_failed = (subreq->error < 0); + + trace_netfs_rreq(rreq, netfs_rreq_trace_unlock_pgpriv2); + + rcu_read_lock(); + xas_for_each(&xas, folio, last_page) { + loff_t pg_end; + bool pg_failed = false; + bool folio_started = false; + + if (xas_retry(&xas, folio)) + continue; + + pg_end = folio_pos(folio) + folio_size(folio) - 1; + + for (;;) { + loff_t sreq_end; + + if (!subreq) { + pg_failed = true; + break; + } + + if (!folio_started && + test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags) && + fscache_operation_valid(&rreq->cache_resources)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + folio_started = true; + } + + pg_failed |= subreq_failed; + sreq_end = subreq->start + subreq->len - 1; + if (pg_end < sreq_end) + break; + + *account += subreq->transferred; + if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error < 0); + } else { + subreq = NULL; + subreq_failed = false; + } + + if (pg_end == sreq_end) + break; + } + + if (!pg_failed) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) + _debug("no unlock"); + else + folio_unlock(folio); + } + } + rcu_read_unlock(); +} + /* * Unlock the folios in a read operation. We need to set PG_writeback on any * folios we're going to write back before we unlock them. @@ -35,6 +126,12 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } } + /* Handle deprecated PG_private_2 case. */ + if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { + netfs_rreq_unlock_folios_pgpriv2(rreq, &account); + goto out; + } + /* Walk through the pagecache and the I/O request lists simultaneously. * We may have a mixture of cached and uncached sections and we only * really want to write out the uncached sections. 
This is slightly @@ -52,7 +149,6 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) loff_t pg_end; bool pg_failed = false; bool wback_to_cache = false; - bool folio_started = false; if (xas_retry(&xas, folio)) continue; @@ -66,17 +162,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { - if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, - &subreq->flags)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folio_started = true; - } - } else { - wback_to_cache |= - test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - } + + wback_to_cache |= test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) @@ -124,6 +211,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } rcu_read_unlock(); +out: task_io_account_read(account); if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); @@ -395,7 +483,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, } /** - * netfs_write_begin - Helper to prepare for writing + * netfs_write_begin - Helper to prepare for writing [DEPRECATED] * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from @@ -426,6 +514,9 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, * inode before calling this. * * This is usable whether or not caching is enabled. + * + * Note that this should be considered deprecated and netfs_perform_write() + * used instead. */ int netfs_write_begin(struct netfs_inode *ctx, struct file *file, struct address_space *mapping, @@ -507,11 +598,9 @@ int netfs_write_begin(struct netfs_inode *ctx, netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { - ret = folio_wait_private_2_killable(folio); - if (ret < 0) - goto error; - } + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; have_folio_no_wait: *_folio = folio; _leave(" = 0"); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index f4a642727479..0294df70c3ff 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -24,10 +24,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct netfs_io_request *rreq; mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool; struct kmem_cache *cache = mempool->pool_data; - bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || - origin == NETFS_DIO_READ || - origin == NETFS_DIO_WRITE); - bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; for (;;) { @@ -56,12 +52,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, refcount_set(&rreq->ref, 1); __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (cached) { - __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) - /* Filesystem uses deprecated PG_private_2 marking. 
*/ - __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); - } if (file && file->f_flags & O_NONBLOCK) __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 9258d30cffe3..3f7e37e50c7d 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -94,6 +94,8 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, { struct netfs_io_request *wreq; struct netfs_inode *ictx; + bool is_buffered = (origin == NETFS_WRITEBACK || + origin == NETFS_WRITETHROUGH); wreq = netfs_alloc_request(mapping, file, start, 0, origin); if (IS_ERR(wreq)) @@ -102,7 +104,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + if (is_buffered && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); wreq->contiguity = wreq->start; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 7202ce84d0eb..bf29a65c5027 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -265,6 +265,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi { rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); return 0; } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index fbed0027996f..e8adae1bc260 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -81,8 +81,6 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. 
*/ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5d0288938cc2..983816608f15 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,8 +73,6 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ -#define NETFS_ICTX_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark - * write to cache on read */ }; /* @@ -269,7 +267,6 @@ struct netfs_io_request { #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ -#define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 24ec3434d32e..606b4a0f92da 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -51,6 +51,7 @@ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ EM(netfs_rreq_trace_set_pause, "PAUSE ") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \ + EM(netfs_rreq_trace_unlock_pgpriv2, "UNLCK-2") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \ From 4a4c013d3385b0db85dc361203dc42ff048b6fd6 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Sat, 10 Aug 2024 10:35:04 +0100 Subject: [PATCH 0765/1052] libbpf: Fix license for btf_relocate.c License should be // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) ...as with other libbpf files. Fixes: 19e00c897d50 ("libbpf: Split BTF relocation") Reported-by: Neill Kapron Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240810093504.2111134-1-alan.maguire@oracle.com --- tools/lib/bpf/btf_relocate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf_relocate.c b/tools/lib/bpf/btf_relocate.c index 17f8b32f94a0..4f7399d85eab 100644 --- a/tools/lib/bpf/btf_relocate.c +++ b/tools/lib/bpf/btf_relocate.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* Copyright (c) 2024, Oracle and/or its affiliates. */ #ifndef _GNU_SOURCE From 6b9935da2a6b2a72774c15c844ae201a3fc362ac Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Sat, 10 Aug 2024 13:27:00 +0900 Subject: [PATCH 0766/1052] scsi: mpi3mr: Add missing spin_lock_init() for mrioc->trigger_lock Commit fc4444941140 ("scsi: mpi3mr: HDB allocation and posting for hardware and firmware buffers") added the spinlock trigger_lock to the struct mpi3mr_ioc. However, spin_lock_init() call was not added for it, then the lock does not work as expected. Also, the kernel reports the message below when lockdep is enabled. INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? To fix the issue and to avoid the INFO message, add the missing spin_lock_init() call. 
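The INFO message above is lockdep's generic complaint about a lock that lives in dynamically allocated memory and is taken before spin_lock_init() has run; a minimal, driver-agnostic illustration of the pattern (hypothetical foo structure, not mpi3mr code):

        struct foo_ioc {
                spinlock_t trigger_lock;        /* kzalloc'd, so lockdep has no
                                                 * static key for it */
        };

        static void foo_init(struct foo_ioc *ioc)
        {
                /* Without this, the first spin_lock(&ioc->trigger_lock) reports
                 * "INFO: trying to register non-static key." under lockdep. */
                spin_lock_init(&ioc->trigger_lock);
        }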
Fixes: fc4444941140 ("scsi: mpi3mr: HDB allocation and posting for hardware and firmware buffers") Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20240810042701.661841-2-shinichiro.kawasaki@wdc.com Acked-by: Sathya Prakash Veerichetty Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_os.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index ca8f132e03ae..616894571c6a 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -5234,6 +5234,7 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id) spin_lock_init(&mrioc->watchdog_lock); spin_lock_init(&mrioc->chain_buf_lock); spin_lock_init(&mrioc->sas_node_lock); + spin_lock_init(&mrioc->trigger_lock); INIT_LIST_HEAD(&mrioc->fwevt_list); INIT_LIST_HEAD(&mrioc->tgtdev_list); From fdad456cbcca739bae1849549c7a999857c56f88 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 28 Jul 2024 19:46:11 +0800 Subject: [PATCH 0767/1052] bpf: Fix updating attached freplace prog in prog_array map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit f7866c358733 ("bpf: Fix null pointer dereference in resolve_prog_type() for BPF_PROG_TYPE_EXT") fixed a NULL pointer dereference panic, but didn't fix the issue that fails to update attached freplace prog to prog_array map. Since commit 1c123c567fb1 ("bpf: Resolve fext program type when checking map compatibility"), freplace prog and its target prog are able to tail call each other. And the commit 3aac1ead5eb6 ("bpf: Move prog->aux->linked_prog and trampoline into bpf_link on attach") sets prog->aux->dst_prog as NULL after attaching freplace prog to its target prog. After loading freplace the prog_array's owner type is BPF_PROG_TYPE_SCHED_CLS. Then, after attaching freplace its prog->aux->dst_prog is NULL. Then, while updating freplace in prog_array the bpf_prog_map_compatible() incorrectly returns false because resolve_prog_type() returns BPF_PROG_TYPE_EXT instead of BPF_PROG_TYPE_SCHED_CLS. After this patch the resolve_prog_type() returns BPF_PROG_TYPE_SCHED_CLS and update to prog_array can succeed. Fixes: f7866c358733 ("bpf: Fix null pointer dereference in resolve_prog_type() for BPF_PROG_TYPE_EXT") Cc: Toke Høiland-Jørgensen Cc: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240728114612.48486-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6503c85b10a3..7b776dae36e5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -856,8 +856,8 @@ static inline u32 type_flag(u32 type) /* only use after check_attach_btf_id() */ static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { - return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->dst_prog) ? - prog->aux->dst_prog->type : prog->type; + return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->saved_dst_prog_type) ? 
+ prog->aux->saved_dst_prog_type : prog->type; } static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) From 6c17ea1f3eaa330d445ac14a9428402ce4e3055e Mon Sep 17 00:00:00 2001 From: "Nysal Jan K.A" Date: Wed, 31 Jul 2024 08:31:12 +0530 Subject: [PATCH 0768/1052] cpu/SMT: Enable SMT only if a core is online If a core is offline then enabling SMT should not online CPUs of this core. By enabling SMT, what is intended is either changing the SMT value from "off" to "on" or setting the SMT level (threads per core) from a lower to higher value. On PowerPC the ppc64_cpu utility can be used, among other things, to perform the following functions: ppc64_cpu --cores-on # Get the number of online cores ppc64_cpu --cores-on=X # Put exactly X cores online ppc64_cpu --offline-cores=X[,Y,...] # Put specified cores offline ppc64_cpu --smt={on|off|value} # Enable, disable or change SMT level If the user has decided to offline certain cores, enabling SMT should not online CPUs in those cores. This patch fixes the issue and changes the behaviour as described, by introducing an arch specific function topology_is_core_online(). It is currently implemented only for PowerPC. Fixes: 73c58e7e1412 ("powerpc: Add HOTPLUG_SMT support") Reported-by: Tyrel Datwyler Closes: https://groups.google.com/g/powerpc-utils-devel/c/wrwVzAAnRlI/m/5KJSoqP4BAAJ Signed-off-by: Nysal Jan K.A Reviewed-by: Shrikanth Hegde Reviewed-by: Thomas Gleixner Signed-off-by: Michael Ellerman Link: https://msgid.link/20240731030126.956210-2-nysal@linux.ibm.com --- Documentation/ABI/testing/sysfs-devices-system-cpu | 3 ++- kernel/cpu.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 325873385b71..de725ca3be82 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -562,7 +562,8 @@ Description: Control Symmetric Multi Threading (SMT) ================ ========================================= If control status is "forceoff" or "notsupported" writes - are rejected. + are rejected. Note that enabling SMT on PowerPC skips + offline cores. What: /sys/devices/system/cpu/cpuX/power/energy_perf_bias Date: March 2019 diff --git a/kernel/cpu.c b/kernel/cpu.c index 1209ddaec026..b1fd2a3db91a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2689,6 +2689,16 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } +/** + * Check if the core a CPU belongs to is online + */ +#if !defined(topology_is_core_online) +static inline bool topology_is_core_online(unsigned int cpu) +{ + return true; +} +#endif + int cpuhp_smt_enable(void) { int cpu, ret = 0; @@ -2699,7 +2709,7 @@ int cpuhp_smt_enable(void) /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; - if (!cpu_smt_thread_allowed(cpu)) + if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu)) continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) From 227bbaabe64b6f9cd98aa051454c1d4a194a8c6a Mon Sep 17 00:00:00 2001 From: "Nysal Jan K.A" Date: Wed, 31 Jul 2024 08:31:13 +0530 Subject: [PATCH 0769/1052] powerpc/topology: Check if a core is online topology_is_core_online() checks if the core a CPU belongs to is online. The core is online if at least one of the sibling CPUs is online. The first CPU of an online core is also online in the common case, so this should be fairly quick. 
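A detail that spans this patch and the previous one and is easy to miss: the "#define topology_is_core_online topology_is_core_online" line is what tells the generic code that an architecture override exists, so the fallback in kernel/cpu.c is compiled only when no such define is present. A condensed sketch of how the two hunks fit together, abridged from the diffs in these two patches (not a standalone build):

    /* arch/powerpc/include/asm/topology.h: the override announces itself */
    #define topology_is_core_online topology_is_core_online
    static inline bool topology_is_core_online(unsigned int cpu)
    {
            int i, first_cpu = cpu_first_thread_sibling(cpu);

            for (i = first_cpu; i < first_cpu + threads_per_core; ++i)
                    if (cpu_online(i))
                            return true;    /* online if any sibling thread is */
            return false;
    }

    /* kernel/cpu.c: default used only when no architecture defined the macro */
    #if !defined(topology_is_core_online)
    static inline bool topology_is_core_online(unsigned int cpu)
    {
            return true;
    }
    #endif

    /* cpuhp_smt_enable() then skips threads whose core was taken offline */
    if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu))
            continue;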
Fixes: 73c58e7e1412 ("powerpc: Add HOTPLUG_SMT support") Signed-off-by: Nysal Jan K.A Reviewed-by: Shrikanth Hegde Signed-off-by: Michael Ellerman Link: https://msgid.link/20240731030126.956210-3-nysal@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index f4e6f2dd04b7..16bacfe8c7a2 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -145,6 +145,7 @@ static inline int cpu_to_coregroup_id(int cpu) #ifdef CONFIG_HOTPLUG_SMT #include +#include #include static inline bool topology_is_primary_thread(unsigned int cpu) @@ -156,6 +157,18 @@ static inline bool topology_smt_thread_allowed(unsigned int cpu) { return cpu_thread_in_core(cpu) < cpu_smt_num_threads; } + +#define topology_is_core_online topology_is_core_online +static inline bool topology_is_core_online(unsigned int cpu) +{ + int i, first_cpu = cpu_first_thread_sibling(cpu); + + for (i = first_cpu; i < first_cpu + threads_per_core; ++i) { + if (cpu_online(i)) + return true; + } + return false; +} #endif #endif /* __KERNEL__ */ From 8c6b808c8c2a9de21503944bd6308979410fd812 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Sat, 10 Aug 2024 13:27:01 +0900 Subject: [PATCH 0770/1052] scsi: mpi3mr: Avoid MAX_PAGE_ORDER WARNING for buffer allocations Commit fc4444941140 ("scsi: mpi3mr: HDB allocation and posting for hardware and firmware buffers") added mpi3mr_alloc_diag_bufs() which calls dma_alloc_coherent() to allocate the trace buffer and the firmware buffer. mpi3mr_alloc_diag_bufs() decides the buffer sizes from the driver configuration. In my environment, the sizes are 8MB. With the sizes, dma_alloc_coherent() fails and report this WARNING: WARNING: CPU: 4 PID: 438 at mm/page_alloc.c:4676 __alloc_pages_noprof+0x52f/0x640 The WARNING indicates that the order of the allocation size is larger than MAX_PAGE_ORDER. After this failure, mpi3mr_alloc_diag_bufs() reduces the buffer sizes and retries dma_alloc_coherent(). In the end, the buffer allocations succeed with 4MB size in my environment, which corresponds to MAX_PAGE_ORDER=10. Though the allocations succeed, the WARNING message is misleading and should be avoided. To avoid the WARNING, check the orders of the buffer allocation sizes before calling dma_alloc_coherent(). If the orders are larger than MAX_PAGE_ORDER, fall back to the retry path. Fixes: fc4444941140 ("scsi: mpi3mr: HDB allocation and posting for hardware and firmware buffers") Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20240810042701.661841-3-shinichiro.kawasaki@wdc.com Acked-by: Sathya Prakash Veerichetty Signed-off-by: Martin K. 
Petersen --- drivers/scsi/mpi3mr/mpi3mr_app.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.c b/drivers/scsi/mpi3mr/mpi3mr_app.c index 8b0eded6ef36..01f035f9330e 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_app.c +++ b/drivers/scsi/mpi3mr/mpi3mr_app.c @@ -100,7 +100,8 @@ void mpi3mr_alloc_diag_bufs(struct mpi3mr_ioc *mrioc) dprint_init(mrioc, "trying to allocate trace diag buffer of size = %dKB\n", trace_size / 1024); - if (mpi3mr_alloc_trace_buffer(mrioc, trace_size)) { + if (get_order(trace_size) > MAX_PAGE_ORDER || + mpi3mr_alloc_trace_buffer(mrioc, trace_size)) { retry = true; trace_size -= trace_dec_size; dprint_init(mrioc, "trace diag buffer allocation failed\n" @@ -118,8 +119,12 @@ void mpi3mr_alloc_diag_bufs(struct mpi3mr_ioc *mrioc) diag_buffer->type = MPI3_DIAG_BUFFER_TYPE_FW; diag_buffer->status = MPI3MR_HDB_BUFSTATUS_NOT_ALLOCATED; if ((mrioc->facts.diag_fw_sz < fw_size) && (fw_size >= fw_min_size)) { - diag_buffer->addr = dma_alloc_coherent(&mrioc->pdev->dev, - fw_size, &diag_buffer->dma_addr, GFP_KERNEL); + if (get_order(fw_size) <= MAX_PAGE_ORDER) { + diag_buffer->addr + = dma_alloc_coherent(&mrioc->pdev->dev, fw_size, + &diag_buffer->dma_addr, + GFP_KERNEL); + } if (!retry) dprint_init(mrioc, "%s:trying to allocate firmware diag buffer of size = %dKB\n", From bed2eb964c70b780fb55925892a74f26cb590b25 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Aug 2024 14:48:47 -0700 Subject: [PATCH 0771/1052] bpf: Fix a kernel verifier crash in stacksafe() Daniel Hodges reported a kernel verifier crash when playing with sched-ext. Further investigation shows that the crash is due to invalid memory access in stacksafe(). More specifically, it is the following code: if (exact != NOT_EXACT && old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) return false; The 'i' iterates old->allocated_stack. If cur->allocated_stack < old->allocated_stack the out-of-bound access will happen. To fix the issue add 'i >= cur->allocated_stack' check such that if the condition is true, stacksafe() should fail. Otherwise, cur->stack[spi].slot_type[i % BPF_REG_SIZE] memory access is legal. Fixes: 2793a8b015f7 ("bpf: exact states comparison for iterator convergence checks") Cc: Eduard Zingerman Reported-by: Daniel Hodges Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240812214847.213612-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cb5441ad75f..d8520095ca03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16884,8 +16884,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; if (exact != NOT_EXACT && - old->stack[spi].slot_type[i % BPF_REG_SIZE] != - cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + (i >= cur->allocated_stack || + old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE])) return false; if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) From 662c3e2db00f92e50c26e9dc4fe47c52223d9982 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Aug 2024 14:48:52 -0700 Subject: [PATCH 0772/1052] selftests/bpf: Add a test to verify previous stacksafe() fix A selftest is added such that without the previous patch, a crash can happen. 
With the previous patch, the test can run successfully. The new test is written in a way which mimics original crash case: main_prog static_prog_1 static_prog_2 where static_prog_1 has different paths to static_prog_2 and some path has stack allocated and some other path does not. A stacksafe() checking in static_prog_2() triggered the crash. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240812214852.214037-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/iters.c | 54 +++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 16bdc3e25591..ef70b88bccb2 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1432,4 +1432,58 @@ int iter_arr_with_actual_elem_count(const void *ctx) return sum; } +__u32 upper, select_n, result; +__u64 global; + +static __noinline bool nest_2(char *str) +{ + /* some insns (including branch insns) to ensure stacksafe() is triggered + * in nest_2(). This way, stacksafe() can compare frame associated with nest_1(). + */ + if (str[0] == 't') + return true; + if (str[1] == 'e') + return true; + if (str[2] == 's') + return true; + if (str[3] == 't') + return true; + return false; +} + +static __noinline bool nest_1(int n) +{ + /* case 0: allocate stack, case 1: no allocate stack */ + switch (n) { + case 0: { + char comm[16]; + + if (bpf_get_current_comm(comm, 16)) + return false; + return nest_2(comm); + } + case 1: + return nest_2((char *)&global); + default: + return false; + } +} + +SEC("raw_tp") +__success +int iter_subprog_check_stacksafe(const void *ctx) +{ + long i; + + bpf_for(i, 0, upper) { + if (!nest_1(select_n)) { + result = 1; + return 0; + } + } + + result = 2; + return 0; +} + char _license[] SEC("license") = "GPL"; From 6e1918ff680527ce4be77426aa537012b5aa997c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 7 Aug 2024 21:00:21 -0700 Subject: [PATCH 0773/1052] net: macb: Use rcu_dereference() for idev->ifa_list in macb_suspend(). In macb_suspend(), idev->ifa_list is fetched with rcu_access_pointer() and later the pointer is dereferenced as ifa->ifa_local. So, idev->ifa_list must be fetched with rcu_dereference(). 
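The distinction being applied here is a general RCU rule: rcu_access_pointer() is only allowed for looking at the pointer value itself (typically a NULL check) and gives no guarantee about the pointed-to object, while rcu_dereference() must be used as soon as the object will actually be read, and it must happen inside an RCU read-side critical section or an equivalently protected region. An illustrative sketch with made-up structures (not the macb code):

    #include <linux/rcupdate.h>

    struct cfg {
            int value;
    };

    struct obj {
            struct cfg __rcu *cfg;
    };

    static int read_cfg_value(struct obj *o)
    {
            struct cfg *c;
            int v = -1;

            rcu_read_lock();
            /* fine: only tests the pointer, never dereferences it */
            if (!rcu_access_pointer(o->cfg))
                    goto out;
            /* reading the object requires rcu_dereference() under protection */
            c = rcu_dereference(o->cfg);
            v = c->value;
    out:
            rcu_read_unlock();
            return v;
    }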
Fixes: 0cb8de39a776 ("net: macb: Add ARP support to WOL") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240808040021.6971-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 11665be3a22c..dcd3f54ed0cf 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -5250,8 +5250,8 @@ static int __maybe_unused macb_suspend(struct device *dev) if (bp->wol & MACB_WOL_ENABLED) { /* Check for IP address in WOL ARP mode */ idev = __in_dev_get_rcu(bp->dev); - if (idev && idev->ifa_list) - ifa = rcu_access_pointer(idev->ifa_list); + if (idev) + ifa = rcu_dereference(idev->ifa_list); if ((bp->wolopts & WAKE_ARP) && !ifa) { netdev_err(netdev, "IP address not assigned as required by WoL walk ARP\n"); return -EOPNOTSUPP; From 046667c4d3196938e992fba0dfcde570aa85cd0e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 21 Jul 2024 14:45:08 -0400 Subject: [PATCH 0774/1052] memcg_write_event_control(): fix a user-triggerable oops we are *not* guaranteed that anything past the terminating NUL is mapped (let alone initialized with anything sane). Fixes: 0dea116876ee ("cgroup: implement eventfd-based generic API for notifications") Cc: stable@vger.kernel.org Cc: Andrew Morton Acked-by: Michal Hocko Signed-off-by: Al Viro --- mm/memcontrol-v1.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 2aeea4d8bf8e..417c96f2da28 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1842,9 +1842,12 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, buf = endp + 1; cfd = simple_strtoul(buf, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) + if (*endp == '\0') + buf = endp; + else if (*endp == ' ') + buf = endp + 1; + else return -EINVAL; - buf = endp + 1; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) From 3beddef84d90590270465a907de1cfe2539ac70d Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Tue, 13 Aug 2024 12:37:48 +0800 Subject: [PATCH 0775/1052] ALSA: hda/tas2781: fix wrong calibrated data order Wrong calibration data order cause sound too low in some device. Fix wrong calibrated data order, add calibration data converssion by get_unaligned_be32() after reading from UEFI. Fixes: 5be27f1e3ec9 ("ALSA: hda/tas2781: Add tas2781 HDA driver") Cc: Signed-off-by: Baojun Xu Link: https://patch.msgid.link/20240813043749.108-1-shenghao-ding@ti.com Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_i2c.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 49bd7097d892..7dbfc92d9d55 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -2,10 +2,12 @@ // // TAS2781 HDA I2C driver // -// Copyright 2023 Texas Instruments, Inc. +// Copyright 2023 - 2024 Texas Instruments, Inc. 
// // Author: Shenghao Ding +// Current maintainer: Baojun Xu +#include #include #include #include @@ -519,20 +521,22 @@ static void tas2781_apply_calib(struct tasdevice_priv *tas_priv) static const unsigned char rgno_array[CALIB_MAX] = { 0x74, 0x0c, 0x14, 0x70, 0x7c, }; - unsigned char *data; + int offset = 0; int i, j, rc; + __be32 data; for (i = 0; i < tas_priv->ndev; i++) { - data = tas_priv->cali_data.data + - i * TASDEVICE_SPEAKER_CALIBRATION_SIZE; for (j = 0; j < CALIB_MAX; j++) { + data = get_unaligned_be32( + &tas_priv->cali_data.data[offset]); rc = tasdevice_dev_bulk_write(tas_priv, i, TASDEVICE_REG(0, page_array[j], rgno_array[j]), - &(data[4 * j]), 4); + (unsigned char *)&data, 4); if (rc < 0) dev_err(tas_priv->dev, "chn %d calib %d bulk_wr err = %d\n", i, j, rc); + offset += 4; } } } From d2dfed310aae0739dc87b68c660357e6a4f29819 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Tue, 6 Aug 2024 11:46:03 +1200 Subject: [PATCH 0776/1052] platform/x86: asus-wmi: Add quirk for ROG Ally X MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new ROG Ally X functions the same as the previus model so we can use the same method to ensure the MCU USB devices wake and reconnect correctly. Given that two devices marks the start of a trend, this patch also adds a quirk table to make future additions easier if the MCU is the same. Signed-off-by: Luke D. Jones Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240805234603.38736-1-luke@ljones.dev Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-wmi.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index cc735931f97b..37636e5a38e3 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -146,6 +146,20 @@ static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL }; static int throttle_thermal_policy_write(struct asus_wmi *); +static const struct dmi_system_id asus_ally_mcu_quirk[] = { + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC71L"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC72L"), + }, + }, + { }, +}; + static bool ashs_present(void) { int i = 0; @@ -4685,7 +4699,7 @@ static int asus_wmi_add(struct platform_device *pdev) asus->dgpu_disable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_DGPU); asus->kbd_rgb_state_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_STATE); asus->ally_mcu_usb_switch = acpi_has_method(NULL, ASUS_USB0_PWR_EC0_CSEE) - && dmi_match(DMI_BOARD_NAME, "RC71L"); + && dmi_check_system(asus_ally_mcu_quirk); if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MINI_LED_MODE)) asus->mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE; From 9c8e022567bbec53bee8ae75c44b3d6cd2080d42 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:44 +0200 Subject: [PATCH 0777/1052] platform/surface: aggregator_registry: Add Support for Surface Pro 10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SAM client device nodes for the Surface Pro 10. It seems to use the same SAM client devices as the Surface Pro 9, so re-use its node group. 
Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-2-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/surface/surface_aggregator_registry.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index 1c4d74db08c9..fa5b896e5f4e 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -324,7 +324,7 @@ static const struct software_node *ssam_node_group_sp8[] = { NULL, }; -/* Devices for Surface Pro 9 */ +/* Devices for Surface Pro 9 and 10 */ static const struct software_node *ssam_node_group_sp9[] = { &ssam_node_root, &ssam_node_hub_kip, @@ -365,6 +365,9 @@ static const struct acpi_device_id ssam_platform_hub_match[] = { /* Surface Pro 9 */ { "MSHW0343", (unsigned long)ssam_node_group_sp9 }, + /* Surface Pro 10 */ + { "MSHW0510", (unsigned long)ssam_node_group_sp9 }, + /* Surface Book 2 */ { "MSHW0107", (unsigned long)ssam_node_group_gen5 }, From ed235163c3f02329d5e37ed4485bbc39ed2568d4 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:45 +0200 Subject: [PATCH 0778/1052] platform/surface: aggregator_registry: Add support for Surface Laptop Go 3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SAM client device nodes for the Surface Laptop Go 3. It seems to use the same SAM client devices as the Surface Laptop Go 1 and 2, so re-use their node group. Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-3-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/surface/surface_aggregator_registry.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index fa5b896e5f4e..4d36810c2308 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -398,6 +398,9 @@ static const struct acpi_device_id ssam_platform_hub_match[] = { /* Surface Laptop Go 2 */ { "MSHW0290", (unsigned long)ssam_node_group_slg1 }, + /* Surface Laptop Go 3 */ + { "MSHW0440", (unsigned long)ssam_node_group_slg1 }, + /* Surface Laptop Studio */ { "MSHW0123", (unsigned long)ssam_node_group_sls }, From 28d04b4a2cc20981c95787f9c449e6fc51d904f9 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:46 +0200 Subject: [PATCH 0779/1052] platform/surface: aggregator_registry: Add support for Surface Laptop Studio 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SAM client device nodes for the Surface Laptop Studio 2 (SLS2). The SLS2 is quite similar to the SLS1, but it does not provide the touchpad as a SAM-HID device. Therefore, add a new node group for the SLS2 and update the comments accordingly. In addition, it uses the new fan control interface. 
Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-4-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../surface/surface_aggregator_registry.c | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index 4d36810c2308..892ba9549f6a 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -273,8 +273,8 @@ static const struct software_node *ssam_node_group_sl5[] = { NULL, }; -/* Devices for Surface Laptop Studio. */ -static const struct software_node *ssam_node_group_sls[] = { +/* Devices for Surface Laptop Studio 1. */ +static const struct software_node *ssam_node_group_sls1[] = { &ssam_node_root, &ssam_node_bat_ac, &ssam_node_bat_main, @@ -289,6 +289,22 @@ static const struct software_node *ssam_node_group_sls[] = { NULL, }; +/* Devices for Surface Laptop Studio 2. */ +static const struct software_node *ssam_node_group_sls2[] = { + &ssam_node_root, + &ssam_node_bat_ac, + &ssam_node_bat_main, + &ssam_node_tmp_perf_profile_with_fan, + &ssam_node_tmp_sensors, + &ssam_node_fan_speed, + &ssam_node_pos_tablet_switch, + &ssam_node_hid_sam_keyboard, + &ssam_node_hid_sam_penstash, + &ssam_node_hid_sam_sensors, + &ssam_node_hid_sam_ucm_ucsi, + NULL, +}; + /* Devices for Surface Laptop Go. */ static const struct software_node *ssam_node_group_slg1[] = { &ssam_node_root, @@ -401,8 +417,11 @@ static const struct acpi_device_id ssam_platform_hub_match[] = { /* Surface Laptop Go 3 */ { "MSHW0440", (unsigned long)ssam_node_group_slg1 }, - /* Surface Laptop Studio */ - { "MSHW0123", (unsigned long)ssam_node_group_sls }, + /* Surface Laptop Studio 1 */ + { "MSHW0123", (unsigned long)ssam_node_group_sls1 }, + + /* Surface Laptop Studio 2 */ + { "MSHW0360", (unsigned long)ssam_node_group_sls2 }, { }, }; From 002adda09bc1c983c75c82a7e12285c7423aec31 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:47 +0200 Subject: [PATCH 0780/1052] platform/surface: aggregator_registry: Add fan and thermal sensor support for Surface Laptop 5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The EC on the Surface Laptop 5 exposes the fan interface. With the recently introduced driver for it, we can now also enable it here. In addition, also enable the thermal sensor interface. 
Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-5-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/surface/surface_aggregator_registry.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index 892ba9549f6a..4d3f5b3111ba 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -265,7 +265,9 @@ static const struct software_node *ssam_node_group_sl5[] = { &ssam_node_root, &ssam_node_bat_ac, &ssam_node_bat_main, - &ssam_node_tmp_perf_profile, + &ssam_node_tmp_perf_profile_with_fan, + &ssam_node_tmp_sensors, + &ssam_node_fan_speed, &ssam_node_hid_main_keyboard, &ssam_node_hid_main_touchpad, &ssam_node_hid_main_iid5, From 99ae7b9ba047ad029a0a23b2bd51608ce79c8e97 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:48 +0200 Subject: [PATCH 0781/1052] platform/surface: aggregator_registry: Add support for Surface Laptop 6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SAM client device nodes for the Surface Laptop Studio 6 (SL6). The SL6 is similar to the SL5, with the typical battery/AC, platform profile, and HID nodes. It also has support for the newly supported fan interface. Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-6-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../surface/surface_aggregator_registry.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index 4d3f5b3111ba..a23dff35f8ca 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -275,6 +275,22 @@ static const struct software_node *ssam_node_group_sl5[] = { NULL, }; +/* Devices for Surface Laptop 6. */ +static const struct software_node *ssam_node_group_sl6[] = { + &ssam_node_root, + &ssam_node_bat_ac, + &ssam_node_bat_main, + &ssam_node_tmp_perf_profile_with_fan, + &ssam_node_tmp_sensors, + &ssam_node_fan_speed, + &ssam_node_hid_main_keyboard, + &ssam_node_hid_main_touchpad, + &ssam_node_hid_main_iid5, + &ssam_node_hid_sam_sensors, + &ssam_node_hid_sam_ucm_ucsi, + NULL, +}; + /* Devices for Surface Laptop Studio 1. */ static const struct software_node *ssam_node_group_sls1[] = { &ssam_node_root, @@ -410,6 +426,9 @@ static const struct acpi_device_id ssam_platform_hub_match[] = { /* Surface Laptop 5 */ { "MSHW0350", (unsigned long)ssam_node_group_sl5 }, + /* Surface Laptop 6 */ + { "MSHW0530", (unsigned long)ssam_node_group_sl6 }, + /* Surface Laptop Go 1 */ { "MSHW0118", (unsigned long)ssam_node_group_slg1 }, From ccbde4b128ef9c73d14d0d7817d68ef795f6d131 Mon Sep 17 00:00:00 2001 From: Eli Billauer Date: Thu, 1 Aug 2024 15:11:26 +0300 Subject: [PATCH 0782/1052] char: xillybus: Don't destroy workqueue from work item running on it Triggered by a kref decrement, destroy_workqueue() may be called from within a work item for destroying its own workqueue. This illegal situation is averted by adding a module-global workqueue for exclusive use of the offending work item. Other work items continue to be queued on per-device workqueues to ensure performance. 
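The underlying rule, sketched generically with made-up names and deliberately simpler than the xillyusb code: destroy_workqueue() drains the queue it is handed, so it must never be called from a work item executing on that same queue; the fix is to run the work that can trigger the teardown on a separate, longer-lived workqueue.

    #include <linux/container_of.h>
    #include <linux/kref.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *teardown_wq;    /* allocated in module init */

    struct my_dev {
            struct kref kref;
            struct workqueue_struct *workq;         /* per-device I/O workqueue */
            struct work_struct teardown_work;
    };

    static void teardown_fn(struct work_struct *work)
    {
            struct my_dev *dev = container_of(work, struct my_dev, teardown_work);

            /*
             * Legal only because this runs on teardown_wq: destroying the
             * queue a work item is currently running on is the situation
             * the patch above removes.
             */
            destroy_workqueue(dev->workq);
            kfree(dev);
    }

    static void my_dev_release(struct kref *kref)   /* kref_put() callback */
    {
            struct my_dev *dev = container_of(kref, struct my_dev, kref);

            /* may be reached from a work item on dev->workq, so defer */
            queue_work(teardown_wq, &dev->teardown_work);
    }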
Reported-by: syzbot+91dbdfecdd3287734d8e@syzkaller.appspotmail.com Cc: stable Closes: https://lore.kernel.org/lkml/0000000000000ab25a061e1dfe9f@google.com/ Signed-off-by: Eli Billauer Link: https://lore.kernel.org/r/20240801121126.60183-1-eli.billauer@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/xillybus/xillyusb.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/char/xillybus/xillyusb.c b/drivers/char/xillybus/xillyusb.c index 5a5afa14ca8c..33ca0f4af390 100644 --- a/drivers/char/xillybus/xillyusb.c +++ b/drivers/char/xillybus/xillyusb.c @@ -50,6 +50,7 @@ MODULE_LICENSE("GPL v2"); static const char xillyname[] = "xillyusb"; static unsigned int fifo_buf_order; +static struct workqueue_struct *wakeup_wq; #define USB_VENDOR_ID_XILINX 0x03fd #define USB_VENDOR_ID_ALTERA 0x09fb @@ -569,10 +570,6 @@ static void cleanup_dev(struct kref *kref) * errors if executed. The mechanism relies on that xdev->error is assigned * a non-zero value by report_io_error() prior to queueing wakeup_all(), * which prevents bulk_in_work() from calling process_bulk_in(). - * - * The fact that wakeup_all() and bulk_in_work() are queued on the same - * workqueue makes their concurrent execution very unlikely, however the - * kernel's API doesn't seem to ensure this strictly. */ static void wakeup_all(struct work_struct *work) @@ -627,7 +624,7 @@ static void report_io_error(struct xillyusb_dev *xdev, if (do_once) { kref_get(&xdev->kref); /* xdev is used by work item */ - queue_work(xdev->workq, &xdev->wakeup_workitem); + queue_work(wakeup_wq, &xdev->wakeup_workitem); } } @@ -2258,6 +2255,10 @@ static int __init xillyusb_init(void) { int rc = 0; + wakeup_wq = alloc_workqueue(xillyname, 0, 0); + if (!wakeup_wq) + return -ENOMEM; + if (LOG2_INITIAL_FIFO_BUF_SIZE > PAGE_SHIFT) fifo_buf_order = LOG2_INITIAL_FIFO_BUF_SIZE - PAGE_SHIFT; else @@ -2265,11 +2266,16 @@ static int __init xillyusb_init(void) rc = usb_register(&xillyusb_driver); + if (rc) + destroy_workqueue(wakeup_wq); + return rc; } static void __exit xillyusb_exit(void) { + destroy_workqueue(wakeup_wq); + usb_deregister(&xillyusb_driver); } From bc923d594db21bee0ead128eb4bb78f7e77467a4 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 14:46:44 +0200 Subject: [PATCH 0783/1052] platform/surface: aggregator: Fix warning when controller is destroyed in probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a small window in ssam_serial_hub_probe() where the controller is initialized but has not been started yet. Specifically, between ssam_controller_init() and ssam_controller_start(). Any failure in this window, for example caused by a failure of serdev_device_open(), currently results in an incorrect warning being emitted. In particular, any failure in this window results in the controller being destroyed via ssam_controller_destroy(). This function checks the state of the controller and, in an attempt to validate that the controller has been cleanly shut down before we try and deallocate any resources, emits a warning if that state is not SSAM_CONTROLLER_STOPPED. However, since we have only just initialized the controller and have not yet started it, its state is SSAM_CONTROLLER_INITIALIZED. Note that this is the only point at which the controller has this state, as it will change after we start the controller with ssam_controller_start() and never revert back. 
Further, at this point no communication has taken place and the sender and receiver threads have not been started yet (and we may not even have an open serdev device either). Therefore, it is perfectly safe to call ssam_controller_destroy() with a state of SSAM_CONTROLLER_INITIALIZED. This, however, means that the warning currently being emitted is incorrect. Fix it by extending the check. Fixes: c167b9c7e3d6 ("platform/surface: Add Surface Aggregator subsystem") Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811124645.246016-1-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/surface/aggregator/controller.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/surface/aggregator/controller.c b/drivers/platform/surface/aggregator/controller.c index 7fc602e01487..7e89f547999b 100644 --- a/drivers/platform/surface/aggregator/controller.c +++ b/drivers/platform/surface/aggregator/controller.c @@ -1354,7 +1354,8 @@ void ssam_controller_destroy(struct ssam_controller *ctrl) if (ctrl->state == SSAM_CONTROLLER_UNINITIALIZED) return; - WARN_ON(ctrl->state != SSAM_CONTROLLER_STOPPED); + WARN_ON(ctrl->state != SSAM_CONTROLLER_STOPPED && + ctrl->state != SSAM_CONTROLLER_INITIALIZED); /* * Note: New events could still have been received after the previous From dcdb52d948f3a17ccd3fce757d9bd981d7c32039 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Aug 2024 15:44:07 +0300 Subject: [PATCH 0784/1052] usb: xhci: Check for xhci->interrupters being allocated in xhci_mem_clearup() If xhci_mem_init() fails, it calls into xhci_mem_cleanup() to mop up the damage. If it fails early enough, before xhci->interrupters is allocated but after xhci->max_interrupters has been set, which happens in most (all?) cases, things get uglier, as xhci_mem_cleanup() unconditionally derefences xhci->interrupters. With prejudice. Gate the interrupt freeing loop with a check on xhci->interrupters being non-NULL. Found while debugging a DMA allocation issue that led the XHCI driver on this exact path. Fixes: c99b38c41234 ("xhci: add support to allocate several interrupters") Cc: Mathias Nyman Cc: Wesley Cheng Cc: Greg Kroah-Hartman Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org # 6.8+ Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240809124408.505786-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index d7654f475daf..937ce5fd5809 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1872,7 +1872,7 @@ void xhci_mem_cleanup(struct xhci_hcd *xhci) cancel_delayed_work_sync(&xhci->cmd_timer); - for (i = 0; i < xhci->max_interrupters; i++) { + for (i = 0; xhci->interrupters && i < xhci->max_interrupters; i++) { if (xhci->interrupters[i]) { xhci_remove_interrupter(xhci, xhci->interrupters[i]); xhci_free_interrupter(xhci, xhci->interrupters[i]); From 741b41b48faf41c0bcf3c26fbb3448b0fda4fc5d Mon Sep 17 00:00:00 2001 From: Niklas Neronin Date: Fri, 9 Aug 2024 15:44:08 +0300 Subject: [PATCH 0785/1052] usb: xhci: fix duplicate stall handling in handle_tx_event() Stall handling is managed in the 'process_*' functions, which are called right before the 'goto' stall handling code snippet. Thus, there should be a return after the 'process_*' functions. Otherwise, the stall code may run twice. 
Fixes: 1b349f214ac7 ("usb: xhci: add 'goto' for halted endpoint check in handle_tx_event()") Reported-by: Michal Pecio Signed-off-by: Niklas Neronin Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240809124408.505786-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index b7517c3c8059..4ea2c3e072a9 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2910,6 +2910,7 @@ static int handle_tx_event(struct xhci_hcd *xhci, process_isoc_td(xhci, ep, ep_ring, td, ep_trb, event); else process_bulk_intr_td(xhci, ep, ep_ring, td, ep_trb, event); + return 0; check_endpoint_halted: if (xhci_halted_host_endpoint(ep_ctx, trb_comp_code)) From d209d1634e6562eafc369b28f8a1f67a2e9e5222 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Fri, 9 Aug 2024 18:03:43 +0300 Subject: [PATCH 0786/1052] usb: typec: ucsi: Fix the return value of ucsi_run_command() The command execution routines need to return the amount of data that was transferred when succesful. This fixes an issue where the alternate modes and the power delivery capabilities are not getting registered. Fixes: 5e9c1662a89b ("usb: typec: ucsi: rework command execution functions") Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240809150343.286942-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 432a2d6266d7..4039851551c1 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -137,7 +137,7 @@ static int ucsi_run_command(struct ucsi *ucsi, u64 command, u32 *cci, if (ret) return ret; - return err; + return err ?: UCSI_CCI_LENGTH(*cci); } static int ucsi_read_error(struct ucsi *ucsi, u8 connector_num) From 21ea1ce37fc267dc45fe27517bbde926211683df Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 9 Aug 2024 19:29:01 +0800 Subject: [PATCH 0787/1052] Revert "usb: typec: tcpm: clear pd_event queue in PORT_RESET" This reverts commit bf20c69cf3cf9c6445c4925dd9a8a6ca1b78bfdf. During tcpm_init() stage, if the VBUS is still present after tcpm_reset_port(), then we assume that VBUS will off and goto safe0v after a specific discharge time. Following a TCPM_VBUS_EVENT event if VBUS reach to off state. TCPM_VBUS_EVENT event may be set during PORT_RESET handling stage. If pd_events reset to 0 after TCPM_VBUS_EVENT set, we will lost this VBUS event. Then the port state machine may stuck at one state. Before: [ 2.570172] pending state change PORT_RESET -> PORT_RESET_WAIT_OFF @ 100 ms [rev1 NONE_AMS] [ 2.570179] state change PORT_RESET -> PORT_RESET_WAIT_OFF [delayed 100 ms] [ 2.570182] pending state change PORT_RESET_WAIT_OFF -> SNK_UNATTACHED @ 920 ms [rev1 NONE_AMS] [ 3.490213] state change PORT_RESET_WAIT_OFF -> SNK_UNATTACHED [delayed 920 ms] [ 3.490220] Start toggling [ 3.546050] CC1: 0 -> 0, CC2: 0 -> 2 [state TOGGLING, polarity 0, connected] [ 3.546057] state change TOGGLING -> SRC_ATTACH_WAIT [rev1 NONE_AMS] After revert this patch, we can see VBUS off event and the port will goto expected state. 
[ 2.441992] pending state change PORT_RESET -> PORT_RESET_WAIT_OFF @ 100 ms [rev1 NONE_AMS] [ 2.441999] state change PORT_RESET -> PORT_RESET_WAIT_OFF [delayed 100 ms] [ 2.442002] pending state change PORT_RESET_WAIT_OFF -> SNK_UNATTACHED @ 920 ms [rev1 NONE_AMS] [ 2.442122] VBUS off [ 2.442125] state change PORT_RESET_WAIT_OFF -> SNK_UNATTACHED [rev1 NONE_AMS] [ 2.442127] VBUS VSAFE0V [ 2.442351] CC1: 0 -> 0, CC2: 0 -> 0 [state SNK_UNATTACHED, polarity 0, disconnected] [ 2.442357] Start toggling [ 2.491850] CC1: 0 -> 0, CC2: 0 -> 2 [state TOGGLING, polarity 0, connected] [ 2.491858] state change TOGGLING -> SRC_ATTACH_WAIT [rev1 NONE_AMS] [ 2.491863] pending state change SRC_ATTACH_WAIT -> SNK_TRY @ 200 ms [rev1 NONE_AMS] [ 2.691905] state change SRC_ATTACH_WAIT -> SNK_TRY [delayed 200 ms] Fixes: bf20c69cf3cf ("usb: typec: tcpm: clear pd_event queue in PORT_RESET") Cc: stable@vger.kernel.org Signed-off-by: Xu Yang Acked-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240809112901.535072-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index cce39818e99a..4b02d6474259 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5655,7 +5655,6 @@ static void run_state_machine(struct tcpm_port *port) break; case PORT_RESET: tcpm_reset_port(port); - port->pd_events = 0; if (port->self_powered) tcpm_set_cc(port, TYPEC_CC_OPEN); else From 3ed486e383ccee9b0c8d727608f12a937c6603ca Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 12 Aug 2024 11:50:38 +0200 Subject: [PATCH 0788/1052] usb: misc: ljca: Add Lunar Lake ljca GPIO HID to ljca_gpio_hids[] Add LJCA GPIO support for the Lunar Lake platform. New HID taken from out of tree ivsc-driver git repo. Link: https://github.com/intel/ivsc-driver/commit/47e7c4a446c8ea8c741ff5a32fa7b19f9e6fd47e Cc: stable Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240812095038.555837-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/usb-ljca.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/misc/usb-ljca.c b/drivers/usb/misc/usb-ljca.c index 2d30fc1be306..1a8d5e80b9ae 100644 --- a/drivers/usb/misc/usb-ljca.c +++ b/drivers/usb/misc/usb-ljca.c @@ -169,6 +169,7 @@ static const struct acpi_device_id ljca_gpio_hids[] = { { "INTC1096" }, { "INTC100B" }, { "INTC10D1" }, + { "INTC10B5" }, {}, }; From 92567a5f92bc947fb7aa4351979db1b7b71a554c Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 8 Aug 2024 22:06:19 +0800 Subject: [PATCH 0789/1052] iommu: Remove unused declaration iommu_sva_unbind_gpasid() Commit 0c9f17877891 ("iommu: Remove guest pasid related interfaces and definitions") removed the implementation but leave declaration. 
Signed-off-by: Yue Haibing Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240808140619.2498535-1-yuehaibing@huawei.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4d47f2c33311..04cbdae0052e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -795,8 +795,6 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); -extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, From dc98d76a15bc29a9a4e76f2f65f39f3e590fb15c Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Thu, 8 Aug 2024 22:03:25 +0800 Subject: [PATCH 0790/1052] tty: serial: fsl_lpuart: mark last busy before uart_add_one_port With "earlycon initcall_debug=1 loglevel=8" in bootargs, kernel sometimes boot hang. It is because normal console still is not ready, but runtime suspend is called, so early console putchar will hang in waiting TRDE set in UARTSTAT. The lpuart driver has auto suspend delay set to 3000ms, but during uart_add_one_port, a child device serial ctrl will added and probed with its pm runtime enabled(see serial_ctrl.c). The runtime suspend call path is: device_add |-> bus_probe_device |->device_initial_probe |->__device_attach |-> pm_runtime_get_sync(dev->parent); |-> pm_request_idle(dev); |-> pm_runtime_put(dev->parent); So in the end, before normal console ready, the lpuart get runtime suspended. And earlycon putchar will hang. To address the issue, mark last busy just after pm_runtime_enable, three seconds is long enough to switch from bootconsole to normal console. Fixes: 43543e6f539b ("tty: serial: fsl_lpuart: Add runtime pm support") Cc: stable Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20240808140325.580105-1-peng.fan@oss.nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/fsl_lpuart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 615291ea9b5e..77efa7ee6eda 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -2923,6 +2923,7 @@ static int lpuart_probe(struct platform_device *pdev) pm_runtime_set_autosuspend_delay(&pdev->dev, UART_AUTOSUSPEND_TIMEOUT); pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); + pm_runtime_mark_last_busy(&pdev->dev); ret = lpuart_global_reset(sport); if (ret) From 7258fdd7d7459616b3fe1a603e33900584b10c13 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 10 Aug 2024 01:07:20 +0900 Subject: [PATCH 0791/1052] tty: vt: conmakehash: remove non-portable code printing comment header Commit 6e20753da6bc ("tty: vt: conmakehash: cope with abs_srctree no longer in env") included , which invoked another (wrong) patch that tried to address a build error on macOS. According to the specification [1], the correct header to use PATH_MAX is . The minimal fix would be to replace with . 
However, the following commits seem questionable to me: - 3bd85c6c97b2 ("tty: vt: conmakehash: Don't mention the full path of the input in output") - 6e20753da6bc ("tty: vt: conmakehash: cope with abs_srctree no longer in env") These commits made too many efforts to cope with a comment header in drivers/tty/vt/consolemap_deftbl.c: /* * Do not edit this file; it was automatically generated by * * conmakehash drivers/tty/vt/cp437.uni > [this file] * */ With this commit, the header part of the generate C file will be simplified as follows: /* * Automatically generated file; Do not edit. */ BTW, another series of excessive efforts for a comment header can be seen in the following: - 5ef6dc08cfde ("lib/build_OID_registry: don't mention the full path of the script in output") - 2fe29fe94563 ("lib/build_OID_registry: avoid non-destructive substitution for Perl < 5.13.2 compat") [1]: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/limits.h.html Fixes: 6e20753da6bc ("tty: vt: conmakehash: cope with abs_srctree no longer in env") Cc: stable Reported-by: Daniel Gomez Closes: https://lore.kernel.org/all/20240807-macos-build-support-v1-11-4cd1ded85694@samsung.com/ Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240809160853.1269466-1-masahiroy@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/conmakehash.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/tty/vt/conmakehash.c b/drivers/tty/vt/conmakehash.c index 82d9db68b2ce..a931fcde7ad9 100644 --- a/drivers/tty/vt/conmakehash.c +++ b/drivers/tty/vt/conmakehash.c @@ -11,8 +11,6 @@ * Copyright (C) 1995-1997 H. Peter Anvin */ -#include -#include #include #include #include @@ -79,7 +77,6 @@ int main(int argc, char *argv[]) { FILE *ctbl; const char *tblname; - char base_tblname[PATH_MAX]; char buffer[65536]; int fontlen; int i, nuni, nent; @@ -245,20 +242,15 @@ int main(int argc, char *argv[]) for ( i = 0 ; i < fontlen ; i++ ) nuni += unicount[i]; - strncpy(base_tblname, tblname, PATH_MAX); - base_tblname[PATH_MAX - 1] = 0; printf("\ /*\n\ - * Do not edit this file; it was automatically generated by\n\ - *\n\ - * conmakehash %s > [this file]\n\ - *\n\ + * Automatically generated file; Do not edit.\n\ */\n\ \n\ #include \n\ \n\ u8 dfont_unicount[%d] = \n\ -{\n\t", basename(base_tblname), fontlen); +{\n\t", fontlen); for ( i = 0 ; i < fontlen ; i++ ) { From c9f6613b16123989f2c3bd04b1d9b2365d6914e7 Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Thu, 8 Aug 2024 08:06:37 +0200 Subject: [PATCH 0792/1052] tty: atmel_serial: use the correct RTS flag. In RS485 mode, the RTS pin is driven high by hardware when the transmitter is operating. This behaviour cannot be changed. This means that the driver should claim that it supports SER_RS485_RTS_ON_SEND and not SER_RS485_RTS_AFTER_SEND. Otherwise, when configuring the port with the SER_RS485_RTS_ON_SEND, one get the following warning: kern.warning kernel: atmel_usart_serial atmel_usart_serial.2.auto: ttyS1 (1): invalid RTS setting, using RTS_AFTER_SEND instead which is contradictory with what's really happening. 
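For context, this is what the advertised flag set means to user space: a program configuring the port for RS485 through the Linux TIOCSRS485 ioctl asks for SER_RS485_RTS_ON_SEND, which now matches what the hardware really does instead of being rewritten to RTS_AFTER_SEND with the warning above. A small illustrative user-space sketch (device path and delay values are arbitrary examples):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/serial.h>

    int main(void)
    {
            struct serial_rs485 rs485;
            int fd = open("/dev/ttyS1", O_RDWR | O_NOCTTY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            memset(&rs485, 0, sizeof(rs485));
            rs485.flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND |
                          SER_RS485_RX_DURING_TX;
            rs485.delay_rts_before_send = 1;
            rs485.delay_rts_after_send = 1;

            if (ioctl(fd, TIOCSRS485, &rs485) < 0)
                    perror("TIOCSRS485");

            close(fd);
            return 0;
    }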
Signed-off-by: Mathieu Othacehe Cc: stable Tested-by: Alexander Dahl Fixes: af47c491e3c7 ("serial: atmel: Fill in rs485_supported") Link: https://lore.kernel.org/r/20240808060637.19886-1-othacehe@gnu.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/atmel_serial.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index 0a90964d6d10..09b246c9e389 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -2514,7 +2514,7 @@ static const struct uart_ops atmel_pops = { }; static const struct serial_rs485 atmel_rs485_supported = { - .flags = SER_RS485_ENABLED | SER_RS485_RTS_AFTER_SEND | SER_RS485_RX_DURING_TX, + .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RX_DURING_TX, .delay_rts_before_send = 1, .delay_rts_after_send = 1, }; From abfceba0a7a246ac082bf569807738ff7416f59f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 5 Aug 2024 16:20:20 -0700 Subject: [PATCH 0793/1052] ARM: riscpc: ecard: Fix the build Fix a recently introduced build failure. Cc: Russell King Fixes: d69d80484598 ("driver core: have match() callback in struct bus_type take a const *") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240805232026.65087-2-bvanassche@acm.org Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-rpc/ecard.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c index c30df1097c52..9f7454b8efa7 100644 --- a/arch/arm/mach-rpc/ecard.c +++ b/arch/arm/mach-rpc/ecard.c @@ -1109,7 +1109,7 @@ void ecard_remove_driver(struct ecard_driver *drv) driver_unregister(&drv->drv); } -static int ecard_match(struct device *_dev, struct device_driver *_drv) +static int ecard_match(struct device *_dev, const struct device_driver *_drv) { struct expansion_card *ec = ECARD_DEV(_dev); struct ecard_driver *drv = ECARD_DRV(_drv); From cdd1fa91a6b8c7cd93b3abf9f3ef05b8ce741b61 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 5 Aug 2024 16:20:21 -0700 Subject: [PATCH 0794/1052] mips: sgi-ip22: Fix the build Fix a recently introduced build failure. Fixes: d69d80484598 ("driver core: have match() callback in struct bus_type take a const *") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240805232026.65087-3-bvanassche@acm.org Signed-off-by: Greg Kroah-Hartman --- arch/mips/sgi-ip22/ip22-gio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/sgi-ip22/ip22-gio.c b/arch/mips/sgi-ip22/ip22-gio.c index 2738325e98dd..d20eec742bfa 100644 --- a/arch/mips/sgi-ip22/ip22-gio.c +++ b/arch/mips/sgi-ip22/ip22-gio.c @@ -111,7 +111,7 @@ void gio_device_unregister(struct gio_device *giodev) } EXPORT_SYMBOL_GPL(gio_device_unregister); -static int gio_bus_match(struct device *dev, struct device_driver *drv) +static int gio_bus_match(struct device *dev, const struct device_driver *drv) { struct gio_device *gio_dev = to_gio_device(dev); struct gio_driver *gio_drv = to_gio_driver(drv); From 479ffee68d59c599f8aed8fa2dcc8e13e7bd13c3 Mon Sep 17 00:00:00 2001 From: Bert Karwatzki Date: Mon, 12 Aug 2024 12:45:41 +0200 Subject: [PATCH 0795/1052] wifi: mt76: mt7921: fix NULL pointer access in mt7921_ipv6_addr_change When disabling wifi mt7921_ipv6_addr_change() is called as a notifier. At this point mvif->phy is already NULL so we cannot use it here. 
Signed-off-by: Bert Karwatzki Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240812104542.80760-1-spasswolf@web.de --- drivers/net/wireless/mediatek/mt76/mt7921/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 1bab93d049df..23b228804289 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw *hw, struct inet6_dev *idev) { struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; - struct mt792x_dev *dev = mvif->phy->dev; + struct mt792x_dev *dev = mt792x_hw_dev(hw); struct inet6_ifaddr *ifa; struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; struct sk_buff *skb; From 38c8d02501c09454e4fbf0f67de03de35e94d384 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 12 Aug 2024 13:06:40 +0200 Subject: [PATCH 0796/1052] wifi: iwlwifi: correctly lookup DMA address in SG table The code to lookup the scatter gather table entry assumed that it was possible to use sg_virt() in order to lookup the DMA address in a mapped scatter gather table. However, this assumption is incorrect as the DMA mapping code may merge multiple entries into one. In that case, the DMA address space may have e.g. two consecutive pages which is correctly represented by the scatter gather list entry, however the virtual addresses for these two pages may differ and the relationship cannot be resolved anymore. Avoid this problem entirely by working with the offset into the mapped area instead of using virtual addresses. With that we only use the DMA length and DMA address from the scatter gather list entries. The underlying DMA/IOMMU code is therefore free to merge two entries into one even if the virtual addresses space for the area is not continuous. 
Fixes: 90db50755228 ("wifi: iwlwifi: use already mapped data when TXing an AMSDU") Reported-by: Chris Bainbridge Closes: https://lore.kernel.org/r/ZrNRoEbdkxkKFMBi@debian.local Signed-off-by: Benjamin Berg Tested-by: Chris Bainbridge Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240812110640.460514-1-benjamin@sipsolutions.net --- .../wireless/intel/iwlwifi/pcie/internal.h | 3 +- .../net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 5 ++- drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 32 +++++++++++++------ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index b59de4f80b4b..27a7e0b5b3d5 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -639,7 +639,8 @@ void iwl_trans_pcie_tx_reset(struct iwl_trans *trans); int iwl_pcie_txq_alloc(struct iwl_trans *trans, struct iwl_txq *txq, int slots_num, bool cmd_queue); -dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, void *addr); +dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, unsigned int offset, + unsigned int len); struct sg_table *iwl_pcie_prep_tso(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_cmd_meta *cmd_meta, u8 **hdr, unsigned int hdr_room); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 2e780fb2da42..b1846abb99b7 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -168,6 +168,7 @@ static int iwl_txq_gen2_build_amsdu(struct iwl_trans *trans, struct ieee80211_hdr *hdr = (void *)skb->data; unsigned int snap_ip_tcp_hdrlen, ip_hdrlen, total_len, hdr_room; unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int data_offset = 0; dma_addr_t start_hdr_phys; u16 length, amsdu_pad; u8 *start_hdr; @@ -260,7 +261,8 @@ static int iwl_txq_gen2_build_amsdu(struct iwl_trans *trans, int ret; tb_len = min_t(unsigned int, tso.size, data_left); - tb_phys = iwl_pcie_get_sgt_tb_phys(sgt, tso.data); + tb_phys = iwl_pcie_get_sgt_tb_phys(sgt, data_offset, + tb_len); /* Not a real mapping error, use direct comparison */ if (unlikely(tb_phys == DMA_MAPPING_ERROR)) goto out_err; @@ -272,6 +274,7 @@ static int iwl_txq_gen2_build_amsdu(struct iwl_trans *trans, goto out_err; data_left -= tb_len; + data_offset += tb_len; tso_build_data(skb, &tso, tb_len); } } diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index 22d482ae53d9..9fe050f0ddc1 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -1814,23 +1814,31 @@ static void *iwl_pcie_get_page_hdr(struct iwl_trans *trans, /** * iwl_pcie_get_sgt_tb_phys - Find TB address in mapped SG list * @sgt: scatter gather table - * @addr: Virtual address + * @offset: Offset into the mapped memory (i.e. SKB payload data) + * @len: Length of the area * - * Find the entry that includes the address for the given address and return - * correct physical address for the TB entry. + * Find the DMA address that corresponds to the SKB payload data at the + * position given by @offset. 
* * Returns: Address for TB entry */ -dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, void *addr) +dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, unsigned int offset, + unsigned int len) { struct scatterlist *sg; + unsigned int sg_offset = 0; int i; + /* + * Search the mapped DMA areas in the SG for the area that contains the + * data at offset with the given length. + */ for_each_sgtable_dma_sg(sgt, sg, i) { - if (addr >= sg_virt(sg) && - (u8 *)addr < (u8 *)sg_virt(sg) + sg_dma_len(sg)) - return sg_dma_address(sg) + - ((unsigned long)addr - (unsigned long)sg_virt(sg)); + if (offset >= sg_offset && + offset + len <= sg_offset + sg_dma_len(sg)) + return sg_dma_address(sg) + offset - sg_offset; + + sg_offset += sg_dma_len(sg); } WARN_ON_ONCE(1); @@ -1875,7 +1883,9 @@ struct sg_table *iwl_pcie_prep_tso(struct iwl_trans *trans, struct sk_buff *skb, sg_init_table(sgt->sgl, skb_shinfo(skb)->nr_frags + 1); - sgt->orig_nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len); + /* Only map the data, not the header (it is copied to the TSO page) */ + sgt->orig_nents = skb_to_sgvec(skb, sgt->sgl, skb_headlen(skb), + skb->data_len); if (WARN_ON_ONCE(sgt->orig_nents <= 0)) return NULL; @@ -1900,6 +1910,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, struct ieee80211_hdr *hdr = (void *)skb->data; unsigned int snap_ip_tcp_hdrlen, ip_hdrlen, total_len, hdr_room; unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int data_offset = 0; u16 length, iv_len, amsdu_pad; dma_addr_t start_hdr_phys; u8 *start_hdr, *pos_hdr; @@ -2000,7 +2011,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, data_left); dma_addr_t tb_phys; - tb_phys = iwl_pcie_get_sgt_tb_phys(sgt, tso.data); + tb_phys = iwl_pcie_get_sgt_tb_phys(sgt, data_offset, size); /* Not a real mapping error, use direct comparison */ if (unlikely(tb_phys == DMA_MAPPING_ERROR)) return -EINVAL; @@ -2011,6 +2022,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, tb_phys, size); data_left -= size; + data_offset += size; tso_build_data(skb, &tso, size); } } From 92b6c2f0076c50aaa919d16b595f34f3e9967bea Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jun 2024 14:50:38 +0300 Subject: [PATCH 0797/1052] KVM: SVM: Fix uninitialized variable bug If snp_lookup_rmpentry() fails then "assigned" is printed in the error message but it was never initialized. Initialize it to false. Fixes: dee5a47cc7a4 ("KVM: SEV: Add KVM_SEV_SNP_LAUNCH_UPDATE command") Signed-off-by: Dan Carpenter Message-ID: <20240612115040.2423290-3-dan.carpenter@linaro.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 532df12b43c5..393f450adbc3 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2276,7 +2276,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { struct sev_data_snp_launch_update fw_args = {0}; - bool assigned; + bool assigned = false; int level; ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); From cd2d00606553e631e9b5d11cca7da38fc95433e6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jun 2024 14:50:39 +0300 Subject: [PATCH 0798/1052] KVM: SVM: Fix an error code in sev_gmem_post_populate() The copy_from_user() function returns the number of bytes which it was not able to copy. 
Return -EFAULT instead. Fixes: dee5a47cc7a4 ("KVM: SEV: Add KVM_SEV_SNP_LAUNCH_UPDATE command") Signed-off-by: Dan Carpenter Message-ID: <20240612115040.2423290-4-dan.carpenter@linaro.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 393f450adbc3..714c517dd4b7 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2290,9 +2290,10 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf if (src) { void *vaddr = kmap_local_pfn(pfn + i); - ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE); - if (ret) + if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { + ret = -EFAULT; goto err; + } kunmap_local(vaddr); } From 58a63729c957621f1990c3494c702711188ca347 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 9 Aug 2024 08:58:58 -0700 Subject: [PATCH 0799/1052] net: mana: Fix doorbell out of order violation and avoid unnecessary doorbell rings After napi_complete_done() is called when NAPI is polling in the current process context, another NAPI may be scheduled and start running in softirq on another CPU and may ring the doorbell before the current CPU does. When combined with unnecessary rings when there is no need to arm the CQ, it triggers error paths in the hardware. This patch fixes this by calling napi_complete_done() after doorbell rings. It limits the number of unnecessary rings when there is no need to arm. MANA hardware specifies that there must be one doorbell ring every 8 CQ wraparounds. This driver guarantees one doorbell ring as soon as the number of consumed CQEs exceeds 4 CQ wraparounds. In practical workloads, the 4 CQ wraparounds proves to be big enough that it rarely exceeds this limit before all the napi weight is consumed. To implement this, add a per-CQ counter cq->work_done_since_doorbell, and make sure the CQ is armed as soon as passing 4 wraparounds of the CQ. Cc: stable@vger.kernel.org Fixes: e1b5683ff62e ("net: mana: Move NAPI from EQ to CQ") Reviewed-by: Haiyang Zhang Signed-off-by: Long Li Link: https://patch.msgid.link/1723219138-29887-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 22 ++++++++++++------- include/net/mana/mana.h | 1 + 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index ae717d06e66f..39f56973746d 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1792,7 +1792,6 @@ static void mana_poll_rx_cq(struct mana_cq *cq) static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue) { struct mana_cq *cq = context; - u8 arm_bit; int w; WARN_ON_ONCE(cq->gdma_cq != gdma_queue); @@ -1803,16 +1802,23 @@ static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue) mana_poll_tx_cq(cq); w = cq->work_done; + cq->work_done_since_doorbell += w; - if (w < cq->budget && - napi_complete_done(&cq->napi, w)) { - arm_bit = SET_ARM_BIT; - } else { - arm_bit = 0; + if (w < cq->budget) { + mana_gd_ring_cq(gdma_queue, SET_ARM_BIT); + cq->work_done_since_doorbell = 0; + napi_complete_done(&cq->napi, w); + } else if (cq->work_done_since_doorbell > + cq->gdma_cq->queue_size / COMP_ENTRY_SIZE * 4) { + /* MANA hardware requires at least one doorbell ring every 8 + * wraparounds of CQ even if there is no need to arm the CQ. 
+ * This driver rings the doorbell as soon as we have exceeded + * 4 wraparounds. + */ + mana_gd_ring_cq(gdma_queue, 0); + cq->work_done_since_doorbell = 0; } - mana_gd_ring_cq(gdma_queue, arm_bit); - return w; } diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 6439fd8b437b..7caa334f4888 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -275,6 +275,7 @@ struct mana_cq { /* NAPI data */ struct napi_struct napi; int work_done; + int work_done_since_doorbell; int budget; }; From 12d82c7b0a612372f594e4ff00983a1da3a1d929 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 13 Aug 2024 12:07:50 +0100 Subject: [PATCH 0800/1052] ALSA: hda: cs35l56: Remove redundant call to hda_cs_dsp_control_remove() The driver doesn't create any ALSA controls for firmware controls, so it shouldn't be calling hda_cs_dsp_control_remove(). commit 34e1b1bb7324 ("ALSA: hda: cs35l56: Stop creating ALSA controls for firmware coefficients") removed the call to hda_cs_dsp_add_controls() but didn't remove the call for destroying those controls. Signed-off-by: Richard Fitzgerald Fixes: 34e1b1bb7324 ("ALSA: hda: cs35l56: Stop creating ALSA controls for firmware coefficients") Link: https://patch.msgid.link/20240813110750.2814-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index 31cc92bac89a..a9dfd62637cf 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -413,7 +413,7 @@ static void cs35l56_hda_remove_controls(struct cs35l56_hda *cs35l56) } static const struct cs_dsp_client_ops cs35l56_hda_client_ops = { - .control_remove = hda_cs_dsp_control_remove, + /* cs_dsp requires the client to provide this even if it is empty */ }; static int cs35l56_hda_request_firmware_file(struct cs35l56_hda *cs35l56, From 42fac187b5c746227c92d024f1caf33bc1d337e4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Apr 2024 16:41:20 -0400 Subject: [PATCH 0801/1052] btrfs: check delayed refs when we're checking if a ref exists In the patch 78c52d9eb6b7 ("btrfs: check for refs on snapshot delete resume") I added some code to handle file systems that had been corrupted by a bug that incorrectly skipped updating the drop progress key while dropping a snapshot. This code would check to see if we had already deleted our reference for a child block, and skip the deletion if we had already. Unfortunately there is a bug, as the check would only check the on-disk references. I made an incorrect assumption that blocks in an already deleted snapshot that was having the deletion resume on mount wouldn't be modified. If we have 2 pending deleted snapshots that share blocks, we can easily modify the rules for a block. Take the following example subvolume a exists, and subvolume b is a snapshot of subvolume a. They share references to block 1. Block 1 will have 2 full references, one for subvolume a and one for subvolume b, and it belongs to subvolume a (btrfs_header_owner(block 1) == subvolume a). When deleting subvolume a, we will drop our full reference for block 1, and because we are the owner we will drop our full reference for all of block 1's children, convert block 1 to FULL BACKREF, and add a shared reference to all of block 1's children. Then we will start the snapshot deletion of subvolume b. 
We look up the extent info for block 1, which checks delayed refs and tells us that FULL BACKREF is set, so sets parent to the bytenr of block 1. However because this is a resumed snapshot deletion, we call into check_ref_exists(). Because check_ref_exists() only looks at the disk, it doesn't find the shared backref for the child of block 1, and thus returns 0 and we skip deleting the reference for the child of block 1 and continue. This orphans the child of block 1. The fix is to lookup the delayed refs, similar to what we do in btrfs_lookup_extent_info(). However we only care about whether the reference exists or not. If we fail to find our reference on disk, go look up the bytenr in the delayed refs, and if it exists look for an existing ref in the delayed ref head. If that exists then we know we can delete the reference safely and carry on. If it doesn't exist we know we have to skip over this block. This bug has existed since I introduced this fix, however requires having multiple deleted snapshots pending when we unmount. We noticed this in production because our shutdown path stops the container on the system, which deletes a bunch of subvolumes, and then reboots the box. This gives us plenty of opportunities to hit this issue. Looking at the history we've seen this occasionally in production, but we had a big spike recently thanks to faster machines getting jobs with multiple subvolumes in the job. Chris Mason wrote a reproducer which does the following mount /dev/nvme4n1 /btrfs btrfs subvol create /btrfs/s1 simoop -E -f 4k -n 200000 -z /btrfs/s1 while(true) ; do btrfs subvol snap /btrfs/s1 /btrfs/s2 simoop -f 4k -n 200000 -r 10 -z /btrfs/s2 btrfs subvol snap /btrfs/s2 /btrfs/s3 btrfs balance start -dusage=80 /btrfs btrfs subvol del /btrfs/s2 /btrfs/s3 umount /btrfs btrfsck /dev/nvme4n1 || exit 1 mount /dev/nvme4n1 /btrfs done On the second loop this would fail consistently, with my patch it has been running for hours and hasn't failed. I also used dm-log-writes to capture the state of the failure so I could debug the problem. Using the existing failure case to test my patch validated that it fixes the problem. Fixes: 78c52d9eb6b7 ("btrfs: check for refs on snapshot delete resume") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 67 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/delayed-ref.h | 2 ++ fs/btrfs/extent-tree.c | 51 ++++++++++++++++++++++++++++---- 3 files changed, 114 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 2ac9296edccb..06a9e0542d70 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1134,6 +1134,73 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt return find_ref_head(delayed_refs, bytenr, false); } +static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) +{ + int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY; + + if (type < entry->type) + return -1; + if (type > entry->type) + return 1; + + if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (root < entry->ref_root) + return -1; + if (root > entry->ref_root) + return 1; + } else { + if (parent < entry->parent) + return -1; + if (parent > entry->parent) + return 1; + } + return 0; +} + +/* + * Check to see if a given root/parent reference is attached to the head. 
This + * only checks for BTRFS_ADD_DELAYED_REF references that match, as that + * indicates the reference exists for the given root or parent. This is for + * tree blocks only. + * + * @head: the head of the bytenr we're searching. + * @root: the root objectid of the reference if it is a normal reference. + * @parent: the parent if this is a shared backref. + */ +bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, + u64 root, u64 parent) +{ + struct rb_node *node; + bool found = false; + + lockdep_assert_held(&head->mutex); + + spin_lock(&head->lock); + node = head->ref_tree.rb_root.rb_node; + while (node) { + struct btrfs_delayed_ref_node *entry; + int ret; + + entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + ret = find_comp(entry, root, parent); + if (ret < 0) { + node = node->rb_left; + } else if (ret > 0) { + node = node->rb_right; + } else { + /* + * We only want to count ADD actions, as drops mean the + * ref doesn't exist. + */ + if (entry->action == BTRFS_ADD_DELAYED_REF) + found = true; + break; + } + } + spin_unlock(&head->lock); + return found; +} + void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ef15e998be03..05f634eb472d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -389,6 +389,8 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); +bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, + u64 root, u64 parent); static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ff9f0d41987e..feec49e6f9c8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5472,23 +5472,62 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 parent, int level) { + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; struct btrfs_path *path; struct btrfs_extent_inline_ref *iref; int ret; + bool exists = false; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: ret = lookup_extent_backref(trans, path, &iref, bytenr, root->fs_info->nodesize, parent, btrfs_root_id(root), level, 0); + if (ret != -ENOENT) { + /* + * If we get 0 then we found our reference, return 1, else + * return the error if it's not -ENOENT; + */ + btrfs_free_path(path); + return (ret < 0 ) ? ret : 1; + } + + /* + * We could have a delayed ref with this reference, so look it up while + * we're holding the path open to make sure we don't race with the + * delayed ref running. + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + if (!head) + goto out; + if (!mutex_trylock(&head->mutex)) { + /* + * We're contended, means that the delayed ref is running, get a + * reference and wait for the ref head to be complete and then + * try again. 
+ */ + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + goto again; + } + + exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); btrfs_free_path(path); - if (ret == -ENOENT) - return 0; - if (ret < 0) - return ret; - return 1; + return exists ? 1 : 0; } /* From 31723c9542dba1681cc3720571fdf12ffe0eddd9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Aug 2024 08:52:44 +0930 Subject: [PATCH 0802/1052] btrfs: tree-checker: reject BTRFS_FT_UNKNOWN dir type [REPORT] There is a bug report that kernel is rejecting a mismatching inode mode and its dir item: [ 1881.553937] BTRFS critical (device dm-0): inode mode mismatch with dir: inode mode=040700 btrfs type=2 dir type=0 [CAUSE] It looks like the inode mode is correct, while the dir item type 0 is BTRFS_FT_UNKNOWN, which should not be generated by btrfs at all. This may be caused by a memory bit flip. [ENHANCEMENT] Although tree-checker is not able to do any cross-leaf verification, for this particular case we can at least reject any dir type with BTRFS_FT_UNKNOWN. So here we enhance the dir type check from [0, BTRFS_FT_MAX), to (0, BTRFS_FT_MAX). Although the existing corruption can not be fixed just by such enhanced checking, it should prevent the same 0x2->0x0 bitflip for dir type to reach disk in the future. Reported-by: Kota Link: https://lore.kernel.org/linux-btrfs/CACsxjPYnQF9ZF-0OhH16dAx50=BXXOcP74MxBc3BG+xae4vTTw@mail.gmail.com/ CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a825fa598e3c..6f1e2f2215d9 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -569,9 +569,10 @@ static int check_dir_item(struct extent_buffer *leaf, /* dir type check */ dir_type = btrfs_dir_ftype(leaf, di); - if (unlikely(dir_type >= BTRFS_FT_MAX)) { + if (unlikely(dir_type <= BTRFS_FT_UNKNOWN || + dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, - "invalid dir item type, have %u expect [0, %u)", + "invalid dir item type, have %u expect (0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; } From 8475a1d9bb7acf1cb15842dd24baab0e8ea4e4ff Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 13 Aug 2024 12:32:09 +0100 Subject: [PATCH 0803/1052] ALSA: hda: cs35l41: Remove redundant call to hda_cs_dsp_control_remove() The driver doesn't create any ALSA controls for firmware controls, so it shouldn't be calling hda_cs_dsp_control_remove(). commit 312c04cee408 ("ALSA: hda: cs35l41: Stop creating ALSA Controls for firmware coefficients") removed the call to hda_cs_dsp_add_controls() but didn't remove the call for destroying those controls. 
Signed-off-by: Richard Fitzgerald Fixes: 312c04cee408 ("ALSA: hda: cs35l41: Stop creating ALSA Controls for firmware coefficients") Link: https://patch.msgid.link/20240813113209.648-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 3a92e98da72d..d68bf7591d90 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -134,7 +134,7 @@ static const struct reg_sequence cs35l41_hda_mute[] = { }; static const struct cs_dsp_client_ops client_ops = { - .control_remove = hda_cs_dsp_control_remove, + /* cs_dsp requires the client to provide this even if it is empty */ }; static int cs35l41_request_tuning_param_file(struct cs35l41_hda *cs35l41, char *tuning_filename, From ae1e766f623f7a2a889a0b09eb076dd9a60efbe9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 11 Aug 2024 11:53:42 +0100 Subject: [PATCH 0804/1052] btrfs: only run the extent map shrinker from kswapd tasks Currently the extent map shrinker can be run by any task when attempting to allocate memory and there's enough memory pressure to trigger it. To avoid too much latency we stop iterating over extent maps and removing them once the task needs to reschedule. This logic was introduced in commit b3ebb9b7e92a ("btrfs: stop extent map shrinker if reschedule is needed"). While that solved high latency problems for some use cases, it's still not enough because with a too high number of tasks entering the extent map shrinker code, either due to memory allocations or because they are a kswapd task, we end up having a very high level of contention on some spin locks, namely: 1) The fs_info->fs_roots_radix_lock spin lock, which we need to find roots to iterate over their inodes; 2) The spin lock of the xarray used to track open inodes for a root (struct btrfs_root::inodes) - on 6.10 kernels and below, it used to be a red black tree and the spin lock was root->inode_lock; 3) The fs_info->delayed_iput_lock spin lock since the shrinker adds delayed iputs (calls btrfs_add_delayed_iput()). Instead of allowing the extent map shrinker to be run by any task, make it run only by kswapd tasks. This still solves the problem of running into OOM situations due to an unbounded extent map creation, which is simple to trigger by direct IO writes, as described in the changelog of commit 956a17d9d050 ("btrfs: add a shrinker for extent maps"), and by a similar case when doing buffered IO on files with a very large number of holes (keeping the file open and creating many holes, whose extent maps are only released when the file is closed). 
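For illustration only, a rough user-space sketch of the buffered-IO-with-many-holes pattern described above; the mount point, block size and iteration count are arbitrary placeholders, not taken from the reports:

/* Hypothetical sketch: create many small extents separated by holes and
 * keep the file open so their extent maps stay cached.
 */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[4096] = { 0 };
	int fd = open("/mnt/btrfs/sparse-file", O_CREAT | O_WRONLY, 0644);

	if (fd < 0)
		return 1;

	/* Write every other 4K block; the gaps stay as holes, so each
	 * written block typically gets its own extent map.
	 */
	for (long i = 0; i < 1000000; i++)
		pwrite(fd, buf, sizeof(buf), (off_t)i * 2 * 4096);

	pause();	/* extent maps are only released when the file is closed */
	return 0;
}

Under memory pressure this kind of workload is exactly what the shrinker exists for, and after this change only kswapd pays the cost of scanning it.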
Reported-by: kzd Link: https://bugzilla.kernel.org/show_bug.cgi?id=219121 Reported-by: Octavia Togami Link: https://lore.kernel.org/linux-btrfs/CAHPNGSSt-a4ZZWrtJdVyYnJFscFjP9S7rMcvEMaNSpR556DdLA@mail.gmail.com/ Fixes: 956a17d9d050 ("btrfs: add a shrinker for extent maps") CC: stable@vger.kernel.org # 6.10+ Tested-by: kzd Tested-by: Octavia Togami Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 22 ++++++---------------- fs/btrfs/super.c | 10 ++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 23b65dc73c00..10ac5f657e38 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1147,8 +1147,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c return 0; /* - * We want to be fast because we can be called from any path trying to - * allocate memory, so if the lock is busy we don't want to spend time + * We want to be fast so if the lock is busy we don't want to spend time * waiting for it - either some task is about to do IO for the inode or * we may have another task shrinking extent maps, here in this code, so * skip this inode. @@ -1191,9 +1190,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c /* * Stop if we need to reschedule or there's contention on the * lock. This is to avoid slowing other tasks trying to take the - * lock and because the shrinker might be called during a memory - * allocation path and we want to avoid taking a very long time - * and slowing down all sorts of tasks. + * lock. */ if (need_resched() || rwlock_needbreak(&tree->lock)) break; @@ -1222,12 +1219,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx if (ctx->scanned >= ctx->nr_to_scan) break; - /* - * We may be called from memory allocation paths, so we don't - * want to take too much time and slowdown tasks. - */ - if (need_resched()) - break; + cond_resched(); inode = btrfs_find_first_inode(root, min_ino); } @@ -1285,14 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) ctx.last_ino); } - /* - * We may be called from memory allocation paths, so we don't want to - * take too much time and slowdown tasks, so stop if we need reschedule. - */ - while (ctx.scanned < ctx.nr_to_scan && !need_resched()) { + while (ctx.scanned < ctx.nr_to_scan) { struct btrfs_root *root; unsigned long count; + cond_resched(); + spin_lock(&fs_info->fs_roots_radix_lock); count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)&root, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 83478deada3b..11044e9e2cb1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -2409,6 +2410,15 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); + /* + * We may be called from any task trying to allocate memory and we don't + * want to slow it down with scanning and dropping extent maps. It would + * also cause heavy lock contention if many tasks concurrently enter + * here. Therefore only allow kswapd tasks to scan and drop extent maps. 
+ */ + if (!current_is_kswapd()) + return 0; + return btrfs_free_extent_maps(fs_info, nr_to_scan); } From 779bac9994452f6a894524f70c00cfb0cd4b6364 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Aug 2024 15:08:04 +0200 Subject: [PATCH 0805/1052] Revert "ACPI: EC: Evaluate orphan _REG under EC device" This reverts commit 0e6b6dedf168 ("Revert "ACPI: EC: Evaluate orphan _REG under EC device") because the problem addressed by it will be addressed differently in what follows. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Cc: All applicable Link: https://patch.msgid.link/3236716.5fSG56mABF@rjwysocki.net --- drivers/acpi/acpica/acevents.h | 4 --- drivers/acpi/acpica/evregion.c | 6 +++- drivers/acpi/acpica/evxfregn.c | 54 ---------------------------------- drivers/acpi/ec.c | 3 -- include/acpi/acpixf.h | 4 --- 5 files changed, 5 insertions(+), 66 deletions(-) diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h index 2133085deda7..ddd072cbc738 100644 --- a/drivers/acpi/acpica/acevents.h +++ b/drivers/acpi/acpica/acevents.h @@ -191,10 +191,6 @@ void acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, acpi_adr_space_type space_id, u32 function); -void -acpi_ev_execute_orphan_reg_method(struct acpi_namespace_node *node, - acpi_adr_space_type space_id); - acpi_status acpi_ev_execute_reg_method(union acpi_operand_object *region_obj, u32 function); diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c index dc6004daf624..18fdf2bc2d49 100644 --- a/drivers/acpi/acpica/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -20,6 +20,10 @@ extern u8 acpi_gbl_default_address_spaces[]; /* Local prototypes */ +static void +acpi_ev_execute_orphan_reg_method(struct acpi_namespace_node *device_node, + acpi_adr_space_type space_id); + static acpi_status acpi_ev_reg_run(acpi_handle obj_handle, u32 level, void *context, void **return_value); @@ -814,7 +818,7 @@ acpi_ev_reg_run(acpi_handle obj_handle, * ******************************************************************************/ -void +static void acpi_ev_execute_orphan_reg_method(struct acpi_namespace_node *device_node, acpi_adr_space_type space_id) { diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index 624361a5f34d..3197e6303c5b 100644 --- a/drivers/acpi/acpica/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -306,57 +306,3 @@ acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id) } ACPI_EXPORT_SYMBOL(acpi_execute_reg_methods) - -/******************************************************************************* - * - * FUNCTION: acpi_execute_orphan_reg_method - * - * PARAMETERS: device - Handle for the device - * space_id - The address space ID - * - * RETURN: Status - * - * DESCRIPTION: Execute an "orphan" _REG method that appears under an ACPI - * device. This is a _REG method that has no corresponding region - * within the device's scope. 
- * - ******************************************************************************/ -acpi_status -acpi_execute_orphan_reg_method(acpi_handle device, acpi_adr_space_type space_id) -{ - struct acpi_namespace_node *node; - acpi_status status; - - ACPI_FUNCTION_TRACE(acpi_execute_orphan_reg_method); - - /* Parameter validation */ - - if (!device) { - return_ACPI_STATUS(AE_BAD_PARAMETER); - } - - status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } - - /* Convert and validate the device handle */ - - node = acpi_ns_validate_handle(device); - if (node) { - - /* - * If an "orphan" _REG method is present in the device's scope - * for the given address space ID, run it. - */ - - acpi_ev_execute_orphan_reg_method(node, space_id); - } else { - status = AE_BAD_PARAMETER; - } - - (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); - return_ACPI_STATUS(status); -} - -ACPI_EXPORT_SYMBOL(acpi_execute_orphan_reg_method) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 299ec653388c..68dd17f96f63 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1507,9 +1507,6 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { acpi_execute_reg_methods(scope_handle, ACPI_ADR_SPACE_EC); - if (scope_handle != ec->handle) - acpi_execute_orphan_reg_method(ec->handle, ACPI_ADR_SPACE_EC); - set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); } diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 80dc36f9d527..94d0fc3bd412 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -662,10 +662,6 @@ ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id)) -ACPI_EXTERNAL_RETURN_STATUS(acpi_status - acpi_execute_orphan_reg_method(acpi_handle device, - acpi_adr_space_type - space_id)) ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_remove_address_space_handler(acpi_handle device, From cdf65d73e001fde600b18d7e45afadf559425ce5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Aug 2024 15:11:42 +0200 Subject: [PATCH 0806/1052] ACPICA: Add a depth argument to acpi_execute_reg_methods() A subsequent change will need to pass a depth argument to acpi_execute_reg_methods(), so prepare that function for it. No intentional functional changes. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Hans de Goede Cc: All applicable Link: https://patch.msgid.link/8451567.NyiUUSuA9g@rjwysocki.net --- drivers/acpi/acpica/acevents.h | 2 +- drivers/acpi/acpica/evregion.c | 6 ++++-- drivers/acpi/acpica/evxfregn.c | 10 +++++++--- drivers/acpi/ec.c | 2 +- include/acpi/acpixf.h | 1 + 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h index ddd072cbc738..1c5218b79fc2 100644 --- a/drivers/acpi/acpica/acevents.h +++ b/drivers/acpi/acpica/acevents.h @@ -188,7 +188,7 @@ acpi_ev_detach_region(union acpi_operand_object *region_obj, u8 acpi_ns_is_locked); void -acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, +acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, u32 max_depth, acpi_adr_space_type space_id, u32 function); acpi_status diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c index 18fdf2bc2d49..cf53b9535f18 100644 --- a/drivers/acpi/acpica/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -65,6 +65,7 @@ acpi_status acpi_ev_initialize_op_regions(void) acpi_gbl_default_address_spaces [i])) { acpi_ev_execute_reg_methods(acpi_gbl_root_node, + ACPI_UINT32_MAX, acpi_gbl_default_address_spaces [i], ACPI_REG_CONNECT); } @@ -672,6 +673,7 @@ acpi_ev_execute_reg_method(union acpi_operand_object *region_obj, u32 function) * FUNCTION: acpi_ev_execute_reg_methods * * PARAMETERS: node - Namespace node for the device + * max_depth - Depth to which search for _REG * space_id - The address space ID * function - Passed to _REG: On (1) or Off (0) * @@ -683,7 +685,7 @@ acpi_ev_execute_reg_method(union acpi_operand_object *region_obj, u32 function) ******************************************************************************/ void -acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, +acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, u32 max_depth, acpi_adr_space_type space_id, u32 function) { struct acpi_reg_walk_info info; @@ -717,7 +719,7 @@ acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, * regions and _REG methods. (i.e. 
handlers must be installed for all * regions of this Space ID before we can run any _REG methods) */ - (void)acpi_ns_walk_namespace(ACPI_TYPE_ANY, node, ACPI_UINT32_MAX, + (void)acpi_ns_walk_namespace(ACPI_TYPE_ANY, node, max_depth, ACPI_NS_WALK_UNLOCK, acpi_ev_reg_run, NULL, &info, NULL); diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index 3197e6303c5b..95f78383bbdb 100644 --- a/drivers/acpi/acpica/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -85,7 +85,8 @@ acpi_install_address_space_handler_internal(acpi_handle device, /* Run all _REG methods for this address space */ if (run_reg) { - acpi_ev_execute_reg_methods(node, space_id, ACPI_REG_CONNECT); + acpi_ev_execute_reg_methods(node, ACPI_UINT32_MAX, space_id, + ACPI_REG_CONNECT); } unlock_and_exit: @@ -263,6 +264,7 @@ ACPI_EXPORT_SYMBOL(acpi_remove_address_space_handler) * FUNCTION: acpi_execute_reg_methods * * PARAMETERS: device - Handle for the device + * max_depth - Depth to which search for _REG * space_id - The address space ID * * RETURN: Status @@ -271,7 +273,8 @@ ACPI_EXPORT_SYMBOL(acpi_remove_address_space_handler) * ******************************************************************************/ acpi_status -acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id) +acpi_execute_reg_methods(acpi_handle device, u32 max_depth, + acpi_adr_space_type space_id) { struct acpi_namespace_node *node; acpi_status status; @@ -296,7 +299,8 @@ acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id) /* Run all _REG methods for this address space */ - acpi_ev_execute_reg_methods(node, space_id, ACPI_REG_CONNECT); + acpi_ev_execute_reg_methods(node, max_depth, space_id, + ACPI_REG_CONNECT); } else { status = AE_BAD_PARAMETER; } diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 68dd17f96f63..d9c12db80f11 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1506,7 +1506,7 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, } if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { - acpi_execute_reg_methods(scope_handle, ACPI_ADR_SPACE_EC); + acpi_execute_reg_methods(scope_handle, ACPI_UINT32_MAX, ACPI_ADR_SPACE_EC); set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); } diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 94d0fc3bd412..9f1c1d225e32 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -660,6 +660,7 @@ ACPI_EXTERNAL_RETURN_STATUS(acpi_status void *context)) ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_execute_reg_methods(acpi_handle device, + u32 nax_depth, acpi_adr_space_type space_id)) ACPI_EXTERNAL_RETURN_STATUS(acpi_status From 71bf41b8e913ec9fc91f0d39ab8fb320229ec604 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Aug 2024 15:16:21 +0200 Subject: [PATCH 0807/1052] ACPI: EC: Evaluate _REG outside the EC scope more carefully Commit 60fa6ae6e6d0 ("ACPI: EC: Install address space handler at the namespace root") caused _REG methods for EC operation regions outside the EC device scope to be evaluated which on some systems leads to the evaluation of _REG methods in the scopes of device objects representing devices that are not present and not functional according to the _STA return values. Some of those device objects represent EC "alternatives" and if _REG is evaluated for their operation regions, the platform firmware may be confused and the platform may start to behave incorrectly. 
To avoid this problem, only evaluate _REG for EC operation regions located in the scopes of device objects representing known-to-be-present devices. For this purpose, partially revert commit 60fa6ae6e6d0 and trigger the evaluation of _REG for EC operation regions from acpi_bus_attach() for the known-valid devices. Fixes: 60fa6ae6e6d0 ("ACPI: EC: Install address space handler at the namespace root") Link: https://lore.kernel.org/linux-acpi/1f76b7e2-1928-4598-8037-28a1785c2d13@redhat.com Link: https://bugzilla.redhat.com/show_bug.cgi?id=2298938 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2302253 Reported-by: Hans de Goede Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Cc: All applicable Link: https://patch.msgid.link/23612351.6Emhk5qWAg@rjwysocki.net --- drivers/acpi/ec.c | 11 +++++++++-- drivers/acpi/internal.h | 1 + drivers/acpi/scan.c | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index d9c12db80f11..38d2f6e6b12b 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1487,12 +1487,13 @@ static bool install_gpio_irq_event_handler(struct acpi_ec *ec) static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, bool call_reg) { - acpi_handle scope_handle = ec == first_ec ? ACPI_ROOT_OBJECT : ec->handle; acpi_status status; acpi_ec_start(ec, false); if (!test_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags)) { + acpi_handle scope_handle = ec == first_ec ? ACPI_ROOT_OBJECT : ec->handle; + acpi_ec_enter_noirq(ec); status = acpi_install_address_space_handler_no_reg(scope_handle, ACPI_ADR_SPACE_EC, @@ -1506,7 +1507,7 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, } if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { - acpi_execute_reg_methods(scope_handle, ACPI_UINT32_MAX, ACPI_ADR_SPACE_EC); + acpi_execute_reg_methods(ec->handle, ACPI_UINT32_MAX, ACPI_ADR_SPACE_EC); set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); } @@ -1721,6 +1722,12 @@ static void acpi_ec_remove(struct acpi_device *device) } } +void acpi_ec_register_opregions(struct acpi_device *adev) +{ + if (first_ec && first_ec->handle != adev->handle) + acpi_execute_reg_methods(adev->handle, 1, ACPI_ADR_SPACE_EC); +} + static acpi_status ec_parse_io_ports(struct acpi_resource *resource, void *context) { diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 601b670356e5..aadd4c218b32 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -223,6 +223,7 @@ int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit, acpi_handle handle, acpi_ec_query_func func, void *data); void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit); +void acpi_ec_register_opregions(struct acpi_device *adev); #ifdef CONFIG_PM_SLEEP void acpi_ec_flush_work(void); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 59771412686b..22ae7829a915 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2273,6 +2273,8 @@ static int acpi_bus_attach(struct acpi_device *device, void *first_pass) if (device->handler) goto ok; + acpi_ec_register_opregions(device); + if (!device->flags.initialized) { device->flags.power_manageable = device->power.states[ACPI_STATE_D0].flags.valid; From 46a6e10a1ab16cc71d4a3cab73e79aabadd6b8ea Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 12 Aug 2024 14:18:06 +0100 Subject: [PATCH 0808/1052] btrfs: send: allow cloning non-aligned extent if it ends at i_size If we a find that an extent is shared but its end offset is not sector size 
aligned, then we don't clone it and issue write operations instead. This is because the reflink (remap_file_range) operation does not allow to clone unaligned ranges, except if the end offset of the range matches the i_size of the source and destination files (and the start offset is sector size aligned). While this is not incorrect because send can only guarantee that a file has the same data in the source and destination snapshots, it's not optimal and generates confusion and surprising behaviour for users. For example, running this test: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT # Use a file size not aligned to any possible sector size. file_size=$((1 * 1024 * 1024 + 5)) # 1MB + 5 bytes dd if=/dev/random of=$MNT/foo bs=$file_size count=1 cp --reflink=always $MNT/foo $MNT/bar btrfs subvolume snapshot -r $MNT/ $MNT/snap rm -f /tmp/send-test btrfs send -f /tmp/send-test $MNT/snap umount $MNT mkfs.btrfs -f $DEV mount $DEV $MNT btrfs receive -vv -f /tmp/send-test $MNT xfs_io -r -c "fiemap -v" $MNT/snap/bar umount $MNT Gives the following result: (...) mkfile o258-7-0 rename o258-7-0 -> bar write bar - offset=0 length=49152 write bar - offset=49152 length=49152 write bar - offset=98304 length=49152 write bar - offset=147456 length=49152 write bar - offset=196608 length=49152 write bar - offset=245760 length=49152 write bar - offset=294912 length=49152 write bar - offset=344064 length=49152 write bar - offset=393216 length=49152 write bar - offset=442368 length=49152 write bar - offset=491520 length=49152 write bar - offset=540672 length=49152 write bar - offset=589824 length=49152 write bar - offset=638976 length=49152 write bar - offset=688128 length=49152 write bar - offset=737280 length=49152 write bar - offset=786432 length=49152 write bar - offset=835584 length=49152 write bar - offset=884736 length=49152 write bar - offset=933888 length=49152 write bar - offset=983040 length=49152 write bar - offset=1032192 length=16389 chown bar - uid=0, gid=0 chmod bar - mode=0644 utimes bar utimes BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=06d640da-9ca1-604c-b87c-3375175a8eb3, stransid=7 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2055]: 26624..28679 2056 0x1 There's no clone operation to clone extents from the file foo into file bar and fiemap confirms there's no shared flag (0x2000). So update send_write_or_clone() so that it proceeds with cloning if the source and destination ranges end at the i_size of the respective files. After this changes the result of the test is: (...) mkfile o258-7-0 rename o258-7-0 -> bar clone bar - source=foo source offset=0 offset=0 length=1048581 chown bar - uid=0, gid=0 chmod bar - mode=0644 utimes bar utimes BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=582420f3-ea7d-564e-bbe5-ce440d622190, stransid=7 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2055]: 26624..28679 2056 0x2001 A test case for fstests will also follow up soon. 
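For reference, the user-visible clone rule mentioned above (unaligned lengths are only accepted when the range ends at EOF) can be exercised directly with FICLONERANGE; this sketch simply reuses the file names and the 1 MB + 5 byte size from the test script and is not part of the kernel change:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FICLONERANGE, struct file_clone_range */

int main(void)
{
	int src = open("/mnt/sdi/foo", O_RDONLY);
	int dst = open("/mnt/sdi/bar", O_WRONLY | O_CREAT, 0644);
	struct file_clone_range fcr = {
		.src_fd = src,
		.src_offset = 0,
		.src_length = 1048581,	/* 1MB + 5 bytes, not sector aligned */
		.dest_offset = 0,
	};

	if (src < 0 || dst < 0)
		return 1;

	/*
	 * Expected to succeed because the unaligned tail ends at the i_size
	 * of the source (and the destination range ends at or beyond its
	 * i_size); the same length in the middle of a file is rejected with
	 * EINVAL by remap_file_range().
	 */
	if (ioctl(dst, FICLONERANGE, &fcr) < 0) {
		perror("FICLONERANGE");
		return 1;
	}
	return 0;
}

send has to honour the same restriction, which is why send_write_or_clone() only clones an unaligned extent when both the source and destination ranges end at the respective i_size.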
Link: https://github.com/kdave/btrfs-progs/issues/572#issuecomment-2282841416 CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 52 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4ca711a773ef..7fc692fc76e1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6157,25 +6157,51 @@ static int send_write_or_clone(struct send_ctx *sctx, u64 offset = key->offset; u64 end; u64 bs = sctx->send_root->fs_info->sectorsize; + struct btrfs_file_extent_item *ei; + u64 disk_byte; + u64 data_offset; + u64 num_bytes; + struct btrfs_inode_info info = { 0 }; end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); if (offset >= end) return 0; - if (clone_root && IS_ALIGNED(end, bs)) { - struct btrfs_file_extent_item *ei; - u64 disk_byte; - u64 data_offset; + num_bytes = end - offset; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); - data_offset = btrfs_file_extent_offset(path->nodes[0], ei); - ret = clone_range(sctx, path, clone_root, disk_byte, - data_offset, offset, end - offset); - } else { - ret = send_extent_data(sctx, path, offset, end - offset); - } + if (!clone_root) + goto write_data; + + if (IS_ALIGNED(end, bs)) + goto clone_data; + + /* + * If the extent end is not aligned, we can clone if the extent ends at + * the i_size of the inode and the clone range ends at the i_size of the + * source inode, otherwise the clone operation fails with -EINVAL. + */ + if (end != sctx->cur_inode_size) + goto write_data; + + ret = get_inode_info(clone_root->root, clone_root->ino, &info); + if (ret < 0) + return ret; + + if (clone_root->offset + num_bytes == info.size) + goto clone_data; + +write_data: + ret = send_extent_data(sctx, path, offset, num_bytes); + sctx->cur_inode_next_write_offset = end; + return ret; + +clone_data: + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset, + num_bytes); sctx->cur_inode_next_write_offset = end; return ret; } From 1e1fd567d32fcf7544c6e09e0e5bc6c650da6e23 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 13 Aug 2024 12:38:51 +0200 Subject: [PATCH 0809/1052] dm suspend: return -ERESTARTSYS instead of -EINTR This commit changes device mapper, so that it returns -ERESTARTSYS instead of -EINTR when it is interrupted by a signal (so that the ioctl can be restarted). The manpage signal(7) says that the ioctl function should be restarted if the signal was handled with SA_RESTART. 
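For context, a minimal user-space sketch (not part of this patch) of the SA_RESTART behaviour the change relies on; the signal number and the device-mapper ioctl named in the comment are only examples:

#include <signal.h>
#include <string.h>

static void on_signal(int sig)
{
	(void)sig;		/* the handler body does not matter here */
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_signal;
	sa.sa_flags = SA_RESTART;	/* ask the kernel to restart syscalls */
	if (sigaction(SIGUSR1, &sa, NULL) < 0)
		return 1;

	/*
	 * A long-running ioctl issued after this point (for example a
	 * DM_DEV_SUSPEND sent through libdevmapper) that is interrupted by
	 * SIGUSR1 is now transparently restarted, because the suspend path
	 * returns -ERESTARTSYS instead of -EINTR.
	 */
	return 0;
}

With the previous -EINTR return the caller saw a failed ioctl even though it had asked for restartable semantics.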
Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 97fab2087df8..87bb90303435 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2737,7 +2737,7 @@ static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int ta break; if (signal_pending_state(task_state, current)) { - r = -EINTR; + r = -ERESTARTSYS; break; } @@ -2762,7 +2762,7 @@ static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_st break; if (signal_pending_state(task_state, current)) { - r = -EINTR; + r = -ERESTARTSYS; break; } From 7a636b4f03af9d541205f69e373672e7b2b60a8a Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 13 Aug 2024 12:39:52 +0200 Subject: [PATCH 0810/1052] dm resume: don't return EINVAL when signalled If the dm_resume method is called on a device that is not suspended, the method will suspend the device briefly, before resuming it (so that the table will be swapped). However, there was a bug that the return value of dm_suspended_md was not checked. dm_suspended_md may return an error when it is interrupted by a signal. In this case, do_resume would call dm_swap_table, which would return -EINVAL. This commit fixes the logic, so that error returned by dm_suspend is checked and the resume operation is undone. Signed-off-by: Mikulas Patocka Signed-off-by: Khazhismel Kumykov Cc: stable@vger.kernel.org --- drivers/md/dm-ioctl.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index c2c07bfa6471..f299ff393a6a 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1181,8 +1181,26 @@ static int do_resume(struct dm_ioctl *param) suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended_md(md)) - dm_suspend(md, suspend_flags); + if (!dm_suspended_md(md)) { + r = dm_suspend(md, suspend_flags); + if (r) { + down_write(&_hash_lock); + hc = dm_get_mdptr(md); + if (hc && !hc->new_map) { + hc->new_map = new_map; + new_map = NULL; + } else { + r = -ENXIO; + } + up_write(&_hash_lock); + if (new_map) { + dm_sync_table(md); + dm_table_destroy(new_map); + } + dm_put(md); + return r; + } + } old_size = dm_get_size(md); old_map = dm_swap_table(md, new_map); From 2a0629834cd82f05d424bbc193374f9a43d1f87d Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 9 Aug 2024 11:16:28 +0800 Subject: [PATCH 0811/1052] vfs: Don't evict inode under the inode lru traversing context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The inode reclaiming process(See function prune_icache_sb) collects all reclaimable inodes and mark them with I_FREEING flag at first, at that time, other processes will be stuck if they try getting these inodes (See function find_inode_fast), then the reclaiming process destroy the inodes by function dispose_list(). Some filesystems(eg. ext4 with ea_inode feature, ubifs with xattr) may do inode lookup in the inode evicting callback function, if the inode lookup is operated under the inode lru traversing context, deadlock problems may happen. Case 1: In function ext4_evict_inode(), the ea inode lookup could happen if ea_inode feature is enabled, the lookup process will be stuck under the evicting context like this: 1. File A has inode i_reg and an ea inode i_ea 2. 
getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea 3. Then, following three processes running like this: PA PB echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // i_reg is added into lru, lru->i_ea->i_reg prune_icache_sb list_lru_walk_one inode_lru_isolate i_ea->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(i_reg) spin_unlock(&i_reg->i_lock) spin_unlock(lru_lock) rm file A i_reg->nlink = 0 iput(i_reg) // i_reg->nlink is 0, do evict ext4_evict_inode ext4_xattr_delete_inode ext4_xattr_inode_dec_ref_all ext4_xattr_inode_iget ext4_iget(i_ea->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(i_ea) ----→ AA deadlock dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&i_ea->i_state) Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file deleting process holds BASEHD's wbuf->io_mutex while getting the xattr inode, which could race with inode reclaiming process(The reclaiming process could try locking BASEHD's wbuf->io_mutex in inode evicting function), then an ABBA deadlock problem would happen as following: 1. File A has inode ia and a xattr(with inode ixa), regular file B has inode ib and a xattr. 2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa 3. Then, following three processes running like this: PA PB PC echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // ib and ia are added into lru, lru->ixa->ib->ia prune_icache_sb list_lru_walk_one inode_lru_isolate ixa->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(ib) spin_unlock(&ib->i_lock) spin_unlock(lru_lock) rm file B ib->nlink = 0 rm file A iput(ia) ubifs_evict_inode(ia) ubifs_jnl_delete_inode(ia) ubifs_jnl_write_inode(ia) make_reservation(BASEHD) // Lock wbuf->io_mutex ubifs_iget(ixa->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(ixa) | iput(ib) // ib->nlink is 0, do evict | ubifs_evict_inode | ubifs_jnl_delete_inode(ib) ↓ ubifs_jnl_write_inode ABBA deadlock ←-----make_reservation(BASEHD) dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&ixa->i_state) Fix the possible deadlock by using new inode state flag I_LRU_ISOLATING to pin the inode in memory while inode_lru_isolate() reclaims its pages instead of using ordinary inode reference. This way inode deletion cannot be triggered from inode_lru_isolate() thus avoiding the deadlock. evict() is made to wait for I_LRU_ISOLATING to be cleared before proceeding with inode cleanup. 
Link: https://lore.kernel.org/all/37c29c42-7685-d1f0-067d-63582ffac405@huaweicloud.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022 Fixes: e50e5129f384 ("ext4: xattr-in-inode support") Fixes: 7959cf3a7506 ("ubifs: journal: Handle xattrs like files") Cc: stable@vger.kernel.org Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240809031628.1069873-1-chengzhihao@huaweicloud.com Reviewed-by: Jan Kara Suggested-by: Jan Kara Suggested-by: Mateusz Guzik Signed-off-by: Christian Brauner --- fs/inode.c | 39 +++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 5 +++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 86670941884b..10c4619faeef 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -488,6 +488,39 @@ static void inode_lru_list_del(struct inode *inode) this_cpu_dec(nr_unused); } +static void inode_pin_lru_isolating(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); + inode->i_state |= I_LRU_ISOLATING; +} + +static void inode_unpin_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); + inode->i_state &= ~I_LRU_ISOLATING; + smp_mb(); + wake_up_bit(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); +} + +static void inode_wait_for_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_state & I_LRU_ISOLATING) { + DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); + __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_LRU_ISOLATING); + } + spin_unlock(&inode->i_lock); +} + /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add @@ -657,6 +690,8 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + inode_wait_for_lru_isolating(inode); + /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since @@ -855,7 +890,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * be under pressure before the cache inside the highmem zone. */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { - __iget(inode); + inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(lru_lock); if (remove_inode_buffers(inode)) { @@ -867,7 +902,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, __count_vm_events(PGINODESTEAL, reap); mm_account_reclaimed_pages(reap); } - iput(inode); + inode_unpin_lru_isolating(inode); spin_lock(lru_lock); return LRU_RETRY; } diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..fb0426f349fc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2392,6 +2392,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? 
*/ #define I_DIRTY_SYNC (1 << 0) @@ -2415,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) #define I_PINNING_NETFS_WB (1 << 18) +#define __I_LRU_ISOLATING 19 +#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) From e3786b29c54cdae3490b07180a54e2461f42144c Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Thu, 8 Aug 2024 14:29:38 +0100 Subject: [PATCH 0812/1052] 9p: Fix DIO read through netfs If a program is watching a file on a 9p mount, it won't see any change in size if the file being exported by the server is changed directly in the source filesystem, presumably because 9p doesn't have change notifications, and because netfs skips the reads if the file is empty. Fix this by attempting to read the full size specified when a DIO read is requested (such as when 9p is operating in unbuffered mode) and dealing with a short read if the EOF was less than the expected read. To make this work, filesystems using netfslib must not set NETFS_SREQ_CLEAR_TAIL if performing a DIO read where that read hit the EOF. I don't want to mandatorily clear this flag in netfslib for DIO because, say, ceph might make a read from an object that is not completely filled, but does not reside at the end of file - and so we need to clear the excess. This can be tested by watching an empty file over 9p within a VM (such as in the ktest framework): while true; do read content; if [ -n "$content" ]; then echo $content; break; fi; done < /host/tmp/foo then writing something into the empty file. The watcher should immediately display the file content and break out of the loop. Without this fix, it remains in the loop indefinitely. 
Fixes: 80105ed2fd27 ("9p: Use netfslib read/write_iter") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218916 Signed-off-by: David Howells Link: https://lore.kernel.org/r/1229195.1723211769@warthog.procyon.org.uk cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: Marc Dionne cc: Ilya Dryomov cc: Steve French cc: Paulo Alcantara cc: Trond Myklebust cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 3 ++- fs/afs/file.c | 3 ++- fs/ceph/addr.c | 6 ++++-- fs/netfs/io.c | 17 +++++++++++------ fs/nfs/fscache.c | 3 ++- fs/smb/client/file.c | 3 ++- 6 files changed, 23 insertions(+), 12 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index a97ceb105cd8..24fdc74caeba 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -75,7 +75,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_subreq_terminated(subreq, err ?: total, false); } diff --git a/fs/afs/file.c b/fs/afs/file.c index c3f0c45ae9a9..ec1be0091fdb 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -242,7 +242,8 @@ static void afs_fetch_data_notify(struct afs_operation *op) req->error = error; if (subreq) { - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_subreq_terminated(subreq, error ?: req->actual_len, false); req->subreq = NULL; } else if (req->done) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index cc0a2240de98..c4744a02db75 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -246,7 +246,8 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err >= 0) { if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); - if (err < subreq->len) + if (err < subreq->len && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { err = ceph_fscrypt_decrypt_extents(inode, @@ -282,7 +283,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) size_t len; int mode; - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); if (subreq->start >= inode->i_size) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c179a1c73fa7..5367caf3fa28 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -530,7 +530,8 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, if (transferred_or_error == 0) { if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { - subreq->error = -ENODATA; + if (rreq->origin != NETFS_DIO_READ) + subreq->error = -ENODATA; goto failed; } } else { @@ -601,9 +602,14 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, } if (subreq->len > ictx->zero_point - subreq->start) subreq->len = ictx->zero_point - subreq->start; + + /* We limit buffered reads to the EOF, but let the + * server deal with larger-than-EOF DIO/unbuffered + * reads. 
+ */ + if (subreq->len > rreq->i_size - subreq->start) + subreq->len = rreq->i_size - subreq->start; } - if (subreq->len > rreq->i_size - subreq->start) - subreq->len = rreq->i_size - subreq->start; if (rreq->rsize && subreq->len > rreq->rsize) subreq->len = rreq->rsize; @@ -739,11 +745,10 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) do { _debug("submit %llx + %llx >= %llx", rreq->start, rreq->submitted, rreq->i_size); - if (rreq->origin == NETFS_DIO_READ && - rreq->start + rreq->submitted >= rreq->i_size) - break; if (!netfs_rreq_submit_slice(rreq, &io_iter)) break; + if (test_bit(NETFS_SREQ_NO_PROGRESS, &rreq->flags)) + break; if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) break; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index bf29a65c5027..7a558dea75c4 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -363,7 +363,8 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) return; sreq = netfs->sreq; - if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) + if (test_bit(NFS_IOHDR_EOF, &hdr->flags) && + sreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); if (hdr->error) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index b2405dd4d4d4..3f3842e7b44a 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -217,7 +217,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) goto out; } - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); rc = rdata->server->ops->async_readv(rdata); out: From 810ee43d9cd245d138a2733d87a24858a23f577d Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 12 Aug 2024 00:28:21 +0100 Subject: [PATCH 0813/1052] Squashfs: sanity check symbolic link size Syzkiller reports a "KMSAN: uninit-value in pick_link" bug. This is caused by an uninitialised page, which is ultimately caused by a corrupted symbolic link size read from disk. The reason why the corrupted symlink size causes an uninitialised page is due to the following sequence of events: 1. squashfs_read_inode() is called to read the symbolic link from disk. This assigns the corrupted value 3875536935 to inode->i_size. 2. Later squashfs_symlink_read_folio() is called, which assigns this corrupted value to the length variable, which being a signed int, overflows producing a negative number. 3. The following loop that fills in the page contents checks that the copied bytes is less than length, which being negative means the loop is skipped, producing an uninitialised page. This patch adds a sanity check which checks that the symbolic link size is not larger than expected. -- Signed-off-by: Phillip Lougher Link: https://lore.kernel.org/r/20240811232821.13903-1-phillip@squashfs.org.uk Reported-by: Lizhi Xu Reported-by: syzbot+24ac24ff58dc5b0d26b9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000a90e8c061e86a76b@google.com/ V2: fix spelling mistake. 
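The arithmetic failure described above is easy to reproduce in isolation. A minimal userspace sketch (the size is the value from the report; the int conversion is shown for a typical two's-complement target and mirrors the description above rather than the exact squashfs source):

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                uint32_t symlink_size = 3875536935u; /* corrupted on-disk value   */
                long long i_size = symlink_size;     /* inode->i_size holds it    */
                int length = (int)i_size;            /* the read path used an int */

                printf("i_size = %lld, length = %d\n", i_size, length);
                /* length is negative, so a "copied < length" loop bound is never
                 * met and the page is left uninitialised; the new PAGE_SIZE
                 * check rejects such a size before it gets this far. */
                return 0;
        }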
Signed-off-by: Christian Brauner --- fs/squashfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 16bd693d0b3a..d5918eba27e3 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -279,8 +279,13 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); + if (inode->i_size > PAGE_SIZE) { + ERROR("Corrupted symlink\n"); + return -EINVAL; + } + + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_op = &squashfs_symlink_inode_ops; inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; From e4956dc7a84da074fd8dc10f7abd147f15b3ae58 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Aug 2024 06:10:59 -0600 Subject: [PATCH 0814/1052] io_uring/sqpoll: annotate debug task == current with data_race() There's a debug check in io_sq_thread_park() checking if it's the SQPOLL thread itself calling park. KCSAN warns about this, as we should not be reading sqd->thread outside of sqd->lock. Just silence this with data_race(). The pointer isn't used for anything but this debug check. Reported-by: syzbot+2b946a3fd80caf971b21@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index b3722e5275e7..3b50dc9586d1 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -44,7 +44,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->lock) { - WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(data_race(sqd->thread) == current); atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); From bcc954c6caba01fca143162d5fbb90e46aa1ad80 Mon Sep 17 00:00:00 2001 From: Ryo Takakura Date: Mon, 12 Aug 2024 16:27:03 +0900 Subject: [PATCH 0815/1052] printk/panic: Allow cpu backtraces to be written into ringbuffer during panic commit 779dbc2e78d7 ("printk: Avoid non-panic CPUs writing to ringbuffer") disabled non-panic CPUs to further write messages to ringbuffer after panicked. Since the commit, non-panicked CPU's are not allowed to write to ring buffer after panicked and CPU backtrace which is triggered after panicked to sample non-panicked CPUs' backtrace no longer serves its function as it has nothing to print. Fix the issue by allowing non-panicked CPUs to write into ringbuffer while CPU backtrace is in flight. 
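Condensed from the diff below, the mechanism is one global flag that briefly lifts the "non-panic CPUs are muted" rule while the backtrace IPIs are being serviced (kernel context assumed, not a standalone program):

        /* kernel/panic.c, panic_other_cpus_shutdown() */
        if (panic_print & PANIC_PRINT_ALL_CPU_BT) {
                panic_triggering_all_cpu_backtrace = true;
                trigger_all_cpu_backtrace();    /* other CPUs printk their stacks */
                panic_triggering_all_cpu_backtrace = false;
        }

        /* kernel/printk/printk.c, vprintk_emit() */
        if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace)
                return 0;       /* still dropped outside the backtrace window */

The flag is a plain boolean flipped only by the panicking CPU; the other CPUs merely observe it while dumping their stacks.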
Fixes: 779dbc2e78d7 ("printk: Avoid non-panic CPUs writing to ringbuffer") Signed-off-by: Ryo Takakura Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240812072703.339690-1-takakura@valinux.co.jp Signed-off-by: Petr Mladek --- include/linux/panic.h | 1 + kernel/panic.c | 8 +++++++- kernel/printk/printk.c | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/panic.h b/include/linux/panic.h index 3130e0b5116b..54d90b6c5f47 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -16,6 +16,7 @@ extern void oops_enter(void); extern void oops_exit(void); extern bool oops_may_print(void); +extern bool panic_triggering_all_cpu_backtrace; extern int panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; diff --git a/kernel/panic.c b/kernel/panic.c index f861bedc1925..2a0449144f82 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -64,6 +64,8 @@ unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; +bool panic_triggering_all_cpu_backtrace; + int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -253,8 +255,12 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & PANIC_PRINT_ALL_CPU_BT) + if (panic_print & PANIC_PRINT_ALL_CPU_BT) { + /* Temporary allow non-panic CPUs to write their backtraces. */ + panic_triggering_all_cpu_backtrace = true; trigger_all_cpu_backtrace(); + panic_triggering_all_cpu_backtrace = false; + } /* * Note that smp_send_stop() is the usual SMP shutdown function, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 054c0e7784fd..c22b07049c38 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2316,7 +2316,7 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic()) + if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) return 0; if (level == LOGLEVEL_SCHED) { From b098495e69491c2225681f43228312d32477217b Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 3 Aug 2024 19:32:33 +0800 Subject: [PATCH 0816/1052] KVM: x86: hyper-v: Remove unused inline function kvm_hv_free_pa_page() There is no caller in tree since introduction in commit b4f69df0f65e ("KVM: x86: Make Hyper-V emulation optional") Signed-off-by: Yue Haibing Message-ID: <20240803113233.128185-1-yuehaibing@huawei.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 923e64903da9..913bfc96959c 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -286,7 +286,6 @@ static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return HV_STATUS_ACCESS_DENIED; } static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {} -static inline void kvm_hv_free_pa_page(struct kvm *kvm) {} static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector) { return false; From 6252690f7e1b173b86a4c27dfc046b351ab423e7 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 9 Aug 2024 16:54:22 +0900 Subject: [PATCH 0817/1052] btrfs: fix invalid mapping of extent xarray state In __extent_writepage_io(), we call btrfs_set_range_writeback() -> folio_start_writeback(), which clears PAGECACHE_TAG_DIRTY mark from the mapping xarray if the folio is not dirty. 
This worked fine before commit 97713b1a2ced ("btrfs: do not clear page dirty inside extent_write_locked_range()"). After the commit, however, the folio is still dirty at this point, so the mapping DIRTY tag is not cleared anymore. Then, __extent_writepage_io() calls btrfs_folio_clear_dirty() to clear the folio's dirty flag. That results in the page being unlocked with a "strange" state. The page is not PageDirty, but the mapping tag is set as PAGECACHE_TAG_DIRTY. This strange state looks like causing a hang with a call trace below when running fstests generic/091 on a null_blk device. It is waiting for a folio lock. While I don't have an exact relation between this hang and the strange state, fixing the state also fixes the hang. And, that state is worth fixing anyway. This commit reorders btrfs_folio_clear_dirty() and btrfs_set_range_writeback() in __extent_writepage_io(), so that the PAGECACHE_TAG_DIRTY tag is properly removed from the xarray. [464.274] task:fsx state:D stack:0 pid:3034 tgid:3034 ppid:2853 flags:0x00004002 [464.286] Call Trace: [464.291] [464.295] __schedule+0x10ed/0x6260 [464.301] ? __pfx___blk_flush_plug+0x10/0x10 [464.308] ? __submit_bio+0x37c/0x450 [464.314] ? __pfx___schedule+0x10/0x10 [464.321] ? lock_release+0x567/0x790 [464.327] ? __pfx_lock_acquire+0x10/0x10 [464.334] ? __pfx_lock_release+0x10/0x10 [464.340] ? __pfx_lock_acquire+0x10/0x10 [464.347] ? __pfx_lock_release+0x10/0x10 [464.353] ? do_raw_spin_lock+0x12e/0x270 [464.360] schedule+0xdf/0x3b0 [464.365] io_schedule+0x8f/0xf0 [464.371] folio_wait_bit_common+0x2ca/0x6d0 [464.378] ? folio_wait_bit_common+0x1cc/0x6d0 [464.385] ? __pfx_folio_wait_bit_common+0x10/0x10 [464.392] ? __pfx_filemap_get_folios_tag+0x10/0x10 [464.400] ? __pfx_wake_page_function+0x10/0x10 [464.407] ? __pfx___might_resched+0x10/0x10 [464.414] ? do_raw_spin_unlock+0x58/0x1f0 [464.420] extent_write_cache_pages+0xe49/0x1620 [btrfs] [464.428] ? lock_acquire+0x435/0x500 [464.435] ? __pfx_extent_write_cache_pages+0x10/0x10 [btrfs] [464.443] ? btrfs_do_write_iter+0x493/0x640 [btrfs] [464.451] ? orc_find.part.0+0x1d4/0x380 [464.457] ? __pfx_lock_release+0x10/0x10 [464.464] ? __pfx_lock_release+0x10/0x10 [464.471] ? btrfs_do_write_iter+0x493/0x640 [btrfs] [464.478] btrfs_writepages+0x1cc/0x460 [btrfs] [464.485] ? __pfx_btrfs_writepages+0x10/0x10 [btrfs] [464.493] ? is_bpf_text_address+0x6e/0x100 [464.500] ? kernel_text_address+0x145/0x160 [464.507] ? unwind_get_return_address+0x5e/0xa0 [464.514] ? arch_stack_walk+0xac/0x100 [464.521] do_writepages+0x176/0x780 [464.527] ? lock_release+0x567/0x790 [464.533] ? __pfx_do_writepages+0x10/0x10 [464.540] ? __pfx_lock_acquire+0x10/0x10 [464.546] ? __pfx_stack_trace_save+0x10/0x10 [464.553] ? do_raw_spin_lock+0x12e/0x270 [464.560] ? do_raw_spin_unlock+0x58/0x1f0 [464.566] ? _raw_spin_unlock+0x23/0x40 [464.573] ? wbc_attach_and_unlock_inode+0x3da/0x7d0 [464.580] filemap_fdatawrite_wbc+0x113/0x180 [464.587] ? prepare_pages.constprop.0+0x13c/0x5c0 [btrfs] [464.596] __filemap_fdatawrite_range+0xaf/0xf0 [464.603] ? __pfx___filemap_fdatawrite_range+0x10/0x10 [464.611] ? trace_irq_enable.constprop.0+0xce/0x110 [464.618] ? kasan_quarantine_put+0xd7/0x1e0 [464.625] btrfs_start_ordered_extent+0x46f/0x570 [btrfs] [464.633] ? __pfx_btrfs_start_ordered_extent+0x10/0x10 [btrfs] [464.642] ? __clear_extent_bit+0x2c0/0x9d0 [btrfs] [464.650] btrfs_lock_and_flush_ordered_range+0xc6/0x180 [btrfs] [464.659] ? __pfx_btrfs_lock_and_flush_ordered_range+0x10/0x10 [btrfs] [464.669] btrfs_read_folio+0x12a/0x1d0 [btrfs] [464.676] ? 
__pfx_btrfs_read_folio+0x10/0x10 [btrfs] [464.684] ? __pfx_filemap_add_folio+0x10/0x10 [464.691] ? __pfx___might_resched+0x10/0x10 [464.698] ? __filemap_get_folio+0x1c5/0x450 [464.705] prepare_uptodate_page+0x12e/0x4d0 [btrfs] [464.713] prepare_pages.constprop.0+0x13c/0x5c0 [btrfs] [464.721] ? fault_in_iov_iter_readable+0xd2/0x240 [464.729] btrfs_buffered_write+0x5bd/0x12f0 [btrfs] [464.737] ? __pfx_btrfs_buffered_write+0x10/0x10 [btrfs] [464.745] ? __pfx_lock_release+0x10/0x10 [464.752] ? generic_write_checks+0x275/0x400 [464.759] ? down_write+0x118/0x1f0 [464.765] ? up_write+0x19b/0x500 [464.770] btrfs_direct_write+0x731/0xba0 [btrfs] [464.778] ? __pfx_btrfs_direct_write+0x10/0x10 [btrfs] [464.785] ? __pfx___might_resched+0x10/0x10 [464.792] ? lock_acquire+0x435/0x500 [464.798] ? lock_acquire+0x435/0x500 [464.804] btrfs_do_write_iter+0x494/0x640 [btrfs] [464.811] ? __pfx_btrfs_do_write_iter+0x10/0x10 [btrfs] [464.819] ? __pfx___might_resched+0x10/0x10 [464.825] ? rw_verify_area+0x6d/0x590 [464.831] vfs_write+0x5d7/0xf50 [464.837] ? __might_fault+0x9d/0x120 [464.843] ? __pfx_vfs_write+0x10/0x10 [464.849] ? btrfs_file_llseek+0xb1/0xfb0 [btrfs] [464.856] ? lock_release+0x567/0x790 [464.862] ksys_write+0xfb/0x1d0 [464.867] ? __pfx_ksys_write+0x10/0x10 [464.873] ? _raw_spin_unlock+0x23/0x40 [464.879] ? btrfs_getattr+0x4af/0x670 [btrfs] [464.886] ? vfs_getattr_nosec+0x79/0x340 [464.892] do_syscall_64+0x95/0x180 [464.898] ? __do_sys_newfstat+0xde/0xf0 [464.904] ? __pfx___do_sys_newfstat+0x10/0x10 [464.911] ? trace_irq_enable.constprop.0+0xce/0x110 [464.918] ? syscall_exit_to_user_mode+0xac/0x2a0 [464.925] ? do_syscall_64+0xa1/0x180 [464.931] ? trace_irq_enable.constprop.0+0xce/0x110 [464.939] ? trace_irq_enable.constprop.0+0xce/0x110 [464.946] ? syscall_exit_to_user_mode+0xac/0x2a0 [464.953] ? btrfs_file_llseek+0xb1/0xfb0 [btrfs] [464.960] ? do_syscall_64+0xa1/0x180 [464.966] ? btrfs_file_llseek+0xb1/0xfb0 [btrfs] [464.973] ? trace_irq_enable.constprop.0+0xce/0x110 [464.980] ? syscall_exit_to_user_mode+0xac/0x2a0 [464.987] ? __pfx_btrfs_file_llseek+0x10/0x10 [btrfs] [464.995] ? trace_irq_enable.constprop.0+0xce/0x110 [465.002] ? __pfx_btrfs_file_llseek+0x10/0x10 [btrfs] [465.010] ? do_syscall_64+0xa1/0x180 [465.016] ? lock_release+0x567/0x790 [465.022] ? __pfx_lock_acquire+0x10/0x10 [465.028] ? __pfx_lock_release+0x10/0x10 [465.034] ? trace_irq_enable.constprop.0+0xce/0x110 [465.042] ? syscall_exit_to_user_mode+0xac/0x2a0 [465.049] ? do_syscall_64+0xa1/0x180 [465.055] ? syscall_exit_to_user_mode+0xac/0x2a0 [465.062] ? do_syscall_64+0xa1/0x180 [465.068] ? syscall_exit_to_user_mode+0xac/0x2a0 [465.075] ? do_syscall_64+0xa1/0x180 [465.081] ? clear_bhb_loop+0x25/0x80 [465.087] ? clear_bhb_loop+0x25/0x80 [465.093] ? clear_bhb_loop+0x25/0x80 [465.099] entry_SYSCALL_64_after_hwframe+0x76/0x7e [465.106] RIP: 0033:0x7f093b8ee784 [465.111] RSP: 002b:00007ffc29d31b28 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [465.122] RAX: ffffffffffffffda RBX: 0000000000006000 RCX: 00007f093b8ee784 [465.131] RDX: 000000000001de00 RSI: 00007f093b6ed200 RDI: 0000000000000003 [465.141] RBP: 000000000001de00 R08: 0000000000006000 R09: 0000000000000000 [465.150] R10: 0000000000023e00 R11: 0000000000000202 R12: 0000000000006000 [465.160] R13: 0000000000023e00 R14: 0000000000023e00 R15: 0000000000000001 [465.170] [465.174] INFO: lockdep is turned off. 
Reported-by: Shinichiro Kawasaki Fixes: 97713b1a2ced ("btrfs: do not clear page dirty inside extent_write_locked_range()") Reviewed-by: Qu Wenruo Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aa7f8148cd0d..c73cd4f89015 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1496,6 +1496,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, free_extent_map(em); em = NULL; + /* + * Although the PageDirty bit might be cleared before entering + * this function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * page for range already written to disk. + */ + btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); btrfs_set_range_writeback(inode, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(inode->root->fs_info, @@ -1503,13 +1510,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, page->index, cur, end); } - /* - * Although the PageDirty bit is cleared before entering this - * function, subpage dirty bit is not cleared. - * So clear subpage dirty bit here so next time we won't submit - * page for range already written to disk. - */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, cur - page_offset(page)); From df934abb185c71c9f2fa07a5013672d0cbd36560 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Fri, 9 Aug 2024 12:36:12 -0400 Subject: [PATCH 0818/1052] mlxbf_gige: disable RX filters until RX path initialized A recent change to the driver exposed a bug where the MAC RX filters (unicast MAC, broadcast MAC, and multicast MAC) are configured and enabled before the RX path is fully initialized. The result of this bug is that after the PHY is started packets that match these MAC RX filters start to flow into the RX FIFO. And then, after rx_init() is completed, these packets will go into the driver RX ring as well. If enough packets are received to fill the RX ring (default size is 128 packets) before the call to request_irq() completes, the driver RX function becomes stuck. This bug is intermittent but is most likely to be seen where the oob_net0 interface is connected to a busy network with lots of broadcast and multicast traffic. All the MAC RX filters must be disabled until the RX path is ready, i.e. all initialization is done and all the IRQs are installed. 
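In outline, the ordering the patch establishes (condensed from the diff below; error handling and unrelated setup omitted):

        /* mlxbf_gige_probe(): the datapath does not exist yet, keep every
         * MAC RX filter masked so nothing can land in the RX FIFO */
        for (i = 0; i <= MLXBF_GIGE_MAX_FILTER_IDX; i++)
                mlxbf_gige_disable_mac_rx_filter(priv, i);
        mlxbf_gige_disable_multicast_rx(priv);
        mlxbf_gige_disable_promisc(priv);

        /* mlxbf_gige_open(): RX rings, NAPI and request_irq() have all
         * completed, only now admit traffic */
        mlxbf_gige_enable_mac_rx_filter(priv, MLXBF_GIGE_BCAST_MAC_FILTER_IDX);
        mlxbf_gige_enable_mac_rx_filter(priv, MLXBF_GIGE_LOCAL_MAC_FILTER_IDX);
        mlxbf_gige_enable_multicast_rx(priv);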
Fixes: f7442a634ac0 ("mlxbf_gige: call request_irq() after NAPI initialized") Reviewed-by: Asmaa Mnebhi Signed-off-by: David Thompson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240809163612.12852-1-davthompson@nvidia.com Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlxbf_gige/mlxbf_gige.h | 8 +++ .../mellanox/mlxbf_gige/mlxbf_gige_main.c | 10 ++++ .../mellanox/mlxbf_gige/mlxbf_gige_regs.h | 2 + .../mellanox/mlxbf_gige/mlxbf_gige_rx.c | 50 ++++++++++++++++--- 4 files changed, 64 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h index bc94e75a7aeb..e7777700ee18 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h @@ -40,6 +40,7 @@ */ #define MLXBF_GIGE_BCAST_MAC_FILTER_IDX 0 #define MLXBF_GIGE_LOCAL_MAC_FILTER_IDX 1 +#define MLXBF_GIGE_MAX_FILTER_IDX 3 /* Define for broadcast MAC literal */ #define BCAST_MAC_ADDR 0xFFFFFFFFFFFF @@ -175,6 +176,13 @@ enum mlxbf_gige_res { int mlxbf_gige_mdio_probe(struct platform_device *pdev, struct mlxbf_gige *priv); void mlxbf_gige_mdio_remove(struct mlxbf_gige *priv); + +void mlxbf_gige_enable_multicast_rx(struct mlxbf_gige *priv); +void mlxbf_gige_disable_multicast_rx(struct mlxbf_gige *priv); +void mlxbf_gige_enable_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index); +void mlxbf_gige_disable_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index); void mlxbf_gige_set_mac_rx_filter(struct mlxbf_gige *priv, unsigned int index, u64 dmac); void mlxbf_gige_get_mac_rx_filter(struct mlxbf_gige *priv, diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index b157f0f1c5a8..385a56ac7348 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -168,6 +168,10 @@ static int mlxbf_gige_open(struct net_device *netdev) if (err) goto napi_deinit; + mlxbf_gige_enable_mac_rx_filter(priv, MLXBF_GIGE_BCAST_MAC_FILTER_IDX); + mlxbf_gige_enable_mac_rx_filter(priv, MLXBF_GIGE_LOCAL_MAC_FILTER_IDX); + mlxbf_gige_enable_multicast_rx(priv); + /* Set bits in INT_EN that we care about */ int_en = MLXBF_GIGE_INT_EN_HW_ACCESS_ERROR | MLXBF_GIGE_INT_EN_TX_CHECKSUM_INPUTS | @@ -379,6 +383,7 @@ static int mlxbf_gige_probe(struct platform_device *pdev) void __iomem *plu_base; void __iomem *base; int addr, phy_irq; + unsigned int i; int err; base = devm_platform_ioremap_resource(pdev, MLXBF_GIGE_RES_MAC); @@ -423,6 +428,11 @@ static int mlxbf_gige_probe(struct platform_device *pdev) priv->rx_q_entries = MLXBF_GIGE_DEFAULT_RXQ_SZ; priv->tx_q_entries = MLXBF_GIGE_DEFAULT_TXQ_SZ; + for (i = 0; i <= MLXBF_GIGE_MAX_FILTER_IDX; i++) + mlxbf_gige_disable_mac_rx_filter(priv, i); + mlxbf_gige_disable_multicast_rx(priv); + mlxbf_gige_disable_promisc(priv); + /* Write initial MAC address to hardware */ mlxbf_gige_initial_mac(priv); diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h index 98a8681c21b9..4d14cb13fd64 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h @@ -62,6 +62,8 @@ #define MLXBF_GIGE_TX_STATUS_DATA_FIFO_FULL BIT(1) #define MLXBF_GIGE_RX_MAC_FILTER_DMAC_RANGE_START 0x0520 #define MLXBF_GIGE_RX_MAC_FILTER_DMAC_RANGE_END 0x0528 +#define 
MLXBF_GIGE_RX_MAC_FILTER_GENERAL 0x0530 +#define MLXBF_GIGE_RX_MAC_FILTER_EN_MULTICAST BIT(1) #define MLXBF_GIGE_RX_MAC_FILTER_COUNT_DISC 0x0540 #define MLXBF_GIGE_RX_MAC_FILTER_COUNT_DISC_EN BIT(0) #define MLXBF_GIGE_RX_MAC_FILTER_COUNT_PASS 0x0548 diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c index 699984358493..eb62620b63c7 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c @@ -11,22 +11,60 @@ #include "mlxbf_gige.h" #include "mlxbf_gige_regs.h" -void mlxbf_gige_set_mac_rx_filter(struct mlxbf_gige *priv, - unsigned int index, u64 dmac) +void mlxbf_gige_enable_multicast_rx(struct mlxbf_gige *priv) +{ + void __iomem *base = priv->base; + u64 data; + + data = readq(base + MLXBF_GIGE_RX_MAC_FILTER_GENERAL); + data |= MLXBF_GIGE_RX_MAC_FILTER_EN_MULTICAST; + writeq(data, base + MLXBF_GIGE_RX_MAC_FILTER_GENERAL); +} + +void mlxbf_gige_disable_multicast_rx(struct mlxbf_gige *priv) +{ + void __iomem *base = priv->base; + u64 data; + + data = readq(base + MLXBF_GIGE_RX_MAC_FILTER_GENERAL); + data &= ~MLXBF_GIGE_RX_MAC_FILTER_EN_MULTICAST; + writeq(data, base + MLXBF_GIGE_RX_MAC_FILTER_GENERAL); +} + +void mlxbf_gige_enable_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index) { void __iomem *base = priv->base; u64 control; - /* Write destination MAC to specified MAC RX filter */ - writeq(dmac, base + MLXBF_GIGE_RX_MAC_FILTER + - (index * MLXBF_GIGE_RX_MAC_FILTER_STRIDE)); - /* Enable MAC receive filter mask for specified index */ control = readq(base + MLXBF_GIGE_CONTROL); control |= (MLXBF_GIGE_CONTROL_EN_SPECIFIC_MAC << index); writeq(control, base + MLXBF_GIGE_CONTROL); } +void mlxbf_gige_disable_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index) +{ + void __iomem *base = priv->base; + u64 control; + + /* Disable MAC receive filter mask for specified index */ + control = readq(base + MLXBF_GIGE_CONTROL); + control &= ~(MLXBF_GIGE_CONTROL_EN_SPECIFIC_MAC << index); + writeq(control, base + MLXBF_GIGE_CONTROL); +} + +void mlxbf_gige_set_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index, u64 dmac) +{ + void __iomem *base = priv->base; + + /* Write destination MAC to specified MAC RX filter */ + writeq(dmac, base + MLXBF_GIGE_RX_MAC_FILTER + + (index * MLXBF_GIGE_RX_MAC_FILTER_STRIDE)); +} + void mlxbf_gige_get_mac_rx_filter(struct mlxbf_gige *priv, unsigned int index, u64 *dmac) { From 15e1c3d65975524c5c792fcd59f7d89f00402261 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Fri, 2 Aug 2024 13:16:30 -0700 Subject: [PATCH 0819/1052] KVM: x86: Use this_cpu_ptr() instead of per_cpu_ptr(smp_processor_id()) Use this_cpu_ptr() instead of open coding the equivalent in various user return MSR helpers. 
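Both spellings resolve to the same per-CPU slot when executed on the local CPU; this_cpu_ptr() is simply the idiomatic form and drops the explicit smp_processor_id() lookup. A minimal kernel-context sketch (illustrative variable; the KVM code itself uses a percpu pointer, hence per_cpu_ptr(user_return_msrs, cpu) without '&' in the diff, but the equivalence is the same):

        DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs);

        /* open-coded */
        struct kvm_user_return_msrs *a =
                per_cpu_ptr(&user_return_msrs, smp_processor_id());

        /* equivalent and preferred */
        struct kvm_user_return_msrs *b = this_cpu_ptr(&user_return_msrs);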
Signed-off-by: Isaku Yamahata Reviewed-by: Chao Gao Reviewed-by: Yuan Yao [sean: massage changelog] Signed-off-by: Sean Christopherson Reviewed-by: Pankaj Gupta Message-ID: <20240802201630.339306-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ef3d3511e4af..70219e406987 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -427,8 +427,7 @@ static void kvm_user_return_msr_cpu_online(void) int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); @@ -450,8 +449,7 @@ EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); if (msrs->registered) kvm_on_user_return(&msrs->urn); From 86cfa9a85fb04fa61e7c6b5a8ecf812437cdad78 Mon Sep 17 00:00:00 2001 From: Daniel Yang Date: Wed, 7 Aug 2024 02:01:21 -0700 Subject: [PATCH 0820/1052] Documentation: dm-crypt.rst warning + error fix While building kernel documention using make htmldocs command, I was getting unexpected indentation error. Single description was given for two module parameters with wrong indentation. So, I corrected the indentation of both parameters and the description. Signed-off-by: Shibu kumar Signed-off-by: Daniel Yang Signed-off-by: Mikulas Patocka Fixes: 0d815e3400e6 ("dm-crypt: limit the size of encryption requests") --- .../admin-guide/device-mapper/dm-crypt.rst | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst index e625830d335e..552c9155165d 100644 --- a/Documentation/admin-guide/device-mapper/dm-crypt.rst +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -162,13 +162,14 @@ iv_large_sectors Module parameters:: -max_read_size -max_write_size - Maximum size of read or write requests. When a request larger than this size - is received, dm-crypt will split the request. The splitting improves - concurrency (the split requests could be encrypted in parallel by multiple - cores), but it also causes overhead. The user should tune these parameters to - fit the actual workload. + + max_read_size + max_write_size + Maximum size of read or write requests. When a request larger than this size + is received, dm-crypt will split the request. The splitting improves + concurrency (the split requests could be encrypted in parallel by multiple + cores), but it also causes overhead. The user should tune these parameters to + fit the actual workload. Example scripts From 4b7c3f6d04bd53f2e5b228b6821fb8f5d1ba3071 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:29:40 -0700 Subject: [PATCH 0821/1052] KVM: x86: Make x2APIC ID 100% readonly Ignore the userspace provided x2APIC ID when fixing up APIC state for KVM_SET_LAPIC, i.e. make the x2APIC fully readonly in KVM. Commit a92e2543d6a8 ("KVM: x86: use hardware-compatible format for APIC ID register"), which added the fixup, didn't intend to allow userspace to modify the x2APIC ID. 
In fact, that commit is when KVM first started treating the x2APIC ID as readonly, apparently to fix some race: static inline u32 kvm_apic_id(struct kvm_lapic *apic) { - return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff; + /* To avoid a race between apic_base and following APIC_ID update when + * switching to x2apic_mode, the x2apic mode returns initial x2apic id. + */ + if (apic_x2apic_mode(apic)) + return apic->vcpu->vcpu_id; + + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; } Furthermore, KVM doesn't support delivering interrupts to vCPUs with a modified x2APIC ID, but KVM *does* return the modified value on a guest RDMSR and for KVM_GET_LAPIC. I.e. no remotely sane setup can actually work with a modified x2APIC ID. Making the x2APIC ID fully readonly fixes a WARN in KVM's optimized map calculation, which expects the LDR to align with the x2APIC ID. WARNING: CPU: 2 PID: 958 at arch/x86/kvm/lapic.c:331 kvm_recalculate_apic_map+0x609/0xa00 [kvm] CPU: 2 PID: 958 Comm: recalc_apic_map Not tainted 6.4.0-rc3-vanilla+ #35 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.2-1-1 04/01/2014 RIP: 0010:kvm_recalculate_apic_map+0x609/0xa00 [kvm] Call Trace: kvm_apic_set_state+0x1cf/0x5b0 [kvm] kvm_arch_vcpu_ioctl+0x1806/0x2100 [kvm] kvm_vcpu_ioctl+0x663/0x8a0 [kvm] __x64_sys_ioctl+0xb8/0xf0 do_syscall_64+0x56/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fade8b9dd6f Unfortunately, the WARN can still trigger for other CPUs than the current one by racing against KVM_SET_LAPIC, so remove it completely. Reported-by: Michal Luczaj Closes: https://lore.kernel.org/all/814baa0c-1eaa-4503-129f-059917365e80@rbox.co Reported-by: Haoyu Wu Closes: https://lore.kernel.org/all/20240126161633.62529-1-haoyuwu254@gmail.com Reported-by: syzbot+545f1326f405db4e1c3e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000c2a6b9061cbca3c3@google.com Signed-off-by: Sean Christopherson Message-ID: <20240802202941.344889-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4915acdbfcd8..5bb481aefcbc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -351,10 +351,8 @@ static void kvm_recalculate_logical_map(struct kvm_apic_map *new, * reversing the LDR calculation to get cluster of APICs, i.e. no * additional work is required. */ - if (apic_x2apic_mode(apic)) { - WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic))); + if (apic_x2apic_mode(apic)) return; - } if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))) { @@ -2966,18 +2964,28 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s, bool set) { if (apic_x2apic_mode(vcpu->arch.apic)) { + u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic); u32 *id = (u32 *)(s->regs + APIC_ID); u32 *ldr = (u32 *)(s->regs + APIC_LDR); u64 icr; if (vcpu->kvm->arch.x2apic_format) { - if (*id != vcpu->vcpu_id) + if (*id != x2apic_id) return -EINVAL; } else { + /* + * Ignore the userspace value when setting APIC state. + * KVM's model is that the x2APIC ID is readonly, e.g. + * KVM only supports delivering interrupts to KVM's + * version of the x2APIC ID. However, for backwards + * compatibility, don't reject attempts to set a + * mismatched ID for userspace that hasn't opted into + * x2apic_format. 
+ */ if (set) - *id >>= 24; + *id = x2apic_id; else - *id <<= 24; + *id = x2apic_id << 24; } /* @@ -2986,7 +2994,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, * split to ICR+ICR2 in userspace for backwards compatibility. */ if (set) { - *ldr = kvm_apic_calc_x2apic_ldr(*id); + *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; From 238d3d63d1e27c8d9733b48f7b682fc6aba86672 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Fri, 2 Aug 2024 13:29:41 -0700 Subject: [PATCH 0822/1052] KVM: selftests: Add a testcase to verify x2APIC is fully readonly Add a test to verify that userspace can't change a vCPU's x2APIC ID by abusing KVM_SET_LAPIC. KVM models the x2APIC ID (and x2APIC LDR) as readonly, and silently ignores userspace attempts to change the x2APIC ID for backwards compatibility. Signed-off-by: Michal Luczaj [sean: write changelog, add to existing test] Signed-off-by: Sean Christopherson Message-ID: <20240802202941.344889-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/xapic_state_test.c | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 69849acd95b0..618cd2442390 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -184,6 +184,33 @@ static void test_apic_id(void) kvm_vm_free(vm); } +static void test_x2apic_id(void) +{ + struct kvm_lapic_state lapic = {}; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + vcpu_set_msr(vcpu, MSR_IA32_APICBASE, MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); + + /* + * Try stuffing a modified x2APIC ID, KVM should ignore the value and + * always return the vCPU's default/readonly x2APIC ID. + */ + for (i = 0; i <= 0xff; i++) { + *(u32 *)(lapic.regs + APIC_ID) = i << 24; + *(u32 *)(lapic.regs + APIC_SPIV) = APIC_SPIV_APIC_ENABLED; + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic); + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &lapic); + TEST_ASSERT(*((u32 *)&lapic.regs[APIC_ID]) == vcpu->id << 24, + "x2APIC ID should be fully readonly"); + } + + kvm_vm_free(vm); +} + int main(int argc, char *argv[]) { struct xapic_vcpu x = { @@ -211,4 +238,5 @@ int main(int argc, char *argv[]) kvm_vm_free(vm); test_apic_id(); + test_x2apic_id(); } From c9b35a6f4edea698a5bb4dd8029e7104ee0a3726 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 11 Jul 2024 20:11:30 +0800 Subject: [PATCH 0823/1052] KVM: eventfd: Use synchronize_srcu_expedited() on shutdown When hot-unplug a device which has many queues, and guest CPU will has huge jitter, and unplugging is very slow. It turns out synchronize_srcu() in irqfd_shutdown() caused the guest jitter and unplugging latency, so replace synchronize_srcu() with synchronize_srcu_expedited(), to accelerate the unplugging, and reduce the guest OS jitter, this accelerates the VM reboot too. Signed-off-by: Li RongQing Message-ID: <20240711121130.38917-1-lirongqing@baidu.com> [Call it just once in irqfd_resampler_shutdown. 
- Paolo] Signed-off-by: Paolo Bonzini --- virt/kvm/eventfd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 229570059a1b..992f9beb3e7d 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -97,18 +97,19 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd) mutex_lock(&kvm->irqfds.resampler_lock); list_del_rcu(&irqfd->resampler_link); - synchronize_srcu(&kvm->irq_srcu); if (list_empty(&resampler->list)) { list_del_rcu(&resampler->link); kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); /* - * synchronize_srcu(&kvm->irq_srcu) already called + * synchronize_srcu_expedited(&kvm->irq_srcu) already called * in kvm_unregister_irq_ack_notifier(). */ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, resampler->notifier.gsi, 0, false); kfree(resampler); + } else { + synchronize_srcu_expedited(&kvm->irq_srcu); } mutex_unlock(&kvm->irqfds.resampler_lock); @@ -126,7 +127,7 @@ irqfd_shutdown(struct work_struct *work) u64 cnt; /* Make sure irqfd has been initialized in assign path. */ - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); /* * Synchronize with the wait-queue and unhook ourselves to prevent @@ -384,7 +385,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) } list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); mutex_unlock(&kvm->irqfds.resampler_lock); } @@ -523,7 +524,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); kvm_arch_post_irq_ack_notifier_list_update(kvm); } @@ -608,7 +609,7 @@ kvm_irqfd_release(struct kvm *kvm) /* * Take note of a change in irq routing. - * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. + * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards. */ void kvm_irq_routing_update(struct kvm *kvm) { From 11752c013f562a1124088a35bd314aa0e9f0e88f Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Thu, 18 Jul 2024 16:38:50 +0800 Subject: [PATCH 0824/1052] drm/amdgpu/mes: fix mes ring buffer overflow wait memory room until enough before writing mes packets to avoid ring buffer overflow. 
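Condensed from the diff below, the flow control bounds the number of packets in flight: before emitting a packet with sequence number seq, the driver polls until the fence (seq - num_fences_mask) has signalled, so the new write can never wrap over entries the MES firmware has not consumed yet (kernel context, error path abbreviated):

        seq = ++ring->fence_drv.sync_seq;
        r = amdgpu_fence_wait_polling(ring,
                                      seq - ring->fence_drv.num_fences_mask,
                                      timeout);
        if (r < 1) {
                /* no room: report "MES ring buffer is full" and undo the
                 * space reserved for this packet */
                amdgpu_ring_undo(ring);
                goto error;
        }
        /* safe to write the packet; its completion fence carries value seq */

Forcing sched_hw_submission to 8 for MES rings presumably sizes that fence window to the small MES ring buffer.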
v2: squash in sched_hw_submission fix Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command submission") Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission") Signed-off-by: Jack Xiao Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 34e087e8920e635c62e2ed6a758b0cd27f836d13) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 ++ drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++---- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++---- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index ad49cecb20b8..e6344a6b0a9f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -212,6 +212,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, */ if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ) sched_hw_submission = max(sched_hw_submission, 256); + if (ring->funcs->type == AMDGPU_RING_TYPE_MES) + sched_hw_submission = 8; else if (ring == &adev->sdma.instance[0].page) sched_hw_submission = 256; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index f9343642ae7e..1a5ad5be33bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, const char *op_str, *misc_op_str; unsigned long flags; u64 status_gpu_addr; - u32 status_offset; + u32 seq, status_offset; u64 *status_ptr; signed long r; int ret; @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, if (r) goto error_unlock_free; + seq = ++ring->fence_drv.sync_seq; + r = amdgpu_fence_wait_polling(ring, + seq - ring->fence_drv.num_fences_mask, + timeout); + if (r < 1) + goto error_undo; + api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off); api_status->api_completion_fence_addr = status_gpu_addr; api_status->api_completion_fence_value = 1; @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; mes_status_pkt.api_status.api_completion_fence_addr = ring->fence_drv.gpu_addr; - mes_status_pkt.api_status.api_completion_fence_value = - ++ring->fence_drv.sync_seq; + mes_status_pkt.api_status.api_completion_fence_value = seq; amdgpu_ring_write_multiple(ring, &mes_status_pkt, sizeof(mes_status_pkt) / 4); @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode); - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout); + r = amdgpu_fence_wait_polling(ring, seq, timeout); if (r < 1 || !*status_ptr) { if (misc_op_str) @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_device_wb_free(adev, status_offset); return 0; +error_undo: + dev_err(adev->dev, "MES ring buffer is full.\n"); + amdgpu_ring_undo(ring); + error_unlock_free: spin_unlock_irqrestore(&mes->ring_lock, flags); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 0713bc3eb263..249e5a66205c 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, const char *op_str, 
*misc_op_str; unsigned long flags; u64 status_gpu_addr; - u32 status_offset; + u32 seq, status_offset; u64 *status_ptr; signed long r; int ret; @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, if (r) goto error_unlock_free; + seq = ++ring->fence_drv.sync_seq; + r = amdgpu_fence_wait_polling(ring, + seq - ring->fence_drv.num_fences_mask, + timeout); + if (r < 1) + goto error_undo; + api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off); api_status->api_completion_fence_addr = status_gpu_addr; api_status->api_completion_fence_value = 1; @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; mes_status_pkt.api_status.api_completion_fence_addr = ring->fence_drv.gpu_addr; - mes_status_pkt.api_status.api_completion_fence_value = - ++ring->fence_drv.sync_seq; + mes_status_pkt.api_status.api_completion_fence_value = seq; amdgpu_ring_write_multiple(ring, &mes_status_pkt, sizeof(mes_status_pkt) / 4); @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode); - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout); + r = amdgpu_fence_wait_polling(ring, seq, timeout); if (r < 1 || !*status_ptr) { if (misc_op_str) @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_device_wb_free(adev, status_offset); return 0; +error_undo: + dev_err(adev->dev, "MES ring buffer is full.\n"); + amdgpu_ring_undo(ring); + error_unlock_free: spin_unlock_irqrestore(&mes->ring_lock, flags); From f6098641d3e1e4d4052ff9378857c831f9675f6b Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Tue, 6 Aug 2024 09:55:55 -0400 Subject: [PATCH 0825/1052] drm/amd/display: fix s2idle entry for DCN3.5+ To be able to get to the lowest power state when suspending systems with DCN3.5+, we must be in IPS before the display hardware is put into D3cold. So, to ensure that the system always reaches the lowest power state while suspending, force systems that support IPS to enter idle optimizations before entering D3cold. Reviewed-by: Roman Li Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher (cherry picked from commit 237193e21b29d4aa0617ffeea3d6f49e72999708) Cc: stable@vger.kernel.org # 6.10+ --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7e7929f24ae4..983a977632ff 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2893,6 +2893,9 @@ static int dm_suspend(void *handle) hpd_rx_irq_work_suspend(dm); + if (adev->dm.dc->caps.ips_support) + dc_allow_idle_optimizations(adev->dm.dc, true); + dc_set_power_state(dm->dc, DC_ACPI_CM_POWER_STATE_D3); dc_dmub_srv_set_power_state(dm->dc->ctx->dmub_srv, DC_ACPI_CM_POWER_STATE_D3); From 0dbb81d44108a2a1004e5b485ef3fca5bc078424 Mon Sep 17 00:00:00 2001 From: Loan Chen Date: Fri, 2 Aug 2024 13:57:40 +0800 Subject: [PATCH 0826/1052] drm/amd/display: Enable otg synchronization logic for DCN321 [Why] Tiled display cannot synchronize properly after S3. The fix for commit 5f0c74915815 ("drm/amd/display: Fix for otg synchronization logic") is not enable in DCN321, which causes the otg is excluded from synchronization. 
[How] Enable otg synchronization logic in dcn321. Fixes: 5f0c74915815 ("drm/amd/display: Fix for otg synchronization logic") Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Alvin Lee Signed-off-by: Loan Chen Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit d6ed53712f583423db61fbb802606759e023bf7b) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c index 9a3cc0514a36..8e0588b1cf30 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c @@ -1778,6 +1778,9 @@ static bool dcn321_resource_construct( dc->caps.color.mpc.ogam_rom_caps.hlg = 0; dc->caps.color.mpc.ocsc = 1; + /* Use pipe context based otg sync logic */ + dc->config.use_pipe_ctx_sync_logic = true; + dc->config.dc_mode_clk_limit_support = true; dc->config.enable_windowed_mpo_odm = true; /* read VBIOS LTTPR caps */ From 338567d17627064dba63cf063459605e782f71d2 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Mon, 29 Jul 2024 10:23:03 -0400 Subject: [PATCH 0827/1052] drm/amd/display: Fix MST BW calculation Regression [Why & How] Revert commit 8b2cb32cf0c6 ("drm/amd/display: FEC overhead should be checked once for mst slot nums") Because causes bw calculation regression Cc: mario.limonciello@amd.com Cc: alexander.deucher@amd.com Reported-by: jirislaby@kernel.org Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3495 Closes: https://bugzilla.suse.com/show_bug.cgi?id=1228093 Reviewed-by: Wayne Lin Signed-off-by: Fangzhi Zuo Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 12dbb3ed212fc7655fce421542a5add637f8af7a) Cc: stable@vger.kernel.org --- .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 33 ++++++++++++++----- .../display/amdgpu_dm/amdgpu_dm_mst_types.h | 3 ++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 915eb2c08ece..2e9f6da1acdc 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -804,12 +804,25 @@ struct dsc_mst_fairness_params { }; #if defined(CONFIG_DRM_AMD_DC_FP) -static int kbps_to_peak_pbn(int kbps) +static uint16_t get_fec_overhead_multiplier(struct dc_link *dc_link) +{ + u8 link_coding_cap; + uint16_t fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B; + + link_coding_cap = dc_link_dp_mst_decide_link_encoding_format(dc_link); + if (link_coding_cap == DP_128b_132b_ENCODING) + fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B; + + return fec_overhead_multiplier_x1000; +} + +static int kbps_to_peak_pbn(int kbps, uint16_t fec_overhead_multiplier_x1000) { u64 peak_kbps = kbps; peak_kbps *= 1006; - peak_kbps = div_u64(peak_kbps, 1000); + peak_kbps *= fec_overhead_multiplier_x1000; + peak_kbps = div_u64(peak_kbps, 1000 * 1000); return (int) DIV64_U64_ROUND_UP(peak_kbps * 64, (54 * 8 * 1000)); } @@ -910,11 +923,12 @@ static int increase_dsc_bpp(struct drm_atomic_state *state, int link_timeslots_used; int fair_pbn_alloc; int ret = 0; + uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); for (i = 0; i < count; 
i++) { if (vars[i + k].dsc_enabled) { initial_slack[i] = - kbps_to_peak_pbn(params[i].bw_range.max_kbps) - vars[i + k].pbn; + kbps_to_peak_pbn(params[i].bw_range.max_kbps, fec_overhead_multiplier_x1000) - vars[i + k].pbn; bpp_increased[i] = false; remaining_to_increase += 1; } else { @@ -1010,6 +1024,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, int next_index; int remaining_to_try = 0; int ret; + uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); for (i = 0; i < count; i++) { if (vars[i + k].dsc_enabled @@ -1039,7 +1054,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, if (next_index == -1) break; - vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps); + vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps, fec_overhead_multiplier_x1000); ret = drm_dp_atomic_find_time_slots(state, params[next_index].port->mgr, params[next_index].port, @@ -1052,8 +1067,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, vars[next_index].dsc_enabled = false; vars[next_index].bpp_x16 = 0; } else { - vars[next_index].pbn = kbps_to_peak_pbn( - params[next_index].bw_range.max_kbps); + vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps, fec_overhead_multiplier_x1000); ret = drm_dp_atomic_find_time_slots(state, params[next_index].port->mgr, params[next_index].port, @@ -1082,6 +1096,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, int count = 0; int i, k, ret; bool debugfs_overwrite = false; + uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); memset(params, 0, sizeof(params)); @@ -1146,7 +1161,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, /* Try no compression */ for (i = 0; i < count; i++) { vars[i + k].aconnector = params[i].aconnector; - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps); + vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, params[i].port, @@ -1165,7 +1180,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, /* Try max compression */ for (i = 0; i < count; i++) { if (params[i].compression_possible && params[i].clock_force_enable != DSC_CLK_FORCE_DISABLE) { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps); + vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps, fec_overhead_multiplier_x1000); vars[i + k].dsc_enabled = true; vars[i + k].bpp_x16 = params[i].bw_range.min_target_bpp_x16; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, @@ -1173,7 +1188,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, if (ret < 0) return ret; } else { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps); + vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h index fa84d34b7373..600d6e221011 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h @@ -46,6 +46,9 @@ #define 
SYNAPTICS_CASCADED_HUB_ID 0x5A #define IS_SYNAPTICS_CASCADED_PANAMERA(devName, data) ((IS_SYNAPTICS_PANAMERA(devName) && ((int)data[2] == SYNAPTICS_CASCADED_HUB_ID)) ? 1 : 0) +#define PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B 1031 +#define PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B 1000 + enum mst_msg_ready_type { NONE_MSG_RDY_EVENT = 0, DOWN_REP_MSG_RDY_EVENT = 1, From 737222cebecbdbcdde2b69475c52bcb9ecfeb830 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Tue, 31 Jan 2023 15:05:46 -0100 Subject: [PATCH 0828/1052] drm/amd/display: fix cursor offset on rotation 180 [why & how] Cursor gets clipped off in the middle of the screen with hw rotation 180. Fix a miscalculation of cursor offset when it's placed near the edges in the pipe split case. Cursor bugs with hw rotation were reported on AMD issue tracker: https://gitlab.freedesktop.org/drm/amd/-/issues/2247 The issues on rotation 270 was fixed by: https://lore.kernel.org/amd-gfx/20221118125935.4013669-22-Brian.Chang@amd.com/ that partially addressed the rotation 180 too. So, this patch is the final bits for rotation 180. Reported-by: Xaver Hugl Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2247 Reviewed-by: Harry Wentland Fixes: 9d84c7ef8a87 ("drm/amd/display: Correct cursor position on horizontal mirror") Signed-off-by: Melissa Wen Signed-off-by: Hamza Mahfooz Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 1fd2cf090096af8a25bf85564341cfc21cec659d) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c index ff03b1d98aa7..1b9ac8812f5b 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c @@ -3589,7 +3589,7 @@ void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) (int)hubp->curs_attr.width || pos_cpy.x <= (int)hubp->curs_attr.width + pipe_ctx->plane_state->src_rect.x) { - pos_cpy.x = temp_x + viewport_width; + pos_cpy.x = 2 * viewport_width - temp_x; } } } else { From 56fb276d0244d430496f249335a44ae114dd5f54 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 1 Aug 2024 16:16:35 -0600 Subject: [PATCH 0829/1052] drm/amd/display: Adjust cursor position [why & how] When the commit 9d84c7ef8a87 ("drm/amd/display: Correct cursor position on horizontal mirror") was introduced, it used the wrong calculation for the position copy for X. This commit uses the correct calculation for that based on the original patch. 
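Purely as a numeric illustration of how far apart the two expressions land (the surrounding coordinate handling lives in dcn10_set_cursor_position(); the values here are made up):

        #include <stdio.h>

        int main(void)
        {
                int viewport_width = 1920, temp_x = 100;

                /* expression restored by this patch */
                printf("temp_x + viewport_width     = %d\n",
                       temp_x + viewport_width);        /* 2020 */
                /* expression introduced by the previous fix, now reverted */
                printf("2 * viewport_width - temp_x = %d\n",
                       2 * viewport_width - temp_x);    /* 3740 */
                return 0;
        }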
Fixes: 9d84c7ef8a87 ("drm/amd/display: Correct cursor position on horizontal mirror") Cc: Mario Limonciello Cc: Alex Deucher Acked-by: Wayne Lin Signed-off-by: Rodrigo Siqueira Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 8f9b23abbae5ffcd64856facd26a86b67195bc2f) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c index 1b9ac8812f5b..14a902ff3b8a 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c @@ -3682,7 +3682,7 @@ void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) (int)hubp->curs_attr.width || pos_cpy.x <= (int)hubp->curs_attr.width + pipe_ctx->plane_state->src_rect.x) { - pos_cpy.x = 2 * viewport_width - temp_x; + pos_cpy.x = temp_x + viewport_width; } } } else { From e414a304f2c5368a84f03ad34d29b89f965a33c9 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 12 Jul 2024 10:00:33 -0400 Subject: [PATCH 0830/1052] drm/amdgpu/jpeg2: properly set atomics vmid field This needs to be set as well if the IB uses atomics. Reviewed-by: Leo Liu Signed-off-by: Alex Deucher (cherry picked from commit 35c628774e50b3784c59e8ca7973f03bcb067132) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c index 99adf3625657..98aa3ccd0d20 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c @@ -538,11 +538,11 @@ void jpeg_v2_0_dec_ring_emit_ib(struct amdgpu_ring *ring, amdgpu_ring_write(ring, PACKETJ(mmUVD_LMI_JRBC_IB_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4))); + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(mmUVD_LMI_JPEG_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4))); + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(mmUVD_LMI_JRBC_IB_64BIT_BAR_LOW_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); From e6c6bd6253e792cee6c5c065e106e87b9f0d9ae9 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 12 Jul 2024 10:06:05 -0400 Subject: [PATCH 0831/1052] drm/amdgpu/jpeg4: properly set atomics vmid field This needs to be set as well if the IB uses atomics. 
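In both the JPEG 2.0 and JPEG 4.0.3 rings the VMID register value now mirrors the same 4-bit VMID into a third nibble; reading bits [11:8] as the atomics VMID follows from the patch titles and is an inference rather than something spelled out in the register headers. A tiny standalone illustration of the packed value:

        #include <stdio.h>

        /* before: IB vmid in bits [3:0], JPEG vmid in bits [7:4] */
        static unsigned int pack_old(unsigned int vmid)
        {
                return vmid | (vmid << 4);
        }

        /* after: additionally mirror the vmid into bits [11:8] for atomics */
        static unsigned int pack_new(unsigned int vmid)
        {
                return vmid | (vmid << 4) | (vmid << 8);
        }

        int main(void)
        {
                printf("vmid 5: old 0x%03x, new 0x%03x\n",
                       pack_old(5), pack_new(5)); /* old 0x055, new 0x555 */
                return 0;
        }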
Reviewed-by: Leo Liu Signed-off-by: Alex Deucher (cherry picked from commit c6c2e8b6a427d4fecc7c36cffccb908185afcab2) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index ad524ddc9760..f4662920c653 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -782,11 +782,11 @@ void jpeg_v4_0_3_dec_ring_emit_ib(struct amdgpu_ring *ring, amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JRBC_IB_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4))); + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JPEG_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4))); + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JRBC_IB_64BIT_BAR_LOW_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); From 0573a1e2ea7e35bff08944a40f1adf2bb35cea61 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Tue, 6 Aug 2024 22:27:32 +0200 Subject: [PATCH 0832/1052] drm/amdgpu: Actually check flags for all context ops. Missing validation ... Checked libdrm and it clears all the structs, so we should be safe to just check everything. Signed-off-by: Bas Nieuwenhuizen Signed-off-by: Alex Deucher (cherry picked from commit c6b86421f1f9ddf9d706f2453159813ee39d0cf9) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 5cb33ac99f70..c43d1b6e5d66 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -685,16 +685,24 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data, switch (args->in.op) { case AMDGPU_CTX_OP_ALLOC_CTX: + if (args->in.flags) + return -EINVAL; r = amdgpu_ctx_alloc(adev, fpriv, filp, priority, &id); args->out.alloc.ctx_id = id; break; case AMDGPU_CTX_OP_FREE_CTX: + if (args->in.flags) + return -EINVAL; r = amdgpu_ctx_free(fpriv, id); break; case AMDGPU_CTX_OP_QUERY_STATE: + if (args->in.flags) + return -EINVAL; r = amdgpu_ctx_query(adev, fpriv, id, &args->out); break; case AMDGPU_CTX_OP_QUERY_STATE2: + if (args->in.flags) + return -EINVAL; r = amdgpu_ctx_query2(adev, fpriv, id, &args->out); break; case AMDGPU_CTX_OP_GET_STABLE_PSTATE: From 278e1865b7a2124ea783b75ea8b3ee0bc2da5d85 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 11:43:45 +0800 Subject: [PATCH 0833/1052] drm/amdgpu/mes12: update mes_v12_api_def.h Update mes12 api definition. 
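For illustration only: the flags in the hunk below overlay a single 32-bit word (uint32_all), so the four new 1-bit fields have to come out of the reserved field, 15 - 4 = 11. A compile-time sketch of that invariant, with a placeholder 15-bit field standing in for the bits that precede the hunk:

#include <stdint.h>

struct mes12_hw_res_flags_sketch {
	uint32_t bits_before_this_hunk : 15;	/* placeholder, not a real field */
	uint32_t halt_on_misaligned_access : 1;
	uint32_t use_add_queue_unmap_flag_addr : 1;
	uint32_t enable_mes_sch_stb_log : 1;
	uint32_t limit_single_process : 1;
	uint32_t unmapped_doorbell_handling : 2;
	uint32_t reserved : 11;
};
_Static_assert(sizeof(struct mes12_hw_res_flags_sketch) == sizeof(uint32_t),
	       "new flags must still pack into one 32-bit word");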
Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 2ab5dc59177419d8a49e89585e82ff41524270fc) --- drivers/gpu/drm/amd/include/mes_v12_api_def.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/include/mes_v12_api_def.h b/drivers/gpu/drm/amd/include/mes_v12_api_def.h index 4cf2c9f30b3d..101e2fe962c6 100644 --- a/drivers/gpu/drm/amd/include/mes_v12_api_def.h +++ b/drivers/gpu/drm/amd/include/mes_v12_api_def.h @@ -97,6 +97,7 @@ enum MES_QUEUE_TYPE { MES_QUEUE_TYPE_SDMA, MES_QUEUE_TYPE_MAX, + MES_QUEUE_TYPE_SCHQ = MES_QUEUE_TYPE_MAX, }; struct MES_API_STATUS { @@ -242,8 +243,12 @@ union MESAPI_SET_HW_RESOURCES { uint32_t send_write_data : 1; uint32_t os_tdr_timeout_override : 1; uint32_t use_rs64mem_for_proc_gang_ctx : 1; + uint32_t halt_on_misaligned_access : 1; + uint32_t use_add_queue_unmap_flag_addr : 1; + uint32_t enable_mes_sch_stb_log : 1; + uint32_t limit_single_process : 1; uint32_t unmapped_doorbell_handling: 2; - uint32_t reserved : 15; + uint32_t reserved : 11; }; uint32_t uint32_all; }; From 2029b3d7e1358bcca30f74978543ba35b4bbc43d Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 11:53:35 +0800 Subject: [PATCH 0834/1052] drm/amdgpu/mes: add multiple mes ring instances support Add multiple mes ring instances in mes structure to support multiple mes pipes. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit c7d4355648ffa02a1551495b05c71ea6c884d29c) --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 5 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 4 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 34 ++++++++++++------------ drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 34 ++++++++++++------------ 9 files changed, 47 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 82452606ae6c..f165b9d49e29 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -995,7 +995,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg, uint32_t xcc_ if (amdgpu_device_skip_hw_access(adev)) return 0; - if (adev->mes.ring.sched.ready) + if (adev->mes.ring[0].sched.ready) return amdgpu_mes_rreg(adev, reg); BUG_ON(!ring->funcs->emit_rreg); @@ -1065,7 +1065,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint3 if (amdgpu_device_skip_hw_access(adev)) return; - if (adev->mes.ring.sched.ready) { + if (adev->mes.ring[0].sched.ready) { amdgpu_mes_wreg(adev, reg, v); return; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index c02659025656..b49b3650fd62 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -589,7 +589,8 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev) ring = adev->rings[i]; vmhub = ring->vm_hub; - if (ring == &adev->mes.ring || + if (ring == &adev->mes.ring[0] || + ring == &adev->mes.ring[1] || ring == &adev->umsch_mm.ring) continue; @@ -761,7 +762,7 @@ void amdgpu_gmc_fw_reg_write_reg_wait(struct amdgpu_device *adev, unsigned long flags; uint32_t seq; - if (adev->mes.ring.sched.ready) { 
+ if (adev->mes.ring[0].sched.ready) { amdgpu_mes_reg_write_reg_wait(adev, reg0, reg1, ref, mask); return; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index dac88d2dd70d..8ef53f48ce15 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -135,9 +135,11 @@ int amdgpu_mes_init(struct amdgpu_device *adev) idr_init(&adev->mes.queue_id_idr); ida_init(&adev->mes.doorbell_ida); spin_lock_init(&adev->mes.queue_id_lock); - spin_lock_init(&adev->mes.ring_lock); mutex_init(&adev->mes.mutex_hidden); + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) + spin_lock_init(&adev->mes.ring_lock[i]); + adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK; adev->mes.vmid_mask_mmhub = 0xffffff00; adev->mes.vmid_mask_gfxhub = 0xffffff00; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 2d659c612f03..f89e3f61fe46 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -82,8 +82,8 @@ struct amdgpu_mes { uint64_t default_process_quantum; uint64_t default_gang_quantum; - struct amdgpu_ring ring; - spinlock_t ring_lock; + struct amdgpu_ring ring[AMDGPU_MAX_MES_PIPES]; + spinlock_t ring_lock[AMDGPU_MAX_MES_PIPES]; const struct firmware *fw[AMDGPU_MAX_MES_PIPES]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 111c380f929b..b287a82e6177 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -858,7 +858,7 @@ void amdgpu_virt_post_reset(struct amdgpu_device *adev) adev->gfx.is_poweron = false; } - adev->mes.ring.sched.ready = false; + adev->mes.ring[0].sched.ready = false; } bool amdgpu_virt_fw_load_skip_check(struct amdgpu_device *adev, uint32_t ucode_id) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index b88a6fa173b3..2797fd84432b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -231,7 +231,7 @@ static void gmc_v11_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, /* This is necessary for SRIOV as well as for GFXOFF to function * properly under bare metal */ - if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring.sched.ready) && + if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring[0].sched.ready) && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) { amdgpu_gmc_fw_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid, GET_INST(GC, 0)); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index 26efce9aa410..edcb5351f8cc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -299,7 +299,7 @@ static void gmc_v12_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, /* This is necessary for SRIOV as well as for GFXOFF to function * properly under bare metal */ - if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring.sched.ready) && + if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring[0].sched.ready) && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) { struct amdgpu_vmhub *hub = &adev->vmhub[vmhub]; const unsigned eng = 17; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 1a5ad5be33bf..44bdfa0b263a 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -162,7 +162,7 @@ static int 
mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, union MESAPI__QUERY_MES_STATUS mes_status_pkt; signed long timeout = 3000000; /* 3000 ms */ struct amdgpu_device *adev = mes->adev; - struct amdgpu_ring *ring = &mes->ring; + struct amdgpu_ring *ring = &mes->ring[0]; struct MES_API_STATUS *api_status; union MESAPI__MISC *x_pkt = pkt; const char *op_str, *misc_op_str; @@ -191,7 +191,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, status_ptr = (u64 *)&adev->wb.wb[status_offset]; *status_ptr = 0; - spin_lock_irqsave(&mes->ring_lock, flags); + spin_lock_irqsave(&mes->ring_lock[0], flags); r = amdgpu_ring_alloc(ring, (size + sizeof(mes_status_pkt)) / 4); if (r) goto error_unlock_free; @@ -221,7 +221,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, sizeof(mes_status_pkt) / 4); amdgpu_ring_commit(ring); - spin_unlock_irqrestore(&mes->ring_lock, flags); + spin_unlock_irqrestore(&mes->ring_lock[0], flags); op_str = mes_v11_0_get_op_string(x_pkt); misc_op_str = mes_v11_0_get_misc_op_string(x_pkt); @@ -263,7 +263,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_ring_undo(ring); error_unlock_free: - spin_unlock_irqrestore(&mes->ring_lock, flags); + spin_unlock_irqrestore(&mes->ring_lock[0], flags); error_wb_free: amdgpu_device_wb_free(adev, status_offset); @@ -1025,7 +1025,7 @@ static int mes_v11_0_kiq_enable_queue(struct amdgpu_device *adev) return r; } - kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring); + kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring[0]); return amdgpu_ring_test_helper(kiq_ring); } @@ -1039,7 +1039,7 @@ static int mes_v11_0_queue_init(struct amdgpu_device *adev, if (pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; else BUG(); @@ -1081,7 +1081,7 @@ static int mes_v11_0_ring_init(struct amdgpu_device *adev) { struct amdgpu_ring *ring; - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; ring->funcs = &mes_v11_0_ring_funcs; @@ -1134,7 +1134,7 @@ static int mes_v11_0_mqd_sw_init(struct amdgpu_device *adev, if (pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; else BUG(); @@ -1226,12 +1226,12 @@ static int mes_v11_0_sw_fini(void *handle) &adev->gfx.kiq[0].ring.mqd_gpu_addr, &adev->gfx.kiq[0].ring.mqd_ptr); - amdgpu_bo_free_kernel(&adev->mes.ring.mqd_obj, - &adev->mes.ring.mqd_gpu_addr, - &adev->mes.ring.mqd_ptr); + amdgpu_bo_free_kernel(&adev->mes.ring[0].mqd_obj, + &adev->mes.ring[0].mqd_gpu_addr, + &adev->mes.ring[0].mqd_ptr); amdgpu_ring_fini(&adev->gfx.kiq[0].ring); - amdgpu_ring_fini(&adev->mes.ring); + amdgpu_ring_fini(&adev->mes.ring[0]); if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { mes_v11_0_free_ucode_buffers(adev, AMDGPU_MES_KIQ_PIPE); @@ -1342,9 +1342,9 @@ static int mes_v11_0_kiq_hw_init(struct amdgpu_device *adev) static int mes_v11_0_kiq_hw_fini(struct amdgpu_device *adev) { - if (adev->mes.ring.sched.ready) { - mes_v11_0_kiq_dequeue(&adev->mes.ring); - adev->mes.ring.sched.ready = false; + if (adev->mes.ring[0].sched.ready) { + mes_v11_0_kiq_dequeue(&adev->mes.ring[0]); + adev->mes.ring[0].sched.ready = false; } if (amdgpu_sriov_vf(adev)) { @@ -1362,7 +1362,7 @@ static int mes_v11_0_hw_init(void *handle) int r; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->mes.ring.sched.ready) + if 
(adev->mes.ring[0].sched.ready) goto out; if (!adev->enable_mes_kiq) { @@ -1407,7 +1407,7 @@ static int mes_v11_0_hw_init(void *handle) * with MES enabled. */ adev->gfx.kiq[0].ring.sched.ready = false; - adev->mes.ring.sched.ready = true; + adev->mes.ring[0].sched.ready = true; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 249e5a66205c..515de65d8aa0 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -148,7 +148,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, union MESAPI__QUERY_MES_STATUS mes_status_pkt; signed long timeout = 3000000; /* 3000 ms */ struct amdgpu_device *adev = mes->adev; - struct amdgpu_ring *ring = &mes->ring; + struct amdgpu_ring *ring = &mes->ring[0]; struct MES_API_STATUS *api_status; union MESAPI__MISC *x_pkt = pkt; const char *op_str, *misc_op_str; @@ -177,7 +177,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, status_ptr = (u64 *)&adev->wb.wb[status_offset]; *status_ptr = 0; - spin_lock_irqsave(&mes->ring_lock, flags); + spin_lock_irqsave(&mes->ring_lock[0], flags); r = amdgpu_ring_alloc(ring, (size + sizeof(mes_status_pkt)) / 4); if (r) goto error_unlock_free; @@ -207,7 +207,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, sizeof(mes_status_pkt) / 4); amdgpu_ring_commit(ring); - spin_unlock_irqrestore(&mes->ring_lock, flags); + spin_unlock_irqrestore(&mes->ring_lock[0], flags); op_str = mes_v12_0_get_op_string(x_pkt); misc_op_str = mes_v12_0_get_misc_op_string(x_pkt); @@ -249,7 +249,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_ring_undo(ring); error_unlock_free: - spin_unlock_irqrestore(&mes->ring_lock, flags); + spin_unlock_irqrestore(&mes->ring_lock[0], flags); error_wb_free: amdgpu_device_wb_free(adev, status_offset); @@ -1095,7 +1095,7 @@ static int mes_v12_0_kiq_enable_queue(struct amdgpu_device *adev) return r; } - kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring); + kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring[0]); r = amdgpu_ring_test_ring(kiq_ring); if (r) { @@ -1114,7 +1114,7 @@ static int mes_v12_0_queue_init(struct amdgpu_device *adev, if (pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; else BUG(); @@ -1160,7 +1160,7 @@ static int mes_v12_0_ring_init(struct amdgpu_device *adev) { struct amdgpu_ring *ring; - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; ring->funcs = &mes_v12_0_ring_funcs; @@ -1213,7 +1213,7 @@ static int mes_v12_0_mqd_sw_init(struct amdgpu_device *adev, if (pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; else BUG(); @@ -1302,12 +1302,12 @@ static int mes_v12_0_sw_fini(void *handle) &adev->gfx.kiq[0].ring.mqd_gpu_addr, &adev->gfx.kiq[0].ring.mqd_ptr); - amdgpu_bo_free_kernel(&adev->mes.ring.mqd_obj, - &adev->mes.ring.mqd_gpu_addr, - &adev->mes.ring.mqd_ptr); + amdgpu_bo_free_kernel(&adev->mes.ring[0].mqd_obj, + &adev->mes.ring[0].mqd_gpu_addr, + &adev->mes.ring[0].mqd_ptr); amdgpu_ring_fini(&adev->gfx.kiq[0].ring); - amdgpu_ring_fini(&adev->mes.ring); + amdgpu_ring_fini(&adev->mes.ring[0]); if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { mes_v12_0_free_ucode_buffers(adev, AMDGPU_MES_KIQ_PIPE); @@ -1351,7 +1351,7 @@ static void 
mes_v12_0_kiq_dequeue_sched(struct amdgpu_device *adev) soc21_grbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); - adev->mes.ring.sched.ready = false; + adev->mes.ring[0].sched.ready = false; } static void mes_v12_0_kiq_setting(struct amdgpu_ring *ring) @@ -1415,9 +1415,9 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev) static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev) { - if (adev->mes.ring.sched.ready) { + if (adev->mes.ring[0].sched.ready) { mes_v12_0_kiq_dequeue_sched(adev); - adev->mes.ring.sched.ready = false; + adev->mes.ring[0].sched.ready = false; } mes_v12_0_enable(adev, false); @@ -1430,7 +1430,7 @@ static int mes_v12_0_hw_init(void *handle) int r; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->mes.ring.sched.ready) + if (adev->mes.ring[0].sched.ready) goto out; if (!adev->enable_mes_kiq || adev->enable_uni_mes) { @@ -1482,7 +1482,7 @@ static int mes_v12_0_hw_init(void *handle) * with MES enabled. */ adev->gfx.kiq[0].ring.sched.ready = false; - adev->mes.ring.sched.ready = true; + adev->mes.ring[0].sched.ready = true; return 0; From a13d91bf3c1910212e45a69d04ad40d99878f8da Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 13:19:59 +0800 Subject: [PATCH 0835/1052] drm/amdgpu/mes12: load unified mes fw on pipe0 and pipe1 Enable unified mes firmware to load on pipe0 and pipe1. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit e69c2dd7534f3fcabf7bb801db2a7ac71e7e5da6) --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 27 +++---------------------- 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 8ef53f48ce15..81bed8e8478d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1501,7 +1501,7 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe) amdgpu_ucode_ip_version_decode(adev, GC_HWIP, ucode_prefix, sizeof(ucode_prefix)); - if (adev->enable_uni_mes && pipe == AMDGPU_MES_SCHED_PIPE) { + if (adev->enable_uni_mes) { snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_uni_mes.bin", ucode_prefix); } else if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0) && diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 515de65d8aa0..28bb72d2cffd 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -744,16 +744,11 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable) if (enable) { data = RREG32_SOC15(GC, 0, regCP_MES_CNTL); data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE0_RESET, 1); - data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_RESET, - (!adev->enable_uni_mes && adev->enable_mes_kiq) ? 
1 : 0); + data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_RESET, 1); WREG32_SOC15(GC, 0, regCP_MES_CNTL, data); mutex_lock(&adev->srbm_mutex); for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { - if ((!adev->enable_mes_kiq || adev->enable_uni_mes) && - pipe == AMDGPU_MES_KIQ_PIPE) - continue; - soc21_grbm_select(adev, 3, pipe, 0, 0); ucode_addr = adev->mes.uc_start_addr[pipe] >> 2; @@ -767,8 +762,7 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable) /* unhalt MES and activate pipe0 */ data = REG_SET_FIELD(0, CP_MES_CNTL, MES_PIPE0_ACTIVE, 1); - data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_ACTIVE, - (!adev->enable_uni_mes && adev->enable_mes_kiq) ? 1 : 0); + data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_ACTIVE, 1); WREG32_SOC15(GC, 0, regCP_MES_CNTL, data); if (amdgpu_emu_mode) @@ -784,8 +778,7 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable) data = REG_SET_FIELD(data, CP_MES_CNTL, MES_INVALIDATE_ICACHE, 1); data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE0_RESET, 1); - data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_RESET, - (!adev->enable_uni_mes && adev->enable_mes_kiq) ? 1 : 0); + data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_RESET, 1); data = REG_SET_FIELD(data, CP_MES_CNTL, MES_HALT, 1); WREG32_SOC15(GC, 0, regCP_MES_CNTL, data); } @@ -800,10 +793,6 @@ static void mes_v12_0_set_ucode_start_addr(struct amdgpu_device *adev) mutex_lock(&adev->srbm_mutex); for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { - if ((!adev->enable_mes_kiq || adev->enable_uni_mes) && - pipe == AMDGPU_MES_KIQ_PIPE) - continue; - /* me=3, queue=0 */ soc21_grbm_select(adev, 3, pipe, 0, 0); @@ -1525,17 +1514,7 @@ static int mes_v12_0_early_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int pipe, r; - if (adev->enable_uni_mes) { - r = amdgpu_mes_init_microcode(adev, AMDGPU_MES_SCHED_PIPE); - if (!r) - return 0; - - adev->enable_uni_mes = false; - } - for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { - if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE) - continue; r = amdgpu_mes_init_microcode(adev, pipe); if (r) return r; From 3738a7f0ddb920bde538d3f78a02edbc6ad1307e Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 14:15:48 +0800 Subject: [PATCH 0836/1052] drm/amdgpu/mes12: add mes pipe switch support Add mes pipe switch to let caller choose pipe to submit packet. 
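For illustration only, a toy model of the calling convention this patch introduces: the submit path takes a pipe index and uses that pipe's ring and lock instead of hard-coding pipe 0. Names and types here are illustrative, not the driver's:

#include <pthread.h>

enum { TOY_SCHED_PIPE, TOY_KIQ_PIPE, TOY_MAX_PIPES };

struct toy_mes {
	int ring_wptr[TOY_MAX_PIPES];
	pthread_mutex_t ring_lock[TOY_MAX_PIPES];
};

static void toy_submit_pkt(struct toy_mes *mes, int pipe, int ndw)
{
	pthread_mutex_lock(&mes->ring_lock[pipe]);
	mes->ring_wptr[pipe] += ndw;	/* "emit" ndw dwords on the chosen pipe */
	pthread_mutex_unlock(&mes->ring_lock[pipe]);
}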
Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit b2dee0837a4be63e8d3e00550a9f057644f962c4) --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 59 ++++++++++++++------------ 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 28bb72d2cffd..1213f35e2900 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -142,13 +142,14 @@ static const char *mes_v12_0_get_misc_op_string(union MESAPI__MISC *x_pkt) } static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, - void *pkt, int size, - int api_status_off) + int pipe, void *pkt, int size, + int api_status_off) { union MESAPI__QUERY_MES_STATUS mes_status_pkt; signed long timeout = 3000000; /* 3000 ms */ struct amdgpu_device *adev = mes->adev; - struct amdgpu_ring *ring = &mes->ring[0]; + struct amdgpu_ring *ring = &mes->ring[pipe]; + spinlock_t *ring_lock = &mes->ring_lock[pipe]; struct MES_API_STATUS *api_status; union MESAPI__MISC *x_pkt = pkt; const char *op_str, *misc_op_str; @@ -177,7 +178,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, status_ptr = (u64 *)&adev->wb.wb[status_offset]; *status_ptr = 0; - spin_lock_irqsave(&mes->ring_lock[0], flags); + spin_lock_irqsave(ring_lock, flags); r = amdgpu_ring_alloc(ring, (size + sizeof(mes_status_pkt)) / 4); if (r) goto error_unlock_free; @@ -207,32 +208,33 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, sizeof(mes_status_pkt) / 4); amdgpu_ring_commit(ring); - spin_unlock_irqrestore(&mes->ring_lock[0], flags); + spin_unlock_irqrestore(ring_lock, flags); op_str = mes_v12_0_get_op_string(x_pkt); misc_op_str = mes_v12_0_get_misc_op_string(x_pkt); if (misc_op_str) - dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, - misc_op_str); + dev_dbg(adev->dev, "MES(%d) msg=%s (%s) was emitted\n", + pipe, op_str, misc_op_str); else if (op_str) - dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str); + dev_dbg(adev->dev, "MES(%d) msg=%s was emitted\n", + pipe, op_str); else - dev_dbg(adev->dev, "MES msg=%d was emitted\n", - x_pkt->header.opcode); + dev_dbg(adev->dev, "MES(%d) msg=%d was emitted\n", + pipe, x_pkt->header.opcode); r = amdgpu_fence_wait_polling(ring, seq, timeout); if (r < 1 || !*status_ptr) { if (misc_op_str) - dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n", - op_str, misc_op_str); + dev_err(adev->dev, "MES(%d) failed to respond to msg=%s (%s)\n", + pipe, op_str, misc_op_str); else if (op_str) - dev_err(adev->dev, "MES failed to respond to msg=%s\n", - op_str); + dev_err(adev->dev, "MES(%d) failed to respond to msg=%s\n", + pipe, op_str); else - dev_err(adev->dev, "MES failed to respond to msg=%d\n", - x_pkt->header.opcode); + dev_err(adev->dev, "MES(%d) failed to respond to msg=%d\n", + pipe, x_pkt->header.opcode); while (halt_if_hws_hang) schedule(); @@ -249,7 +251,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_ring_undo(ring); error_unlock_free: - spin_unlock_irqrestore(&mes->ring_lock[0], flags); + spin_unlock_irqrestore(ring_lock, flags); error_wb_free: amdgpu_device_wb_free(adev, status_offset); @@ -321,6 +323,7 @@ static int mes_v12_0_add_hw_queue(struct amdgpu_mes *mes, mes_add_queue_pkt.gds_size = input->queue_size; return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &mes_add_queue_pkt, 
sizeof(mes_add_queue_pkt), offsetof(union MESAPI__ADD_QUEUE, api_status)); } @@ -340,6 +343,7 @@ static int mes_v12_0_remove_hw_queue(struct amdgpu_mes *mes, mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr; return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &mes_remove_queue_pkt, sizeof(mes_remove_queue_pkt), offsetof(union MESAPI__REMOVE_QUEUE, api_status)); } @@ -365,6 +369,7 @@ static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, mes_add_queue_pkt.map_legacy_kq = 1; return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &mes_add_queue_pkt, sizeof(mes_add_queue_pkt), offsetof(union MESAPI__ADD_QUEUE, api_status)); } @@ -398,6 +403,7 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes, } return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &mes_remove_queue_pkt, sizeof(mes_remove_queue_pkt), offsetof(union MESAPI__REMOVE_QUEUE, api_status)); } @@ -414,7 +420,7 @@ static int mes_v12_0_resume_gang(struct amdgpu_mes *mes, return 0; } -static int mes_v12_0_query_sched_status(struct amdgpu_mes *mes) +static int mes_v12_0_query_sched_status(struct amdgpu_mes *mes, int pipe) { union MESAPI__QUERY_MES_STATUS mes_status_pkt; @@ -424,7 +430,7 @@ static int mes_v12_0_query_sched_status(struct amdgpu_mes *mes) mes_status_pkt.header.opcode = MES_SCH_API_QUERY_SCHEDULER_STATUS; mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; - return mes_v12_0_submit_pkt_and_poll_completion(mes, + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_status_pkt, sizeof(mes_status_pkt), offsetof(union MESAPI__QUERY_MES_STATUS, api_status)); } @@ -486,11 +492,12 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, } return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &misc_pkt, sizeof(misc_pkt), offsetof(union MESAPI__MISC, api_status)); } -static int mes_v12_0_set_hw_resources_1(struct amdgpu_mes *mes) +static int mes_v12_0_set_hw_resources_1(struct amdgpu_mes *mes, int pipe) { union MESAPI_SET_HW_RESOURCES_1 mes_set_hw_res_1_pkt; @@ -501,12 +508,12 @@ static int mes_v12_0_set_hw_resources_1(struct amdgpu_mes *mes) mes_set_hw_res_1_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; mes_set_hw_res_1_pkt.mes_kiq_unmap_timeout = 100; - return mes_v12_0_submit_pkt_and_poll_completion(mes, + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_set_hw_res_1_pkt, sizeof(mes_set_hw_res_1_pkt), offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status)); } -static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes) +static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) { int i; struct amdgpu_device *adev = mes->adev; @@ -566,7 +573,7 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr; } - return mes_v12_0_submit_pkt_and_poll_completion(mes, + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), offsetof(union MESAPI_SET_HW_RESOURCES, api_status)); } @@ -1446,19 +1453,19 @@ static int mes_v12_0_hw_init(void *handle) if (r) goto failure; - r = mes_v12_0_set_hw_resources(&adev->mes); + r = mes_v12_0_set_hw_resources(&adev->mes, AMDGPU_MES_SCHED_PIPE); if (r) goto failure; if (adev->enable_uni_mes) - mes_v12_0_set_hw_resources_1(&adev->mes); + mes_v12_0_set_hw_resources_1(&adev->mes, AMDGPU_MES_SCHED_PIPE); mes_v12_0_init_aggregated_doorbell(&adev->mes); /* Enable the MES to handle doorbell ring on unmapped 
queue */ mes_v12_0_enable_unmapped_doorbell_handling(&adev->mes, true); - r = mes_v12_0_query_sched_status(&adev->mes); + r = mes_v12_0_query_sched_status(&adev->mes, AMDGPU_MES_SCHED_PIPE); if (r) { DRM_ERROR("MES is busy\n"); goto failure; From 1097727d6d0c13eca25321fff46714fc5047d6e8 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 14:44:07 +0800 Subject: [PATCH 0837/1052] drm/amdgpu/mes12: adjust mes12 sw/hw init for multiple pipes Adjust mes12 sw/hw initiailization for both pipe0 and pipe1 enablement. The two pipes are almost identical pipe. Pipe0 behaves like schq and pipe1 like kiq, pipe0 was mapped by pipe1. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit aa539da8aff07ab08def6490e8c9b441439e70ba) --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 106 +++++++++++++++---------- 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 1213f35e2900..d18ec5855193 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -266,6 +266,8 @@ static int convert_to_mes_queue_type(int queue_type) return MES_QUEUE_TYPE_COMPUTE; else if (queue_type == AMDGPU_RING_TYPE_SDMA) return MES_QUEUE_TYPE_SDMA; + else if (queue_type == AMDGPU_RING_TYPE_MES) + return MES_QUEUE_TYPE_SCHQ; else BUG(); return -1; @@ -352,6 +354,7 @@ static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, struct mes_map_legacy_queue_input *input) { union MESAPI__ADD_QUEUE mes_add_queue_pkt; + int pipe; memset(&mes_add_queue_pkt, 0, sizeof(mes_add_queue_pkt)); @@ -368,8 +371,12 @@ static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, convert_to_mes_queue_type(input->queue_type); mes_add_queue_pkt.map_legacy_kq = 1; - return mes_v12_0_submit_pkt_and_poll_completion(mes, - AMDGPU_MES_SCHED_PIPE, + if (mes->adev->enable_uni_mes) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_add_queue_pkt, sizeof(mes_add_queue_pkt), offsetof(union MESAPI__ADD_QUEUE, api_status)); } @@ -378,6 +385,7 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes, struct mes_unmap_legacy_queue_input *input) { union MESAPI__REMOVE_QUEUE mes_remove_queue_pkt; + int pipe; memset(&mes_remove_queue_pkt, 0, sizeof(mes_remove_queue_pkt)); @@ -402,8 +410,12 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes, convert_to_mes_queue_type(input->queue_type); } - return mes_v12_0_submit_pkt_and_poll_completion(mes, - AMDGPU_MES_SCHED_PIPE, + if (mes->adev->enable_uni_mes) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_remove_queue_pkt, sizeof(mes_remove_queue_pkt), offsetof(union MESAPI__REMOVE_QUEUE, api_status)); } @@ -439,6 +451,7 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, struct mes_misc_op_input *input) { union MESAPI__MISC misc_pkt; + int pipe; memset(&misc_pkt, 0, sizeof(misc_pkt)); @@ -491,8 +504,12 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, return -EINVAL; } - return mes_v12_0_submit_pkt_and_poll_completion(mes, - AMDGPU_MES_SCHED_PIPE, + if (mes->adev->enable_uni_mes) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &misc_pkt, sizeof(misc_pkt), offsetof(union MESAPI__MISC, api_status)); } @@ -1107,14 +1124,12 @@ static 
int mes_v12_0_queue_init(struct amdgpu_device *adev, struct amdgpu_ring *ring; int r; - if (pipe == AMDGPU_MES_KIQ_PIPE) + if (!adev->enable_uni_mes && pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; - else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring[0]; else - BUG(); + ring = &adev->mes.ring[pipe]; - if ((pipe == AMDGPU_MES_SCHED_PIPE) && + if ((adev->enable_uni_mes || pipe == AMDGPU_MES_SCHED_PIPE) && (amdgpu_in_reset(adev) || adev->in_suspend)) { *(ring->wptr_cpu_addr) = 0; *(ring->rptr_cpu_addr) = 0; @@ -1126,13 +1141,12 @@ static int mes_v12_0_queue_init(struct amdgpu_device *adev, return r; if (pipe == AMDGPU_MES_SCHED_PIPE) { - if (adev->enable_uni_mes) { - mes_v12_0_queue_init_register(ring); - } else { + if (adev->enable_uni_mes) + r = amdgpu_mes_map_legacy_queue(adev, ring); + else r = mes_v12_0_kiq_enable_queue(adev); - if (r) - return r; - } + if (r) + return r; } else { mes_v12_0_queue_init_register(ring); } @@ -1152,25 +1166,29 @@ static int mes_v12_0_queue_init(struct amdgpu_device *adev, return 0; } -static int mes_v12_0_ring_init(struct amdgpu_device *adev) +static int mes_v12_0_ring_init(struct amdgpu_device *adev, int pipe) { struct amdgpu_ring *ring; - ring = &adev->mes.ring[0]; + ring = &adev->mes.ring[pipe]; ring->funcs = &mes_v12_0_ring_funcs; ring->me = 3; - ring->pipe = 0; + ring->pipe = pipe; ring->queue = 0; ring->ring_obj = NULL; ring->use_doorbell = true; - ring->doorbell_index = adev->doorbell_index.mes_ring0 << 1; - ring->eop_gpu_addr = adev->mes.eop_gpu_addr[AMDGPU_MES_SCHED_PIPE]; + ring->eop_gpu_addr = adev->mes.eop_gpu_addr[pipe]; ring->no_scheduler = true; sprintf(ring->name, "mes_%d.%d.%d", ring->me, ring->pipe, ring->queue); + if (pipe == AMDGPU_MES_SCHED_PIPE) + ring->doorbell_index = adev->doorbell_index.mes_ring0 << 1; + else + ring->doorbell_index = adev->doorbell_index.mes_ring1 << 1; + return amdgpu_ring_init(adev, ring, 1024, NULL, 0, AMDGPU_RING_PRIO_DEFAULT, NULL); } @@ -1184,7 +1202,7 @@ static int mes_v12_0_kiq_ring_init(struct amdgpu_device *adev) ring = &adev->gfx.kiq[0].ring; ring->me = 3; - ring->pipe = adev->enable_uni_mes ? 
0 : 1; + ring->pipe = 1; ring->queue = 0; ring->adev = NULL; @@ -1206,12 +1224,10 @@ static int mes_v12_0_mqd_sw_init(struct amdgpu_device *adev, int r, mqd_size = sizeof(struct v12_compute_mqd); struct amdgpu_ring *ring; - if (pipe == AMDGPU_MES_KIQ_PIPE) + if (!adev->enable_uni_mes && pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; - else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring[0]; else - BUG(); + ring = &adev->mes.ring[pipe]; if (ring->mqd_obj) return 0; @@ -1252,9 +1268,6 @@ static int mes_v12_0_sw_init(void *handle) return r; for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { - if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE) - continue; - r = mes_v12_0_allocate_eop_buf(adev, pipe); if (r) return r; @@ -1262,18 +1275,15 @@ static int mes_v12_0_sw_init(void *handle) r = mes_v12_0_mqd_sw_init(adev, pipe); if (r) return r; - } - if (adev->enable_mes_kiq) { - r = mes_v12_0_kiq_ring_init(adev); + if (!adev->enable_uni_mes && pipe == AMDGPU_MES_KIQ_PIPE) + r = mes_v12_0_kiq_ring_init(adev); + else + r = mes_v12_0_ring_init(adev, pipe); if (r) return r; } - r = mes_v12_0_ring_init(adev); - if (r) - return r; - return 0; } @@ -1368,10 +1378,10 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev) { int r = 0; - mes_v12_0_kiq_setting(&adev->gfx.kiq[0].ring); - if (adev->enable_uni_mes) - return mes_v12_0_hw_init(adev); + mes_v12_0_kiq_setting(&adev->mes.ring[AMDGPU_MES_KIQ_PIPE]); + else + mes_v12_0_kiq_setting(&adev->gfx.kiq[0].ring); if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { @@ -1398,6 +1408,14 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev) if (r) goto failure; + if (adev->enable_uni_mes) { + r = mes_v12_0_set_hw_resources(&adev->mes, AMDGPU_MES_KIQ_PIPE); + if (r) + goto failure; + + mes_v12_0_set_hw_resources_1(&adev->mes, AMDGPU_MES_KIQ_PIPE); + } + r = mes_v12_0_hw_init(adev); if (r) goto failure; @@ -1429,7 +1447,7 @@ static int mes_v12_0_hw_init(void *handle) if (adev->mes.ring[0].sched.ready) goto out; - if (!adev->enable_mes_kiq || adev->enable_uni_mes) { + if (!adev->enable_mes_kiq) { if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { r = mes_v12_0_load_microcode(adev, AMDGPU_MES_SCHED_PIPE, true); @@ -1449,6 +1467,9 @@ static int mes_v12_0_hw_init(void *handle) mes_v12_0_enable(adev, true); } + /* Enable the MES to handle doorbell ring on unmapped queue */ + mes_v12_0_enable_unmapped_doorbell_handling(&adev->mes, true); + r = mes_v12_0_queue_init(adev, AMDGPU_MES_SCHED_PIPE); if (r) goto failure; @@ -1462,9 +1483,6 @@ static int mes_v12_0_hw_init(void *handle) mes_v12_0_init_aggregated_doorbell(&adev->mes); - /* Enable the MES to handle doorbell ring on unmapped queue */ - mes_v12_0_enable_unmapped_doorbell_handling(&adev->mes, true); - r = mes_v12_0_query_sched_status(&adev->mes, AMDGPU_MES_SCHED_PIPE); if (r) { DRM_ERROR("MES is busy\n"); From 7254027e1e6edbff54f5930a5f13f14ac6f1694c Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 14:49:30 +0800 Subject: [PATCH 0838/1052] drm/amdgpu/mes12: configure two pipes hardware resources Configure two pipes with different hardware resources. 
Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit ea5d6db17a8e3635ad91e8c53faa1fdc9570fbbb) --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 77 +++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 12 ++-- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 7 +-- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 49 ++++++++-------- 4 files changed, 81 insertions(+), 64 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 81bed8e8478d..1cb1ec7beefe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -165,36 +165,38 @@ int amdgpu_mes_init(struct amdgpu_device *adev) adev->mes.sdma_hqd_mask[i] = 0xfc; } - r = amdgpu_device_wb_get(adev, &adev->mes.sch_ctx_offs); - if (r) { - dev_err(adev->dev, - "(%d) ring trail_fence_offs wb alloc failed\n", r); - goto error_ids; - } - adev->mes.sch_ctx_gpu_addr = - adev->wb.gpu_addr + (adev->mes.sch_ctx_offs * 4); - adev->mes.sch_ctx_ptr = - (uint64_t *)&adev->wb.wb[adev->mes.sch_ctx_offs]; + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) { + r = amdgpu_device_wb_get(adev, &adev->mes.sch_ctx_offs[i]); + if (r) { + dev_err(adev->dev, + "(%d) ring trail_fence_offs wb alloc failed\n", + r); + goto error; + } + adev->mes.sch_ctx_gpu_addr[i] = + adev->wb.gpu_addr + (adev->mes.sch_ctx_offs[i] * 4); + adev->mes.sch_ctx_ptr[i] = + (uint64_t *)&adev->wb.wb[adev->mes.sch_ctx_offs[i]]; - r = amdgpu_device_wb_get(adev, &adev->mes.query_status_fence_offs); - if (r) { - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - dev_err(adev->dev, - "(%d) query_status_fence_offs wb alloc failed\n", r); - goto error_ids; + r = amdgpu_device_wb_get(adev, + &adev->mes.query_status_fence_offs[i]); + if (r) { + dev_err(adev->dev, + "(%d) query_status_fence_offs wb alloc failed\n", + r); + goto error; + } + adev->mes.query_status_fence_gpu_addr[i] = adev->wb.gpu_addr + + (adev->mes.query_status_fence_offs[i] * 4); + adev->mes.query_status_fence_ptr[i] = + (uint64_t *)&adev->wb.wb[adev->mes.query_status_fence_offs[i]]; } - adev->mes.query_status_fence_gpu_addr = - adev->wb.gpu_addr + (adev->mes.query_status_fence_offs * 4); - adev->mes.query_status_fence_ptr = - (uint64_t *)&adev->wb.wb[adev->mes.query_status_fence_offs]; r = amdgpu_device_wb_get(adev, &adev->mes.read_val_offs); if (r) { - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); dev_err(adev->dev, "(%d) read_val_offs alloc failed\n", r); - goto error_ids; + goto error; } adev->mes.read_val_gpu_addr = adev->wb.gpu_addr + (adev->mes.read_val_offs * 4); @@ -214,10 +216,16 @@ int amdgpu_mes_init(struct amdgpu_device *adev) error_doorbell: amdgpu_mes_doorbell_free(adev); error: - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); - amdgpu_device_wb_free(adev, adev->mes.read_val_offs); -error_ids: + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) { + if (adev->mes.sch_ctx_ptr[i]) + amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs[i]); + if (adev->mes.query_status_fence_ptr[i]) + amdgpu_device_wb_free(adev, + adev->mes.query_status_fence_offs[i]); + } + if (adev->mes.read_val_ptr) + amdgpu_device_wb_free(adev, adev->mes.read_val_offs); + idr_destroy(&adev->mes.pasid_idr); idr_destroy(&adev->mes.gang_id_idr); idr_destroy(&adev->mes.queue_id_idr); @@ -228,13 +236,22 @@ int amdgpu_mes_init(struct amdgpu_device *adev) void 
amdgpu_mes_fini(struct amdgpu_device *adev) { + int i; + amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, &adev->mes.event_log_cpu_addr); - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); - amdgpu_device_wb_free(adev, adev->mes.read_val_offs); + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) { + if (adev->mes.sch_ctx_ptr[i]) + amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs[i]); + if (adev->mes.query_status_fence_ptr[i]) + amdgpu_device_wb_free(adev, + adev->mes.query_status_fence_offs[i]); + } + if (adev->mes.read_val_ptr) + amdgpu_device_wb_free(adev, adev->mes.read_val_offs); + amdgpu_mes_doorbell_free(adev); idr_destroy(&adev->mes.pasid_idr); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index f89e3f61fe46..0bc837dab578 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -112,12 +112,12 @@ struct amdgpu_mes { uint32_t gfx_hqd_mask[AMDGPU_MES_MAX_GFX_PIPES]; uint32_t sdma_hqd_mask[AMDGPU_MES_MAX_SDMA_PIPES]; uint32_t aggregated_doorbells[AMDGPU_MES_PRIORITY_NUM_LEVELS]; - uint32_t sch_ctx_offs; - uint64_t sch_ctx_gpu_addr; - uint64_t *sch_ctx_ptr; - uint32_t query_status_fence_offs; - uint64_t query_status_fence_gpu_addr; - uint64_t *query_status_fence_ptr; + uint32_t sch_ctx_offs[AMDGPU_MAX_MES_PIPES]; + uint64_t sch_ctx_gpu_addr[AMDGPU_MAX_MES_PIPES]; + uint64_t *sch_ctx_ptr[AMDGPU_MAX_MES_PIPES]; + uint32_t query_status_fence_offs[AMDGPU_MAX_MES_PIPES]; + uint64_t query_status_fence_gpu_addr[AMDGPU_MAX_MES_PIPES]; + uint64_t *query_status_fence_ptr[AMDGPU_MAX_MES_PIPES]; uint32_t read_val_offs; uint64_t read_val_gpu_addr; uint32_t *read_val_ptr; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 44bdfa0b263a..2ea8223eb969 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -522,9 +522,9 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.vmid_mask_gfxhub = mes->vmid_mask_gfxhub; mes_set_hw_res_pkt.gds_size = adev->gds.gds_size; mes_set_hw_res_pkt.paging_vmid = 0; - mes_set_hw_res_pkt.g_sch_ctx_gpu_mc_ptr = mes->sch_ctx_gpu_addr; + mes_set_hw_res_pkt.g_sch_ctx_gpu_mc_ptr = mes->sch_ctx_gpu_addr[0]; mes_set_hw_res_pkt.query_status_fence_gpu_mc_ptr = - mes->query_status_fence_gpu_addr; + mes->query_status_fence_gpu_addr[0]; for (i = 0; i < MAX_COMPUTE_PIPES; i++) mes_set_hw_res_pkt.compute_hqd_mask[i] = @@ -1210,9 +1210,6 @@ static int mes_v11_0_sw_fini(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int pipe; - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); - for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { kfree(adev->mes.mqd_backup[pipe]); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index d18ec5855193..ed77d70441cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -542,27 +542,33 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) mes_set_hw_res_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC; mes_set_hw_res_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; - mes_set_hw_res_pkt.vmid_mask_mmhub = mes->vmid_mask_mmhub; - mes_set_hw_res_pkt.vmid_mask_gfxhub = mes->vmid_mask_gfxhub; - mes_set_hw_res_pkt.gds_size = adev->gds.gds_size; - 
mes_set_hw_res_pkt.paging_vmid = 0; - mes_set_hw_res_pkt.g_sch_ctx_gpu_mc_ptr = mes->sch_ctx_gpu_addr; + if (pipe == AMDGPU_MES_SCHED_PIPE) { + mes_set_hw_res_pkt.vmid_mask_mmhub = mes->vmid_mask_mmhub; + mes_set_hw_res_pkt.vmid_mask_gfxhub = mes->vmid_mask_gfxhub; + mes_set_hw_res_pkt.gds_size = adev->gds.gds_size; + mes_set_hw_res_pkt.paging_vmid = 0; + + for (i = 0; i < MAX_COMPUTE_PIPES; i++) + mes_set_hw_res_pkt.compute_hqd_mask[i] = + mes->compute_hqd_mask[i]; + + for (i = 0; i < MAX_GFX_PIPES; i++) + mes_set_hw_res_pkt.gfx_hqd_mask[i] = + mes->gfx_hqd_mask[i]; + + for (i = 0; i < MAX_SDMA_PIPES; i++) + mes_set_hw_res_pkt.sdma_hqd_mask[i] = + mes->sdma_hqd_mask[i]; + + for (i = 0; i < AMD_PRIORITY_NUM_LEVELS; i++) + mes_set_hw_res_pkt.aggregated_doorbells[i] = + mes->aggregated_doorbells[i]; + } + + mes_set_hw_res_pkt.g_sch_ctx_gpu_mc_ptr = + mes->sch_ctx_gpu_addr[pipe]; mes_set_hw_res_pkt.query_status_fence_gpu_mc_ptr = - mes->query_status_fence_gpu_addr; - - for (i = 0; i < MAX_COMPUTE_PIPES; i++) - mes_set_hw_res_pkt.compute_hqd_mask[i] = - mes->compute_hqd_mask[i]; - - for (i = 0; i < MAX_GFX_PIPES; i++) - mes_set_hw_res_pkt.gfx_hqd_mask[i] = mes->gfx_hqd_mask[i]; - - for (i = 0; i < MAX_SDMA_PIPES; i++) - mes_set_hw_res_pkt.sdma_hqd_mask[i] = mes->sdma_hqd_mask[i]; - - for (i = 0; i < AMD_PRIORITY_NUM_LEVELS; i++) - mes_set_hw_res_pkt.aggregated_doorbells[i] = - mes->aggregated_doorbells[i]; + mes->query_status_fence_gpu_addr[pipe]; for (i = 0; i < 5; i++) { mes_set_hw_res_pkt.gc_base[i] = adev->reg_offset[GC_HWIP][0][i]; @@ -1292,9 +1298,6 @@ static int mes_v12_0_sw_fini(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int pipe; - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); - for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { kfree(adev->mes.mqd_backup[pipe]); From af401543df510a73f7beb13f80cf4c541be94786 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 15:23:16 +0800 Subject: [PATCH 0839/1052] drm/amdgpu/mes12: sw/hw fini for unified mes Free memory for two pipes and unmap pipe0 via pipe1. 
Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 98cae695a8ae0e4291b1fa7feef9b54fabefe885) --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 31 +++++++++++++++++--------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index ed77d70441cf..e39a58d262c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -1305,18 +1305,21 @@ static int mes_v12_0_sw_fini(void *handle) &adev->mes.eop_gpu_addr[pipe], NULL); amdgpu_ucode_release(&adev->mes.fw[pipe]); + + if (adev->enable_uni_mes || pipe == AMDGPU_MES_SCHED_PIPE) { + amdgpu_bo_free_kernel(&adev->mes.ring[pipe].mqd_obj, + &adev->mes.ring[pipe].mqd_gpu_addr, + &adev->mes.ring[pipe].mqd_ptr); + amdgpu_ring_fini(&adev->mes.ring[pipe]); + } } - amdgpu_bo_free_kernel(&adev->gfx.kiq[0].ring.mqd_obj, - &adev->gfx.kiq[0].ring.mqd_gpu_addr, - &adev->gfx.kiq[0].ring.mqd_ptr); - - amdgpu_bo_free_kernel(&adev->mes.ring[0].mqd_obj, - &adev->mes.ring[0].mqd_gpu_addr, - &adev->mes.ring[0].mqd_ptr); - - amdgpu_ring_fini(&adev->gfx.kiq[0].ring); - amdgpu_ring_fini(&adev->mes.ring[0]); + if (!adev->enable_uni_mes) { + amdgpu_bo_free_kernel(&adev->gfx.kiq[0].ring.mqd_obj, + &adev->gfx.kiq[0].ring.mqd_gpu_addr, + &adev->gfx.kiq[0].ring.mqd_ptr); + amdgpu_ring_fini(&adev->gfx.kiq[0].ring); + } if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { mes_v12_0_free_ucode_buffers(adev, AMDGPU_MES_KIQ_PIPE); @@ -1433,7 +1436,13 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev) static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev) { if (adev->mes.ring[0].sched.ready) { - mes_v12_0_kiq_dequeue_sched(adev); + if (adev->enable_uni_mes) + amdgpu_mes_unmap_legacy_queue(adev, + &adev->mes.ring[AMDGPU_MES_SCHED_PIPE], + RESET_QUEUES, 0, 0); + else + mes_v12_0_kiq_dequeue_sched(adev); + adev->mes.ring[0].sched.ready = false; } From 4246b1077ffcc37926868581bb818fdb49d0d065 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 12:03:11 +0800 Subject: [PATCH 0840/1052] drm/amdgpu/mes12: fix suspend issue Use mes pipe to unmap kcq and kgq. 
Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit f7fb9d677faf0460131bc2af15afd766d48a1f47) --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 22 ++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 27 +------------------------ 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index f165b9d49e29..c770cb201e64 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -509,6 +509,16 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) int i, r = 0; int j; + if (adev->enable_mes) { + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + j = i + xcc_id * adev->gfx.num_compute_rings; + amdgpu_mes_unmap_legacy_queue(adev, + &adev->gfx.compute_ring[j], + RESET_QUEUES, 0, 0); + } + return 0; + } + if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; @@ -551,6 +561,18 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id) int i, r = 0; int j; + if (adev->enable_mes) { + if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) { + for (i = 0; i < adev->gfx.num_gfx_rings; i++) { + j = i + xcc_id * adev->gfx.num_gfx_rings; + amdgpu_mes_unmap_legacy_queue(adev, + &adev->gfx.gfx_ring[j], + PREEMPT_QUEUES, 0, 0); + } + } + return 0; + } + if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 506fa8003388..2c611b8577a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -3546,33 +3546,9 @@ static int gfx_v12_0_hw_init(void *handle) return r; } -static int gfx_v12_0_kiq_disable_kgq(struct amdgpu_device *adev) -{ - struct amdgpu_kiq *kiq = &adev->gfx.kiq[0]; - struct amdgpu_ring *kiq_ring = &kiq->ring; - int i, r = 0; - - if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) - return -EINVAL; - - if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * - adev->gfx.num_gfx_rings)) - return -ENOMEM; - - for (i = 0; i < adev->gfx.num_gfx_rings; i++) - kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i], - PREEMPT_QUEUES, 0, 0); - - if (adev->gfx.kiq[0].ring.sched.ready) - r = amdgpu_ring_test_helper(kiq_ring); - - return r; -} - static int gfx_v12_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - int r; uint32_t tmp; amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); @@ -3580,8 +3556,7 @@ static int gfx_v12_0_hw_fini(void *handle) if (!adev->no_hw_access) { if (amdgpu_async_gfx_ring) { - r = gfx_v12_0_kiq_disable_kgq(adev); - if (r) + if (amdgpu_gfx_disable_kgq(adev, 0)) DRM_ERROR("KGQ disable failed\n"); } From 470516c2925493594a690bc4d05b1f4471d9f996 Mon Sep 17 00:00:00 2001 From: "David (Ming Qiang) Wu" Date: Thu, 8 Aug 2024 12:19:50 -0400 Subject: [PATCH 0841/1052] drm/amd/amdgpu: command submission parser for JPEG Add JPEG IB command parser to ensure registers in the command are within the JPEG IP block. 
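For illustration only: the parser added below walks the IB and range-checks the register index decoded from each packet header. A standalone decode of one header, reusing the masks and shifts this patch adds to soc15d.h and the 0x4000-0x41c2 range from jpeg_v4_0_3.h (the sample dword is made up):

#include <stdio.h>
#include <stdint.h>

#define GET_REG(x)	((x) & 0x3FFFF)
#define GET_RES(x)	(((x) >> 18) & 0x3F)
#define GET_COND(x)	(((x) >> 24) & 0xF)
#define GET_TYPE(x)	(((x) >> 28) & 0xF)

int main(void)
{
	uint32_t dw = 0x00004123;	/* placeholder packet header */
	unsigned int reg = GET_REG(dw);

	printf("reg=0x%05x res=%u cond=%u type=%u in-range=%d\n",
	       reg, (unsigned)GET_RES(dw), (unsigned)GET_COND(dw),
	       (unsigned)GET_TYPE(dw), reg >= 0x4000 && reg <= 0x41c2);
	return 0;
}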
Reviewed-by: Alex Deucher Signed-off-by: David (Ming Qiang) Wu Signed-off-by: Alex Deucher (cherry picked from commit a7f670d5d8e77b092404ca8a35bb0f8f89ed3117) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 ++ drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 61 +++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h | 7 ++- drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c | 1 + drivers/gpu/drm/amd/amdgpu/soc15d.h | 6 +++ 5 files changed, 76 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 9aa952f258cf..6dfdff58bffd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1057,6 +1057,9 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p, r = amdgpu_ring_parse_cs(ring, p, job, ib); if (r) return r; + + if (ib->sa_bo) + ib->gpu_addr = amdgpu_sa_bo_gpu_addr(ib->sa_bo); } else { ib->ptr = (uint32_t *)kptr; r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib); diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index f4662920c653..6ae5a784e187 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -23,6 +23,7 @@ #include "amdgpu.h" #include "amdgpu_jpeg.h" +#include "amdgpu_cs.h" #include "soc15.h" #include "soc15d.h" #include "jpeg_v4_0_3.h" @@ -782,7 +783,11 @@ void jpeg_v4_0_3_dec_ring_emit_ib(struct amdgpu_ring *ring, amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JRBC_IB_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); + + if (ring->funcs->parse_cs) + amdgpu_ring_write(ring, 0); + else + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JPEG_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); @@ -1084,6 +1089,7 @@ static const struct amdgpu_ring_funcs jpeg_v4_0_3_dec_ring_vm_funcs = { .get_rptr = jpeg_v4_0_3_dec_ring_get_rptr, .get_wptr = jpeg_v4_0_3_dec_ring_get_wptr, .set_wptr = jpeg_v4_0_3_dec_ring_set_wptr, + .parse_cs = jpeg_v4_0_3_dec_ring_parse_cs, .emit_frame_size = SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 8 + @@ -1248,3 +1254,56 @@ static void jpeg_v4_0_3_set_ras_funcs(struct amdgpu_device *adev) { adev->jpeg.ras = &jpeg_v4_0_3_ras; } + +/** + * jpeg_v4_0_3_dec_ring_parse_cs - command submission parser + * + * @parser: Command submission parser context + * @job: the job to parse + * @ib: the IB to parse + * + * Parse the command stream, return -EINVAL for invalid packet, + * 0 otherwise + */ +int jpeg_v4_0_3_dec_ring_parse_cs(struct amdgpu_cs_parser *parser, + struct amdgpu_job *job, + struct amdgpu_ib *ib) +{ + uint32_t i, reg, res, cond, type; + struct amdgpu_device *adev = parser->adev; + + for (i = 0; i < ib->length_dw ; i += 2) { + reg = CP_PACKETJ_GET_REG(ib->ptr[i]); + res = CP_PACKETJ_GET_RES(ib->ptr[i]); + cond = CP_PACKETJ_GET_COND(ib->ptr[i]); + type = CP_PACKETJ_GET_TYPE(ib->ptr[i]); + + if (res) /* only support 0 at the moment */ + return -EINVAL; + + switch (type) { + case PACKETJ_TYPE0: + if (cond != PACKETJ_CONDITION_CHECK0 || reg < JPEG_REG_RANGE_START || reg > JPEG_REG_RANGE_END) { + dev_err(adev->dev, "Invalid packet [0x%08x]!\n", ib->ptr[i]); + return -EINVAL; + } + break; + case PACKETJ_TYPE3: + if (cond != PACKETJ_CONDITION_CHECK3 || reg < JPEG_REG_RANGE_START || reg > JPEG_REG_RANGE_END) { + dev_err(adev->dev, "Invalid packet [0x%08x]!\n", ib->ptr[i]); + return -EINVAL; + } + break; + 
case PACKETJ_TYPE6: + if (ib->ptr[i] == CP_PACKETJ_NOP) + continue; + dev_err(adev->dev, "Invalid packet [0x%08x]!\n", ib->ptr[i]); + return -EINVAL; + default: + dev_err(adev->dev, "Unknown packet type %d !\n", type); + return -EINVAL; + } + } + + return 0; +} diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h index 747a3e5f6856..71c54b294e15 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h @@ -46,6 +46,9 @@ #define JRBC_DEC_EXTERNAL_REG_WRITE_ADDR 0x18000 +#define JPEG_REG_RANGE_START 0x4000 +#define JPEG_REG_RANGE_END 0x41c2 + extern const struct amdgpu_ip_block_version jpeg_v4_0_3_ip_block; void jpeg_v4_0_3_dec_ring_emit_ib(struct amdgpu_ring *ring, @@ -62,5 +65,7 @@ void jpeg_v4_0_3_dec_ring_insert_end(struct amdgpu_ring *ring); void jpeg_v4_0_3_dec_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t val); void jpeg_v4_0_3_dec_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg, uint32_t val, uint32_t mask); - +int jpeg_v4_0_3_dec_ring_parse_cs(struct amdgpu_cs_parser *parser, + struct amdgpu_job *job, + struct amdgpu_ib *ib); #endif /* __JPEG_V4_0_3_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c index d694a276498a..f4daff90c770 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c @@ -646,6 +646,7 @@ static const struct amdgpu_ring_funcs jpeg_v5_0_0_dec_ring_vm_funcs = { .get_rptr = jpeg_v5_0_0_dec_ring_get_rptr, .get_wptr = jpeg_v5_0_0_dec_ring_get_wptr, .set_wptr = jpeg_v5_0_0_dec_ring_set_wptr, + .parse_cs = jpeg_v4_0_3_dec_ring_parse_cs, .emit_frame_size = SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 8 + diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h index 2357ff39323f..e74e1983da53 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h @@ -76,6 +76,12 @@ ((cond & 0xF) << 24) | \ ((type & 0xF) << 28)) +#define CP_PACKETJ_NOP 0x60000000 +#define CP_PACKETJ_GET_REG(x) ((x) & 0x3FFFF) +#define CP_PACKETJ_GET_RES(x) (((x) >> 18) & 0x3F) +#define CP_PACKETJ_GET_COND(x) (((x) >> 24) & 0xF) +#define CP_PACKETJ_GET_TYPE(x) (((x) >> 28) & 0xF) + /* Packet 3 types */ #define PACKET3_NOP 0x10 #define PACKET3_SET_BASE 0x11 From 507a2286c052919fe416b3daa0f0061d0fc702b9 Mon Sep 17 00:00:00 2001 From: Yinjie Yao Date: Fri, 9 Aug 2024 17:20:26 -0400 Subject: [PATCH 0842/1052] drm/amdgpu: Update kmd_fw_shared for VCN5 kmd_fw_shared changed in VCN5 Signed-off-by: Yinjie Yao Reviewed-by: Ruijing Dong Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit aa02486fb18cecbaca0c4fd393d1a03f1d4c3f9a) --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index 1a5439abd1a0..c87d68d4be53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -461,8 +461,11 @@ struct amdgpu_vcn5_fw_shared { struct amdgpu_fw_shared_unified_queue_struct sq; uint8_t pad1[8]; struct amdgpu_fw_shared_fw_logging fw_log; + uint8_t pad2[20]; struct amdgpu_fw_shared_rb_setup rb_setup; - uint8_t pad2[4]; + struct amdgpu_fw_shared_smu_interface_info smu_dpm_interface; + struct amdgpu_fw_shared_drm_key_wa drm_key_wa; + uint8_t pad3[9]; }; #define VCN_BLOCK_ENCODE_DISABLE_MASK 0x80 From 
23acd1f344e8102f803119d0c8fc4df4628d694f Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Thu, 8 Aug 2024 12:19:22 +0800 Subject: [PATCH 0843/1052] drm/amd/amdgpu: add HDP_SD support on gc 12.0.0/1 add HDP_SD support on gc 12.0.0/1 Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 61cffacb3a1c590b15c0e9ff987de02d293e0dd8) --- drivers/gpu/drm/amd/amdgpu/soc24.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c index 7d641d0dadba..b0c3678cfb31 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc24.c +++ b/drivers/gpu/drm/amd/amdgpu/soc24.c @@ -406,6 +406,7 @@ static int soc24_common_early_init(void *handle) AMD_CG_SUPPORT_ATHUB_MGCG | AMD_CG_SUPPORT_ATHUB_LS | AMD_CG_SUPPORT_MC_MGCG | + AMD_CG_SUPPORT_HDP_SD | AMD_CG_SUPPORT_MC_LS; adev->pg_flags = AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_JPEG | @@ -424,6 +425,7 @@ static int soc24_common_early_init(void *handle) AMD_CG_SUPPORT_ATHUB_MGCG | AMD_CG_SUPPORT_ATHUB_LS | AMD_CG_SUPPORT_MC_MGCG | + AMD_CG_SUPPORT_HDP_SD | AMD_CG_SUPPORT_MC_LS; adev->pg_flags = AMD_PG_SUPPORT_VCN | From 100bff23818eb61751ed05d64a7df36ce9728a4d Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Tue, 13 Aug 2024 15:17:27 +0000 Subject: [PATCH 0844/1052] perf/bpf: Don't call bpf_overflow_handler() for tracing events The regressing commit is new in 6.10. It assumed that anytime event->prog is set bpf_overflow_handler() should be invoked to execute the attached bpf program. This assumption is false for tracing events, and as a result the regressing commit broke bpftrace by invoking the bpf handler with garbage inputs on overflow. Prior to the regression the overflow handlers formed a chain (of length 0, 1, or 2) and perf_event_set_bpf_handler() (the !tracing case) added bpf_overflow_handler() to that chain, while perf_event_attach_bpf_prog() (the tracing case) did not. Both set event->prog. The chain of overflow handlers was replaced by a single overflow handler slot and a fixed call to bpf_overflow_handler() when appropriate. This modifies the condition there to check event->prog->type == BPF_PROG_TYPE_PERF_EVENT, restoring the previous behavior and fixing bpftrace. Signed-off-by: Kyle Huey Suggested-by: Andrii Nakryiko Reported-by: Joe Damato Closes: https://lore.kernel.org/lkml/ZpFfocvyF3KHaSzF@LQ3V64L9R2/ Fixes: f11f10bfa1ca ("perf/bpf: Call BPF handler directly, not through overflow machinery") Cc: stable@vger.kernel.org Tested-by: Joe Damato # bpftrace Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240813151727.28797-1-jdamato@fastly.com Signed-off-by: Alexei Starovoitov --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index aa3450bdc227..c973e3c11e03 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9706,7 +9706,8 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); - if (event->prog && !bpf_overflow_handler(event, data, regs)) + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && + !bpf_overflow_handler(event, data, regs)) return ret; /* From faada2174c08662ae98b439c69efe3e79382c538 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 13 Aug 2024 16:35:14 +0200 Subject: [PATCH 0845/1052] dm persistent data: fix memory allocation failure kmalloc is unreliable when allocating more than 8 pages of memory. 
It may fail when there is plenty of free memory but the memory is fragmented. Zdenek Kabelac observed such failure in his tests. This commit changes kmalloc to kvmalloc - kvmalloc will fall back to vmalloc if the large allocation fails. Signed-off-by: Mikulas Patocka Reported-by: Zdenek Kabelac Reviewed-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/persistent-data/dm-space-map-metadata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 04698fd03e60..d48c4fafc779 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -277,7 +277,7 @@ static void sm_metadata_destroy(struct dm_space_map *sm) { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - kfree(smm); + kvfree(smm); } static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) @@ -772,7 +772,7 @@ struct dm_space_map *dm_sm_metadata_init(void) { struct sm_metadata *smm; - smm = kmalloc(sizeof(*smm), GFP_KERNEL); + smm = kvmalloc(sizeof(*smm), GFP_KERNEL); if (!smm) return ERR_PTR(-ENOMEM); From f50733b45d865f91db90919f8311e2127ce5a0cb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 8 Aug 2024 11:39:08 -0700 Subject: [PATCH 0846/1052] exec: Fix ToCToU between perm check and set-uid/gid usage When opening a file for exec via do_filp_open(), permission checking is done against the file's metadata at that moment, and on success, a file pointer is passed back. Much later in the execve() code path, the file metadata (specifically mode, uid, and gid) is used to determine if/how to set the uid and gid. However, those values may have changed since the permissions check, meaning the execution may gain unintended privileges. For example, if a file could change permissions from executable and not set-id: ---------x 1 root root 16048 Aug 7 13:16 target to set-id and non-executable: ---S------ 1 root root 16048 Aug 7 13:16 target it is possible to gain root privileges when execution should have been disallowed. While this race condition is rare in real-world scenarios, it has been observed (and proven exploitable) when package managers are updating the setuid bits of installed programs. Such files start with being world-executable but then are adjusted to be group-exec with a set-uid bit. For example, "chmod o-x,u+s target" makes "target" executable only by uid "root" and gid "cdrom", while also becoming setuid-root: -rwxr-xr-x 1 root cdrom 16048 Aug 7 13:16 target becomes: -rwsr-xr-- 1 root cdrom 16048 Aug 7 13:16 target But racing the chmod means users without group "cdrom" membership can get the permission to execute "target" just before the chmod, and when the chmod finishes, the exec reaches brpm_fill_uid(), and performs the setuid to root, violating the expressed authorization of "only cdrom group members can setuid to root". Re-check that we still have execute permissions in case the metadata has changed. It would be better to keep a copy from the perm-check time, but until we can do that refactoring, the least-bad option is to do a full inode_permission() call (under inode lock). It is understood that this is safe against dead-locks, but hardly optimal. 
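To make the window concrete, here is a rough, purely illustrative userspace sketch of the interleaving (the "./target" path and the two modes mirror the example above; this is not part of the patch, and the mode-flipping side stands in for the root-owned package manager):

    #include <sys/types.h>
    #include <sys/stat.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
    	if (fork() == 0) {
    		/* stands in for the package manager flipping permissions */
    		for (;;) {
    			chmod("./target", 0001);  /* ---------x */
    			chmod("./target", 04000); /* ---S------ */
    		}
    	}

    	for (;;) {
    		/* unprivileged side: keep trying to exec the file */
    		pid_t pid = fork();
    		if (pid == 0) {
    			execl("./target", "target", (char *)NULL);
    			_exit(1); /* exec was refused on this attempt */
    		}
    		waitpid(pid, NULL, 0);
    	}
    }

An execve() admitted while the file was still ---------x can reach bprm_fill_uid() after the file has already become ---S------ and inherit the set-uid credentials, which is exactly what the re-check below refuses.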
Reported-by: Marco Vanotti Tested-by: Marco Vanotti Suggested-by: Linus Torvalds Cc: stable@vger.kernel.org Cc: Eric Biederman Cc: Alexander Viro Cc: Christian Brauner Signed-off-by: Kees Cook --- fs/exec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index a126e3d1cacb..50e76cc633c4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1692,6 +1692,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; + int err; if (!mnt_may_suid(file->f_path.mnt)) return; @@ -1708,12 +1709,17 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) /* Be careful if suid/sgid is set */ inode_lock(inode); - /* reload atomically mode/uid/gid now that lock held */ + /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); + err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); + /* Did the exec bit vanish out from under us? Give up. */ + if (err) + return; + /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) From b739dffa5d570b411d4bdf4bb9b8dfd6b7d72305 Mon Sep 17 00:00:00 2001 From: Stefan Wiehler Date: Mon, 12 Aug 2024 12:06:51 +0200 Subject: [PATCH 0847/1052] of/irq: Prevent device address out-of-bounds read in interrupt map walk When of_irq_parse_raw() is invoked with a device address smaller than the interrupt parent node (from #address-cells property), KASAN detects the following out-of-bounds read when populating the initial match table (dyndbg="func of_irq_parse_* +p"): OF: of_irq_parse_one: dev=/soc@0/picasso/watchdog, index=0 OF: parent=/soc@0/pci@878000000000/gpio0@17,0, intsize=2 OF: intspec=4 OF: of_irq_parse_raw: ipar=/soc@0/pci@878000000000/gpio0@17,0, size=2 OF: -> addrsize=3 ================================================================== BUG: KASAN: slab-out-of-bounds in of_irq_parse_raw+0x2b8/0x8d0 Read of size 4 at addr ffffff81beca5608 by task bash/764 CPU: 1 PID: 764 Comm: bash Tainted: G O 6.1.67-484c613561-nokia_sm_arm64 #1 Hardware name: Unknown Unknown Product/Unknown Product, BIOS 2023.01-12.24.03-dirty 01/01/2023 Call trace: dump_backtrace+0xdc/0x130 show_stack+0x1c/0x30 dump_stack_lvl+0x6c/0x84 print_report+0x150/0x448 kasan_report+0x98/0x140 __asan_load4+0x78/0xa0 of_irq_parse_raw+0x2b8/0x8d0 of_irq_parse_one+0x24c/0x270 parse_interrupts+0xc0/0x120 of_fwnode_add_links+0x100/0x2d0 fw_devlink_parse_fwtree+0x64/0xc0 device_add+0xb38/0xc30 of_device_add+0x64/0x90 of_platform_device_create_pdata+0xd0/0x170 of_platform_bus_create+0x244/0x600 of_platform_notify+0x1b0/0x254 blocking_notifier_call_chain+0x9c/0xd0 __of_changeset_entry_notify+0x1b8/0x230 __of_changeset_apply_notify+0x54/0xe4 of_overlay_fdt_apply+0xc04/0xd94 ... 
The buggy address belongs to the object at ffffff81beca5600 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 8 bytes inside of 128-byte region [ffffff81beca5600, ffffff81beca5680) The buggy address belongs to the physical page: page:00000000230d3d03 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1beca4 head:00000000230d3d03 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x8000000000010200(slab|head|zone=2) raw: 8000000000010200 0000000000000000 dead000000000122 ffffff810000c300 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffffff81beca5500: 04 fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffffff81beca5580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffffff81beca5600: 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffffff81beca5680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffffff81beca5700: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc ================================================================== OF: -> got it ! Prevent the out-of-bounds read by copying the device address into a buffer of sufficient size. Signed-off-by: Stefan Wiehler Link: https://lore.kernel.org/r/20240812100652.3800963-1-stefan.wiehler@nokia.com Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index c94203ce65bb..8fd63100ba8f 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -344,7 +344,8 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar struct device_node *p; const __be32 *addr; u32 intsize; - int i, res; + int i, res, addr_len; + __be32 addr_buf[3] = { 0 }; pr_debug("of_irq_parse_one: dev=%pOF, index=%d\n", device, index); @@ -353,13 +354,19 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar return of_irq_parse_oldworld(device, index, out_irq); /* Get the reg property (if any) */ - addr = of_get_property(device, "reg", NULL); + addr = of_get_property(device, "reg", &addr_len); + + /* Prevent out-of-bounds read in case of longer interrupt parent address size */ + if (addr_len > (3 * sizeof(__be32))) + addr_len = 3 * sizeof(__be32); + if (addr) + memcpy(addr_buf, addr, addr_len); /* Try the new-style interrupts-extended first */ res = of_parse_phandle_with_args(device, "interrupts-extended", "#interrupt-cells", index, out_irq); if (!res) - return of_irq_parse_raw(addr, out_irq); + return of_irq_parse_raw(addr_buf, out_irq); /* Look for the interrupt parent. */ p = of_irq_find_parent(device); @@ -389,7 +396,7 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar /* Check if there are any interrupt-map translations to process */ - res = of_irq_parse_raw(addr, out_irq); + res = of_irq_parse_raw(addr_buf, out_irq); out: of_node_put(p); return res; From 4e91fa1ef3ce6290b4c598e54b5eb6cf134fbec8 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Mon, 12 Aug 2024 21:40:28 +0200 Subject: [PATCH 0848/1052] i2c: qcom-geni: Add missing geni_icc_disable in geni_i2c_runtime_resume Add the missing geni_icc_disable() call before returning in the geni_i2c_runtime_resume() function. Commit 9ba48db9f77c ("i2c: qcom-geni: Add missing geni_icc_disable in geni_i2c_runtime_resume") by Gaosheng missed disabling the interconnect in one case. 
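The shape of the fix is the usual unwind-on-error pattern for a runtime-resume path: every resource enabled earlier must be released on each failing exit. A generic sketch with placeholder names (not the GENI driver's API; the real code adds geni_icc_disable() on the clk_prepare_enable() failure path, as in the diff below):

    struct example_dev;                                /* opaque placeholder */

    int  example_icc_enable(struct example_dev *d);    /* placeholder */
    void example_icc_disable(struct example_dev *d);   /* placeholder */
    int  example_clk_enable(struct example_dev *d);    /* placeholder */

    static int example_runtime_resume(struct example_dev *d)
    {
    	int ret;

    	ret = example_icc_enable(d);
    	if (ret)
    		return ret;

    	ret = example_clk_enable(d);
    	if (ret) {
    		/* the step this patch adds: undo the earlier enable */
    		example_icc_disable(d);
    		return ret;
    	}

    	return 0;
    }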
Fixes: bf225ed357c6 ("i2c: i2c-qcom-geni: Add interconnect support") Cc: Gaosheng Cui Cc: stable@vger.kernel.org # v5.9+ Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-qcom-geni.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index 365e37bba0f3..06e836e3e877 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -986,8 +986,10 @@ static int __maybe_unused geni_i2c_runtime_resume(struct device *dev) return ret; ret = clk_prepare_enable(gi2c->core_clk); - if (ret) + if (ret) { + geni_icc_disable(&gi2c->se); return ret; + } ret = geni_se_resources_on(&gi2c->se); if (ret) { From 655111b838cdabdb604f3625a9ff08c5eedb11da Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Mon, 12 Aug 2024 08:51:23 +0200 Subject: [PATCH 0849/1052] mptcp: correct MPTCP_SUBFLOW_ATTR_SSN_OFFSET reserved size ssn_offset field is u32 and is placed into the netlink response with nla_put_u32(), but only 2 bytes are reserved for the attribute payload in subflow_get_info_size() (even though it makes no difference in the end, as it is aligned up to 4 bytes). Supply the correct argument to the relevant nla_total_size() call to make it less confusing. Fixes: 5147dfb50832 ("mptcp: allow dumping subflow context to userspace") Signed-off-by: Eugene Syromiatnikov Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240812065024.GA19719@asgard.redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index 3ae46b545d2c..2d3efb405437 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -94,7 +94,7 @@ static size_t subflow_get_info_size(const struct sock *sk) nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ - nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ From a24e6e7146e361aa0855cf8ee3b2e80b8eb692e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 22:40:39 -0400 Subject: [PATCH 0850/1052] bcachefs: delete faulty fastpath in bch2_btree_path_traverse_cached() bch2_btree_path_traverse_cached() was previously checking if it could just relock the path, which is a common idiom in path traversal. However, it was using btree_node_relock(), not btree_path_relock(); btree_path_relock() only succeeds if the path was in state BTREE_ITER_NEED_RELOCK. If the path was in state BTREE_ITER_NEED_TRAVERSE a full traversal is needed; this led to a null ptr deref in bch2_btree_path_traverse_cached(). And the short circuit check here isn't needed, since it was already done in the main bch2_btree_path_traverse_one(). 
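As a rough abstraction of the state machine involved (standalone sketch, not bcachefs code; the real states are BTREE_ITER_UPTODATE, BTREE_ITER_NEED_RELOCK and BTREE_ITER_NEED_TRAVERSE):

    #include <stdbool.h>

    enum path_state { UPTODATE, NEED_RELOCK, NEED_TRAVERSE };

    struct path {
    	enum path_state state;
    	/* ... locks, per-level node pointers ... */
    };

    bool try_relock(struct path *p);     /* cheap: retake the locks we held */
    int  traverse_slow(struct path *p);  /* full traversal from the root */

    static int traverse(struct path *p)
    {
    	/* The fast path is only legal when the path merely needs relocking;
    	 * NEED_TRAVERSE means the cached pointers can no longer be trusted. */
    	if (p->state == NEED_RELOCK && try_relock(p)) {
    		p->state = UPTODATE;
    		return 0;
    	}
    	return traverse_slow(p);
    }

The deleted check relocked a single node without making this distinction, so it could short-circuit a path that actually needed the full traversal.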
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index f2f2e525460b..79954490627c 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -497,11 +497,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path path->l[1].b = NULL; - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - path->uptodate = BTREE_ITER_UPTODATE; - return 0; - } - int ret; do { ret = btree_path_traverse_cached_fast(trans, path); From bd864bc2d90790e00b02b17c75fb951cb4b0bb8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 23:24:03 -0400 Subject: [PATCH 0851/1052] bcachefs: Fix bch2_trigger_alloc when upgrading from old versions bch2_trigger_alloc was assuming that the new key would always be newly created and thus always an alloc_v4 key, but - not when called from btree_gc. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d9c5a92fa708..0a8a1bc9a4ac 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -829,7 +829,19 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; + + struct bch_alloc_v4 *new_a; + if (likely(new.k->type == KEY_TYPE_alloc_v4)) { + new_a = bkey_s_to_alloc_v4(new).v; + } else { + BUG_ON(!(flags & BTREE_TRIGGER_gc)); + + struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); + ret = PTR_ERR_OR_ZERO(new_ka); + if (unlikely(ret)) + goto err; + new_a = &new_ka->v; + } if (flags & BTREE_TRIGGER_transactional) { alloc_data_type_set(new_a, new_a->data_type); From d9e615762bf2eb7459fb0f270525f8b186bce6b7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 04:53:12 -0400 Subject: [PATCH 0852/1052] bcachefs: bch2_accounting_invalid() fixup Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 046ac92b6639..212f53927111 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -154,7 +154,7 @@ int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, "accounting key replicas entry with bad nr_required"); for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) - bkey_fsck_err_on(acc_k.replicas.devs[i] > acc_k.replicas.devs[i + 1], + bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], c, err, accounting_key_replicas_devs_unsorted, "accounting key replicas entry with unsorted devs"); From 486d920735325e507d965c2639ba2775b81fd329 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 22:47:55 -0400 Subject: [PATCH 0853/1052] bcachefs: disk accounting: ignore unknown types forward compat fix Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 212f53927111..03a9de6c2e0a 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -528,6 +528,9 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct disk_accounting_pos 
acc_k; bpos_to_disk_accounting_pos(&acc_k, e->pos); + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + continue; + u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; @@ -760,6 +763,12 @@ void bch2_verify_accounting_clean(struct bch_fs *c) struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); unsigned nr = bch2_accounting_counters(k.k); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + continue; + bch2_accounting_mem_read(c, k.k->p, v, nr); if (memcmp(a.v->d, v, nr * sizeof(u64))) { @@ -775,9 +784,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c) mismatch = true; } - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; From 48d6cc1b4895ada0781da11a0a483332a236ec14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 01:01:35 -0400 Subject: [PATCH 0854/1052] bcachefs: Add missing downgrade table entry Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 6c4469f53313..9f82d497d9e0 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -104,6 +104,7 @@ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_accounting_replicas_not_marked, \ BCH_FSCK_ERR_bkey_version_in_future) struct upgrade_downgrade_entry { From 968feb854a86b59cc4bc72af3105989706ca2c7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 16:34:28 -0400 Subject: [PATCH 0855/1052] bcachefs: Convert for_each_btree_node() to lockrestart_do() for_each_btree_node() now works similarly to for_each_btree_key(), where the loop body is passed as an argument to be passed to lockrestart_do(). This now calls trans_begin() on every loop iteration - which fixes an SRCU warning in backpointers fsck. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 15 +++++--------- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_iter.h | 42 ++++++++++++++++++++++++-------------- fs/bcachefs/debug.c | 38 ++++++++-------------------------- 4 files changed, 42 insertions(+), 54 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 3cc02479a982..9edc4c5f735c 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -763,27 +763,22 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, btree < BTREE_ID_NR && !ret; btree++) { unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; - struct btree_iter iter; - struct btree *b; if (!(BIT_ULL(btree) & btree_leaf_mask) && !(BIT_ULL(btree) & btree_interior_mask)) continue; - bch2_trans_begin(trans); - - __for_each_btree_node(trans, iter, btree, + ret = __for_each_btree_node(trans, iter, btree, btree == start.btree ? 
start.pos : POS_MIN, - 0, depth, BTREE_ITER_prefetch, b, ret) { + 0, depth, BTREE_ITER_prefetch, b, ({ mem_may_pin -= btree_buf_bytes(b); if (mem_may_pin <= 0) { c->btree_cache.pinned_nodes_end = *end = BBPOS(btree, b->key.k.p); - bch2_trans_iter_exit(trans, &iter); - return 0; + break; } - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } return ret; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index aa8a049071f4..2e84d22e17bd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1900,6 +1900,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) goto out; } +/* Only kept for -tools */ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) { struct btree *b; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c7725865309c..dca62375d7d3 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -600,23 +600,35 @@ void bch2_trans_srcu_unlock(struct btree_trans *); u32 bch2_trans_begin(struct btree_trans *); -/* - * XXX - * this does not handle transaction restarts from bch2_btree_iter_next_node() - * correctly - */ -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b, _ret) \ - for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ - _start, _locks_want, _depth, _flags); \ - (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ - !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ - (_b) = bch2_btree_iter_next_node(&(_iter))) +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _do) \ +({ \ + bch2_trans_begin((_trans)); \ + \ + struct btree_iter _iter; \ + bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \ + _start, _locks_want, _depth, _flags); \ + int _ret3 = 0; \ + do { \ + _ret3 = lockrestart_do((_trans), ({ \ + struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ + if (!_b) \ + break; \ + \ + PTR_ERR_OR_ZERO(_b) ?: (_do); \ + })) ?: \ + lockrestart_do((_trans), \ + PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ + } while (!_ret3); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b, _ret) \ - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b, _ret) + _flags, _b, _do) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _do) static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ebabab171fe5..45aec1afdb0e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -397,47 +397,27 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct btree *b; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); + ssize_t ret = flush_buf(i); if (ret) return ret; if (bpos_eq(SPOS_MAX, i->from)) return i->ret; - trans = bch2_trans_get(i->c); -retry: - bch2_trans_begin(trans); + return bch2_trans_run(i->c, + for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ + bch2_btree_node_to_text(&i->buf, i->c, b); + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) + ? 
bpos_successor(b->key.k.p) + : b->key.k.p; - for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) { - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = !bpos_eq(SPOS_MAX, b->key.k.p) - ? bpos_successor(b->key.k.p) - : b->key.k.p; - - ret = drop_locks_do(trans, flush_buf(i)); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); - - return ret ?: i->ret; + drop_locks_do(trans, flush_buf(i)); + }))) ?: i->ret; } static const struct file_operations btree_format_debug_ops = { From b2f11c6f3e1fc60742673b8675c95b78447f3dae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 21:04:35 -0400 Subject: [PATCH 0856/1052] lib/generic-radix-tree.c: Fix rare race in __genradix_ptr_alloc() If we need to increase the tree depth, allocate a new node, and then race with another thread that increased the tree depth before us, we'll still have a preallocated node that might be used later. If we then use that node for a new non-root node, it'll still have a pointer to the old root instead of being zeroed - fix this by zeroing it in the cmpxchg failure path. Signed-off-by: Kent Overstreet --- lib/generic-radix-tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index aaefb9b678c8..fa692c86f069 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -121,6 +121,8 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) { v = new_root; new_node = NULL; + } else { + new_node->children[0] = NULL; } } From 7254555c440ff6b136aa97fb3c33fd5e0bb4fb9f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 14:40:09 -0400 Subject: [PATCH 0857/1052] bcachefs: Add hysteresis to waiting on btree key cache flush This helps ensure key cache reclaim isn't contending with threads waiting for the key cache to be helped, and fixes a severe performance bug. 
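Concretely, the hysteresis is two watermarks rather than one: transactions start waiting once the number of dirty keys crosses a high threshold and are only released after reclaim has drained it below a lower one. A standalone restatement of the arithmetic used in the helpers below:

    #include <stdbool.h>
    #include <stddef.h>

    /* Same thresholds as bch2_btree_key_cache_must_wait() and the new
     * bch2_btree_key_cache_wait_done() in the diff that follows. */
    static bool key_cache_must_wait(size_t nr_dirty, size_t nr_keys)
    {
    	return nr_dirty > 4096 + (nr_keys * 3) / 4;	/* start blocking */
    }

    static bool key_cache_wait_done(size_t nr_dirty, size_t nr_keys)
    {
    	return nr_dirty <= 2048 + (nr_keys * 5) / 8;	/* wake blocked waiters */
    }

Because the wake threshold is lower than the block threshold, a thread that has just been woken cannot immediately block again, which is what stops key cache reclaim and the waiting threads from contending around a single cut-off.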
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 9 +++++++++ fs/bcachefs/btree_trans_commit.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index e6b2cd0dd2c1..113309f62918 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -20,6 +20,15 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) return nr_dirty > max_dirty; } +static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 2048 + (nr_keys * 5) / 8; + + return nr_dirty <= max_dirty; +} + int bch2_btree_key_cache_journal_flush(struct journal *, struct journal_entry_pin *, u64); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index cca336fe46e9..f567bfb82850 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -927,7 +927,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags static int journal_reclaim_wait_done(struct bch_fs *c) { int ret = bch2_journal_error(&c->journal) ?: - !bch2_btree_key_cache_must_wait(c); + bch2_btree_key_cache_wait_done(c); if (!ret) journal_reclaim_kick(&c->journal); From 790666c8ac6427ddaa00f502dc44073c1e039355 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 14:31:17 -0400 Subject: [PATCH 0858/1052] bcachefs: Improve trans_blocked_journal_reclaim tracepoint include information about the state of the btree key cache Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 9 +++++++-- fs/bcachefs/trace.c | 1 + fs/bcachefs/trace.h | 27 +++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 113309f62918..51d6289b8dee 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -11,13 +11,18 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) return max_t(ssize_t, 0, nr_dirty - max_dirty); } -static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c) { size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty; + return nr_dirty - max_dirty; +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + return __bch2_btree_key_cache_must_wait(c) > 0; } static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index dc48b52b01b4..dfad1d06633d 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -4,6 +4,7 @@ #include "buckets.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "keylist.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index d0e6b9deb6cb..c62f00322d1e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -988,10 +988,33 @@ TRACE_EVENT(trans_restart_split_race, __entry->u64s_remaining) ); -DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, +TRACE_EVENT(trans_blocked_journal_reclaim, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + 
TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + + __field(unsigned long, key_cache_nr_keys ) + __field(unsigned long, key_cache_nr_dirty ) + __field(long, must_wait ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys); + __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty); + __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c); + ), + + TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->key_cache_nr_keys, + __entry->key_cache_nr_dirty, + __entry->must_wait) ); TRACE_EVENT(trans_restart_journal_preres_get, From 06a8693b890c0cf7d94bf7c6f0e2adf3a3aaa346 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 15:48:18 -0400 Subject: [PATCH 0859/1052] bcachefs: Add a time_stat for blocked on key cache flush Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_trans_commit.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index eedf2d6045e7..0c7086e00d18 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -447,6 +447,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_low_on_space) \ x(blocked_journal_low_on_pin) \ x(blocked_journal_max_in_flight) \ + x(blocked_key_cache_flush) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ x(blocked_write_buffer_full) \ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index f567bfb82850..ac0c92683aad 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -973,9 +973,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, bch2_trans_unlock(trans); trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); + + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false); + if (ret < 0) break; From c99471024f24b3cbafc02bf5b112ecf34b0dbd40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 23:29:46 -0400 Subject: [PATCH 0860/1052] bcachefs: Fix warning in __bch2_fsck_err() for trans not passed in Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2c424435ca4a..70ebcca08ba2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1767,6 +1767,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); + /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { From d97de0d017cde0d442c3d144b4f969f43064cc0f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 21:31:25 -0400 Subject: [PATCH 0861/1052] bcachefs: Make bkey_fsck_err() a wrapper around fsck_err() bkey_fsck_err() was added as an interface that looks like fsck_err(), but previously all it did was ensure that the appropriate error counter was incremented in the superblock. This is a cleanup and bugfix patch that converts it to a wrapper around fsck_err(). 
This is needed to fix an issue with the upgrade path to disk_accounting_v3, where the "silent fix" error list now includes bkey_fsck errors; fsck_err() handles this in a unified way, and since we need to change printing of bkey fsck errors from the caller to the inner bkey_fsck_err() calls, this ends up being a pretty big change. Als,, rename .invalid() methods to .validate(), for clarity, while we're changing the function signature anyways (to drop the printbuf argument). Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 63 ++++++------ fs/bcachefs/alloc_background.h | 26 +++-- fs/bcachefs/backpointers.c | 8 +- fs/bcachefs/backpointers.h | 5 +- fs/bcachefs/bkey.h | 7 +- fs/bcachefs/bkey_methods.c | 109 ++++++++++----------- fs/bcachefs/bkey_methods.h | 21 ++-- fs/bcachefs/btree_io.c | 67 ++++--------- fs/bcachefs/btree_node_scan.c | 2 +- fs/bcachefs/btree_trans_commit.c | 72 +++----------- fs/bcachefs/btree_update_interior.c | 16 +--- fs/bcachefs/data_update.c | 6 +- fs/bcachefs/dirent.c | 33 ++++--- fs/bcachefs/dirent.h | 5 +- fs/bcachefs/disk_accounting.c | 13 ++- fs/bcachefs/disk_accounting.h | 5 +- fs/bcachefs/ec.c | 15 ++- fs/bcachefs/ec.h | 5 +- fs/bcachefs/errcode.h | 1 + fs/bcachefs/error.c | 22 +++++ fs/bcachefs/error.h | 39 ++++---- fs/bcachefs/extents.c | 144 ++++++++++++++-------------- fs/bcachefs/extents.h | 24 ++--- fs/bcachefs/inode.c | 77 +++++++-------- fs/bcachefs/inode.h | 24 ++--- fs/bcachefs/journal_io.c | 24 +---- fs/bcachefs/lru.c | 9 +- fs/bcachefs/lru.h | 5 +- fs/bcachefs/quota.c | 8 +- fs/bcachefs/quota.h | 5 +- fs/bcachefs/reflink.c | 19 ++-- fs/bcachefs/reflink.h | 22 ++--- fs/bcachefs/snapshot.c | 42 ++++---- fs/bcachefs/snapshot.h | 11 +-- fs/bcachefs/subvolume.c | 16 ++-- fs/bcachefs/subvolume.h | 5 +- fs/bcachefs/xattr.c | 21 ++-- fs/bcachefs/xattr.h | 5 +- 38 files changed, 448 insertions(+), 553 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0a8a1bc9a4ac..fd3a2522bc3e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -196,75 +196,71 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; /* allow for unknown fields */ - bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, - alloc_v1_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), + c, alloc_v1_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); fsck_err: return ret; } -int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret = 0; - bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } -int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret 
= 0; - bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } -int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int ret = 0; - bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err, - alloc_v4_val_size_bad, + bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), + c, alloc_v4_val_size_bad, "bad val size (%u > %zu)", alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, - alloc_v4_backpointers_start_bad, + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), + c, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); - bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, - alloc_key_data_type_bad, + bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, + c, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); for (unsigned i = 0; i < 2; i++) bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, - c, err, - alloc_key_io_time_bad, + c, alloc_key_io_time_bad, "invalid io_time[%s]: %llu, max %llu", i == READ ? "read" : "write", a.v->io_time[i], LRU_TIME_MAX); @@ -282,7 +278,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, a.v->dirty_sectors || a.v->cached_sectors || a.v->stripe, - c, err, alloc_key_empty_but_have_data, + c, alloc_key_empty_but_have_data, "empty data type free but have data %u.%u.%u %u", stripe_sectors, a.v->dirty_sectors, @@ -296,7 +292,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_parity: bkey_fsck_err_on(!a.v->dirty_sectors && !stripe_sectors, - c, err, alloc_key_dirty_sectors_0, + c, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", bch2_data_type_str(a.v->data_type)); break; @@ -305,12 +301,12 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, a.v->dirty_sectors || stripe_sectors || a.v->stripe, - c, err, alloc_key_cached_inconsistency, + c, alloc_key_cached_inconsistency, "data type inconsistency"); bkey_fsck_err_on(!a.v->io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, - c, err, alloc_key_cached_but_read_time_zero, + c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: @@ -513,14 +509,13 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) : 0; } -int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, - bucket_gens_val_size_bad, + bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), + c, bucket_gens_val_size_bad, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); fsck_err: diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 96a0444ea78f..260e7fa83d05 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -240,52 +240,48 @@ struct 
bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v1_invalid, \ + .key_validate = bch2_alloc_v1_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v2_invalid, \ + .key_validate = bch2_alloc_v2_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v3_invalid, \ + .key_validate = bch2_alloc_v3_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v4_invalid, \ + .key_validate = bch2_alloc_v4_validate, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ }) -int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ - .key_invalid = bch2_bucket_gens_invalid, \ + .key_validate = bch2_bucket_gens_validate, \ .val_to_text = bch2_bucket_gens_to_text, \ }) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 9edc4c5f735c..d4da6343efa9 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -47,9 +47,8 @@ static bool extent_matches_bp(struct bch_fs *c, return false; } -int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); @@ -68,8 +67,7 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || !bpos_eq(bp.k->p, bp_pos), - c, err, - backpointer_bucket_offset_wrong, + c, backpointer_bucket_offset_wrong, "backpointer bucket_offset wrong"); fsck_err: return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 6021de1c5e98..7daecadb764e 100644 --- a/fs/bcachefs/backpointers.h +++ 
b/fs/bcachefs/backpointers.h @@ -18,14 +18,13 @@ static inline u64 swab40(u64 x) ((x & 0xff00000000ULL) >> 32)); } -int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, - enum bch_validate_flags, struct printbuf *); +int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ - .key_invalid = bch2_backpointer_invalid, \ + .key_validate = bch2_backpointer_validate, \ .val_to_text = bch2_backpointer_k_to_text, \ .swab = bch2_backpointer_swab, \ .min_val_size = 32, \ diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 936357149cf0..e34cb2bf329c 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -10,9 +10,10 @@ #include "vstructs.h" enum bch_validate_flags { - BCH_VALIDATE_write = (1U << 0), - BCH_VALIDATE_commit = (1U << 1), - BCH_VALIDATE_journal = (1U << 2), + BCH_VALIDATE_write = BIT(0), + BCH_VALIDATE_commit = BIT(1), + BCH_VALIDATE_journal = BIT(2), + BCH_VALIDATE_silent = BIT(3), }; #if 0 diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5f07cf853d0c..88d8958281e8 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -27,27 +27,27 @@ const char * const bch2_bkey_types[] = { NULL }; -static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } #define bch2_bkey_ops_deleted ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) #define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) -static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k), c, err, - bkey_val_size_nonzero, + bkey_fsck_err_on(bkey_val_bytes(k.k), + c, bkey_val_size_nonzero, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k)); fsck_err: @@ -55,11 +55,11 @@ static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, } #define bch2_bkey_ops_error ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } @@ -73,17 +73,17 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, } #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_invalid = key_type_cookie_invalid, \ + .key_validate = key_type_cookie_validate, \ .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int 
key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } @@ -98,9 +98,9 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, datalen, min(datalen, 32U), d.v->data); } -#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ - .key_invalid = key_type_inline_data_invalid, \ - .val_to_text = key_type_inline_data_to_text, \ +#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ + .key_validate = key_type_inline_data_validate, \ + .val_to_text = key_type_inline_data_to_text, \ }) static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) @@ -110,7 +110,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ } #define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ .key_merge = key_type_set_merge, \ }) @@ -123,9 +123,8 @@ const struct bkey_ops bch2_bkey_ops[] = { const struct bkey_ops bch2_bkey_null_ops = { }; -int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -133,15 +132,15 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err, - bkey_val_size_too_small, + bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, + c, bkey_val_size_too_small, "bad val size (%zu < %u)", bkey_val_bytes(k.k), ops->min_val_size); - if (!ops->key_invalid) + if (!ops->key_validate) return 0; - ret = ops->key_invalid(c, k, flags, err); + ret = ops->key_validate(c, k, flags); fsck_err: return ret; } @@ -161,18 +160,17 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) return type == BKEY_TYPE_btree ? 
"internal btree node" : bch2_btree_id_str(type - 1); } -int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags, - struct printbuf *err) +int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + enum bch_validate_flags flags) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; int ret = 0; - bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, - bkey_u64s_too_small, + bkey_fsck_err_on(k.k->u64s < BKEY_U64s, + c, bkey_u64s_too_small, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); if (type >= BKEY_TYPE_NR) @@ -180,8 +178,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, - bkey_invalid_type_for_btree, + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), + c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", bch2_btree_node_type_str(type), k.k->type < KEY_TYPE_MAX @@ -189,17 +187,17 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - bkey_fsck_err_on(k.k->size == 0, c, err, - bkey_extent_size_zero, + bkey_fsck_err_on(k.k->size == 0, + c, bkey_extent_size_zero, "size == 0"); - bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err, - bkey_extent_size_greater_than_offset, + bkey_fsck_err_on(k.k->size > k.k->p.offset, + c, bkey_extent_size_greater_than_offset, "size greater than offset (%u > %llu)", k.k->size, k.k->p.offset); } else { - bkey_fsck_err_on(k.k->size, c, err, - bkey_size_nonzero, + bkey_fsck_err_on(k.k->size, + c, bkey_size_nonzero, "size != 0"); } @@ -207,12 +205,12 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_id btree = type - 1; if (btree_type_has_snapshots(btree)) { - bkey_fsck_err_on(!k.k->p.snapshot, c, err, - bkey_snapshot_zero, + bkey_fsck_err_on(!k.k->p.snapshot, + c, bkey_snapshot_zero, "snapshot == 0"); } else if (!btree_type_has_snapshot_field(btree)) { - bkey_fsck_err_on(k.k->p.snapshot, c, err, - bkey_snapshot_nonzero, + bkey_fsck_err_on(k.k->p.snapshot, + c, bkey_snapshot_nonzero, "nonzero snapshot"); } else { /* @@ -221,34 +219,33 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, */ } - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err, - bkey_at_pos_max, + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), + c, bkey_at_pos_max, "key at POS_MAX"); } fsck_err: return ret; } -int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, +int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bch_validate_flags flags, - struct printbuf *err) + enum bch_validate_flags flags) { - return __bch2_bkey_invalid(c, k, type, flags, err) ?: - bch2_bkey_val_invalid(c, k, flags, err); + return __bch2_bkey_validate(c, k, type, flags) ?: + bch2_bkey_val_validate(c, k, flags); } int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, struct printbuf *err) + struct bkey_s_c k, enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err, - bkey_before_start_of_btree_node, + bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), + c, bkey_before_start_of_btree_node, "key before start of btree node"); - bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err, - bkey_after_end_of_btree_node, + bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), 
+ c, bkey_after_end_of_btree_node, "key past end of btree node"); fsck_err: return ret; diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index baef0722f5fb..3df3dd2723a1 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -14,15 +14,15 @@ extern const char * const bch2_bkey_types[]; extern const struct bkey_ops bch2_bkey_null_ops; /* - * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If * invalid, entire key will be deleted. * * When invalid, error string is returned via @err. @rw indicates whether key is * being read or written; more aggressive checks can be enabled when rw == WRITE. */ struct bkey_ops { - int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err); + int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -48,14 +48,13 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) : &bch2_bkey_null_ops; } -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags, struct printbuf *); -int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags, struct printbuf *); -int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, - struct bkey_s_c, struct printbuf *); +int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bch_validate_flags); +int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bch_validate_flags); +int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, + enum bch_validate_flags); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 70ebcca08ba2..56ea9a77cd4a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -836,14 +836,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, return ret; } -static int bset_key_invalid(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - bool updated_range, int rw, - struct printbuf *err) +static int bset_key_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, int rw) { - return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: - (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); + return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: + (rw == WRITE ? 
bch2_bkey_val_validate(c, k, 0) : 0); } static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, @@ -858,12 +857,9 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, if (!bkeyp_u64s_valid(&b->format, k)) return false; - struct printbuf buf = PRINTBUF; struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf); - printbuf_exit(&buf); - return ret; + return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); } static int validate_bset_keys(struct bch_fs *c, struct btree *b, @@ -915,19 +911,11 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { - printbuf_reset(&buf); - bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "invalid bkey: %s", buf.buf); + ret = bset_key_validate(c, b, u.s_c, updated_range, write); + if (ret == -BCH_ERR_fsck_delete_bkey) goto drop_this_key; - } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, @@ -1228,23 +1216,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - - if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || + ret = bch2_bkey_val_validate(c, u.s_c, READ); + if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { - printbuf_reset(&buf); - - prt_printf(&buf, "invalid bkey: "); - bch2_bkey_val_invalid(c, u.s_c, READ, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "%s", buf.buf); - btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -1253,6 +1228,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset_end(b, b->set); continue; } + if (ret) + goto fsck_err; if (u.k->type == KEY_TYPE_btree_ptr_v2) { struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); @@ -1954,18 +1931,14 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - struct printbuf buf = PRINTBUF; bool saw_error; - int ret; - ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), - BKEY_TYPE_btree, WRITE, &buf); - - if (ret) - bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); - printbuf_exit(&buf); - if (ret) + int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), + BKEY_TYPE_btree, WRITE); + if (ret) { + bch2_fs_inconsistent(c, "invalid btree node key before write"); return ret; + } ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 001107226377..b28c649c6838 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -530,7 +530,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); printbuf_exit(&buf); - BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), 
BKEY_TYPE_btree, 0, NULL)); + BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); if (ret) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index ac0c92683aad..1a1e9d2036da 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -818,50 +818,6 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, - enum bch_validate_flags flags, - struct btree_insert_entry *i, - struct printbuf *err) -{ - struct bch_fs *c = trans->c; - - printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - printbuf_indent_add(err, 2); - - bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); - prt_newline(err); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); - bch2_print_string_as_lines(KERN_ERR, err->buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - -static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans, - struct jset_entry *i) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn); - printbuf_indent_add(&buf, 2); - - bch2_journal_entry_to_text(&buf, c, i); - prt_newline(&buf); - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - static int bch2_trans_commit_journal_pin_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { @@ -1064,20 +1020,19 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; enum bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); return ret; + } + btree_insert_entry_checks(trans, i); } for (struct jset_entry *i = trans->journal_entries; @@ -1088,13 +1043,14 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - if (unlikely(bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags))) - ret = bch2_trans_commit_journal_entry_invalid(trans, i); - - if (ret) + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, invalid_flags); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); return ret; + } } if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e61f9695771e..b3454d4619e8 100644 --- 
a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1364,18 +1364,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) { - printbuf_reset(&buf); - prt_printf(&buf, "inserting invalid bkey\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_printf(&buf, "\n "); - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf); - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf); - - bch2_fs_inconsistent(c, "%s", buf.buf); + if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), + btree_node_type(b), BCH_VALIDATE_write) ?: + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) { + bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); dump_stack(); } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0087b8555ead..6a854c918496 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -250,10 +250,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * it's been hard to reproduce, so this should give us some more * information when it does occur: */ - struct printbuf err = PRINTBUF; - int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); - printbuf_exit(&err); - + int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), + BCH_VALIDATE_commit); if (invalid) { struct printbuf buf = PRINTBUF; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d743da89308e..32bfdf19289a 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -100,20 +100,19 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .is_visible = dirent_is_visible, }; -int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); int ret = 0; - bkey_fsck_err_on(!d_name.len, c, err, - dirent_empty_name, + bkey_fsck_err_on(!d_name.len, + c, dirent_empty_name, "empty name"); - bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err, - dirent_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), + c, dirent_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); @@ -121,27 +120,27 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) 
*/ - bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err, - dirent_name_too_long, + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, + c, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); - bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err, - dirent_name_embedded_nul, + bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), + c, dirent_name_embedded_nul, "dirent has stray data after name's NUL"); bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || - (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err, - dirent_name_dot_or_dotdot, + (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), + c, dirent_name_dot_or_dotdot, "invalid name"); - bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err, - dirent_name_has_slash, + bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), + c, dirent_name_has_slash, "name with /"); bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err, - dirent_to_itself, + le64_to_cpu(d.v->d_inum) == d.k->p.inode, + c, dirent_to_itself, "dirent points to own directory"); fsck_err: return ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 24037e6e0a09..8945145865c5 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -7,12 +7,11 @@ enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ - .key_invalid = bch2_dirent_invalid, \ + .key_validate = bch2_dirent_validate, \ .val_to_text = bch2_dirent_to_text, \ .min_val_size = 16, \ }) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 03a9de6c2e0a..f059cbffdf23 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -126,9 +126,8 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) -int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); @@ -144,18 +143,18 @@ int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, break; case BCH_DISK_ACCOUNTING_replicas: bkey_fsck_err_on(!acc_k.replicas.nr_devs, - c, err, accounting_key_replicas_nr_devs_0, + c, accounting_key_replicas_nr_devs_0, "accounting key replicas entry with nr_devs=0"); bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || (acc_k.replicas.nr_required > 1 && acc_k.replicas.nr_required == acc_k.replicas.nr_devs), - c, err, accounting_key_replicas_nr_required_bad, + c, accounting_key_replicas_nr_required_bad, "accounting key replicas entry with bad nr_required"); for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], - c, err, accounting_key_replicas_devs_unsorted, + c, accounting_key_replicas_devs_unsorted, "accounting key replicas entry with unsorted devs"); end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); @@ -178,7 
+177,7 @@ int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, } bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), - c, err, accounting_key_junk_at_end, + c, accounting_key_junk_at_end, "junk at end of accounting key"); fsck_err: return ret; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 3d3f25e08b69..b92f8c2e3054 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -82,14 +82,13 @@ int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); -int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_swab(struct bkey_s); #define bch2_bkey_ops_accounting ((struct bkey_ops) { \ - .key_invalid = bch2_accounting_invalid, \ + .key_validate = bch2_accounting_validate, \ .val_to_text = bch2_accounting_to_text, \ .swab = bch2_accounting_swab, \ .min_val_size = 8, \ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 84f1cbf6497f..141a4c63142f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -107,24 +107,23 @@ struct ec_bio { /* Stripes btree keys: */ -int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; int ret = 0; bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || - bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, - stripe_pos_bad, + bpos_gt(k.k->p, POS(0, U32_MAX)), + c, stripe_pos_bad, "stripe at bad pos"); - bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, - stripe_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), + c, stripe_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 84a23eeb6249..90962b3c0130 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -8,8 +8,7 @@ enum bch_validate_flags; -int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, @@ -17,7 +16,7 @@ int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ - .key_invalid = bch2_stripe_invalid, \ + .key_validate = bch2_stripe_validate, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_stripe, \ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a268af3e52bf..ab5a7adece10 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -166,6 +166,7 @@ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_delete_bkey) \ x(BCH_ERR_fsck, fsck_ignore) \ 
x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index a62b63108820..95afa7bf2020 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -416,6 +416,28 @@ int __bch2_fsck_err(struct bch_fs *c, return ret; } +int __bch2_bkey_fsck_err(struct bch_fs *c, + struct bkey_s_c k, + enum bch_fsck_flags flags, + enum bch_sb_error_id err, + const char *fmt, ...) +{ + struct printbuf buf = PRINTBUF; + va_list args; + + prt_str(&buf, "invalid bkey "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\n "); + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + prt_str(&buf, ": delete?"); + + int ret = __bch2_fsck_err(c, NULL, flags, err, "%s", buf.buf); + printbuf_exit(&buf); + return ret; +} + void bch2_flush_fsck_errs(struct bch_fs *c) { struct fsck_err_state *s, *n; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 995e6bba9bad..2f1b86978f36 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -4,6 +4,7 @@ #include #include +#include "bkey_types.h" #include "sb-errors.h" struct bch_dev; @@ -166,24 +167,30 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -__printf(4, 0) -static inline void bch2_bkey_fsck_err(struct bch_fs *c, - struct printbuf *err_msg, - enum bch_sb_error_id err_type, - const char *fmt, ...) -{ - va_list args; +__printf(5, 6) +int __bch2_bkey_fsck_err(struct bch_fs *, + struct bkey_s_c, + enum bch_fsck_flags, + enum bch_sb_error_id, + const char *, ...); - va_start(args, fmt); - prt_vprintf(err_msg, fmt, args); - va_end(args); -} - -#define bkey_fsck_err(c, _err_msg, _err_type, ...) \ +/* + * for now, bkey fsck errors are always handled by deleting the entire key - + * this will change at some point + */ +#define bkey_fsck_err(c, _err_type, _err_msg, ...) 
\ do { \ - prt_printf(_err_msg, __VA_ARGS__); \ - bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \ - ret = -BCH_ERR_invalid_bkey; \ + if ((flags & BCH_VALIDATE_silent)) { \ + ret = -BCH_ERR_fsck_delete_bkey; \ + goto fsck_err; \ + } \ + int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \ + BCH_FSCK_ERR_##_err_type, \ + _err_msg, ##__VA_ARGS__); \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) \ + ret = _ret; \ + ret = -BCH_ERR_fsck_delete_bkey; \ goto fsck_err; \ } while (0) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 07973198e35f..4419ad3e454e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -171,17 +171,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, - btree_ptr_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, + c, btree_ptr_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } @@ -192,28 +191,27 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, - c, err, btree_ptr_v2_val_too_big, + c, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), - c, err, btree_ptr_v2_min_key_bad, + c, btree_ptr_v2_min_key_bad, "min_key > key"); if (flags & BCH_VALIDATE_write) bkey_fsck_err_on(!bp.v->sectors_written, - c, err, btree_ptr_v2_written_0, + c, btree_ptr_v2_written_0, "sectors_written == 0"); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } @@ -399,15 +397,14 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0; - bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, - reservation_key_nr_replicas_invalid, + bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, + c, reservation_key_nr_replicas_invalid, "invalid nr_replicas (%u)", r.v->nr_replicas); fsck_err: return ret; @@ -1102,14 +1099,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } - -static int extent_ptr_invalid(struct bch_fs *c, - struct bkey_s_c k, - enum bch_validate_flags flags, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata, - struct printbuf *err) +static int extent_ptr_validate(struct bch_fs *c, + struct bkey_s_c k, + 
enum bch_validate_flags flags, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) { int ret = 0; @@ -1128,28 +1123,27 @@ static int extent_ptr_invalid(struct bch_fs *c, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, - ptr_to_duplicate_device, + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, + c, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); - bkey_fsck_err_on(bucket >= nbuckets, c, err, - ptr_after_last_bucket, + bkey_fsck_err_on(bucket >= nbuckets, + c, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, nbuckets); - bkey_fsck_err_on(bucket < first_bucket, c, err, - ptr_before_first_bucket, + bkey_fsck_err_on(bucket < first_bucket, + c, ptr_before_first_bucket, "pointer before first bucket (%llu < %u)", bucket, first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err, - ptr_spans_multiple_buckets, + bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, + c, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, bucket_size); fsck_err: return ret; } -int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1164,25 +1158,24 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, - extent_ptrs_invalid_entry, - "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, + c, extent_ptrs_invalid_entry, + "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry), c, err, - btree_ptr_has_non_ptr, + !extent_entry_is_ptr(entry), + c, btree_ptr_has_non_ptr, "has non ptr field"); switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_invalid(c, k, flags, &entry->ptr, - size_ondisk, false, err); + ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false); if (ret) return ret; - bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, - ptr_cached_and_erasure_coded, + bkey_fsck_err_on(entry->ptr.cached && have_ec, + c, ptr_cached_and_erasure_coded, "cached, erasure coded ptr"); if (!entry->ptr.unwritten) @@ -1199,44 +1192,50 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, - ptr_crc_uncompressed_size_too_small, + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, + c, ptr_crc_uncompressed_size_too_small, "checksum offset + key size > uncompressed size"); - bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, - ptr_crc_csum_type_unknown, + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), + c, ptr_crc_csum_type_unknown, "invalid checksum type"); - bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, - 
ptr_crc_compression_type_unknown, + bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, + c, ptr_crc_compression_type_unknown, "invalid compression type"); if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) - bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, + bkey_fsck_err(c, ptr_crc_nonce_mismatch, "incorrect nonce"); } - bkey_fsck_err_on(crc_since_last_ptr, c, err, - ptr_crc_redundant, + bkey_fsck_err_on(crc_since_last_ptr, + c, ptr_crc_redundant, "redundant crc entry"); crc_since_last_ptr = true; bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err, - ptr_crc_uncompressed_size_too_big, + (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), + c, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: - bkey_fsck_err_on(have_ec, c, err, - ptr_stripe_redundant, + bkey_fsck_err_on(have_ec, + c, ptr_stripe_redundant, "redundant stripe entry"); have_ec = true; break; case BCH_EXTENT_ENTRY_rebalance: { + /* + * this shouldn't be a fsck error, for forward + * compatibility; the rebalance code should just refetch + * the compression opt if it's unknown + */ +#if 0 const struct bch_extent_rebalance *r = &entry->rebalance; if (!bch2_compression_opt_valid(r->compression)) { @@ -1245,28 +1244,29 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, opt.type, opt.level); return -BCH_ERR_invalid_bkey; } +#endif break; } } } - bkey_fsck_err_on(!nr_ptrs, c, err, - extent_ptrs_no_ptrs, + bkey_fsck_err_on(!nr_ptrs, + c, extent_ptrs_no_ptrs, "no ptrs"); - bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, - extent_ptrs_too_many_ptrs, + bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, + c, extent_ptrs_too_many_ptrs, "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); - bkey_fsck_err_on(have_written && have_unwritten, c, err, - extent_ptrs_written_and_unwritten, + bkey_fsck_err_on(have_written && have_unwritten, + c, extent_ptrs_written_and_unwritten, "extent with unwritten and written ptrs"); - bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, - extent_ptrs_unwritten, + bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, + c, extent_ptrs_unwritten, "has unwritten ptrs"); - bkey_fsck_err_on(crc_since_last_ptr, c, err, - extent_ptrs_redundant_crc, + bkey_fsck_err_on(crc_since_last_ptr, + c, extent_ptrs_redundant_crc, "redundant crc entry"); - bkey_fsck_err_on(have_ec, c, err, - extent_ptrs_redundant_stripe, + bkey_fsck_err_on(have_ec, + c, extent_ptrs_redundant_stripe, "redundant stripe entry"); fsck_err: return ret; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index facdb8a86eec..1a6ddee48041 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -409,26 +409,26 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void 
bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); #define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ + .key_validate = bch2_btree_ptr_validate, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_extent, \ }) #define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_v2_invalid, \ + .key_validate = bch2_btree_ptr_v2_validate, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ @@ -441,7 +441,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent ((struct bkey_ops) { \ - .key_invalid = bch2_bkey_ptrs_invalid, \ + .key_validate = bch2_bkey_ptrs_validate, \ .val_to_text = bch2_bkey_ptrs_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ @@ -451,13 +451,13 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reservation ((struct bkey_ops) { \ - .key_invalid = bch2_reservation_invalid, \ + .key_validate = bch2_reservation_validate, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ .trigger = bch2_trigger_reservation, \ @@ -683,8 +683,8 @@ bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 1e20020eadd1..2be6be33afa3 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -434,100 +434,98 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) return &inode_p->inode.k_i; } -static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bch_inode_unpacked unpacked; int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); - bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err, - inode_pos_blockdev_range, + bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, + c, inode_pos_blockdev_range, "fs inode in blockdev range"); - bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err, - inode_unpack_error, + bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), + c, inode_unpack_error, "invalid variable length fields"); - bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err, - inode_checksum_type_invalid, + 
bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, + c, inode_checksum_type_invalid, "invalid data checksum type (%u >= %u", unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); bkey_fsck_err_on(unpacked.bi_compression && - !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err, - inode_compression_type_invalid, + !bch2_compression_opt_valid(unpacked.bi_compression - 1), + c, inode_compression_type_invalid, "invalid compression opt %u", unpacked.bi_compression - 1); bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && - unpacked.bi_nlink != 0, c, err, - inode_unlinked_but_nlink_nonzero, + unpacked.bi_nlink != 0, + c, inode_unlinked_but_nlink_nonzero, "flagged as unlinked but bi_nlink != 0"); - bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err, - inode_subvol_root_but_not_dir, + bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), + c, inode_subvol_root_but_not_dir, "subvolume root but not a directory"); fsck_err: return ret; } -int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; - bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } -int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); int ret = 0; - bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } -int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); int ret = 0; bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err, - inode_v3_fields_start_bad, + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), + c, inode_v3_fields_start_bad, "invalid fields_start (got %llu, min %u max %zu)", INODEv3_FIELDS_START(inode.v), INODEv3_FIELDS_START_INITIAL, bkey_val_u64s(inode.k)); - bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } @@ -625,14 +623,13 @@ int bch2_trigger_inode(struct btree_trans *trans, return 0; } -int bch2_inode_generation_invalid(struct 
bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); fsck_err: return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index da0e4a745099..f1fcb4c58039 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -9,12 +9,12 @@ enum bch_validate_flags; extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); +int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); +int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, @@ -22,21 +22,21 @@ int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ - .key_invalid = bch2_inode_invalid, \ + .key_validate = bch2_inode_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v2_invalid, \ + .key_validate = bch2_inode_v2_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v3_invalid, \ + .key_validate = bch2_inode_v3_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 48, \ @@ -49,12 +49,12 @@ static inline bool bkey_is_inode(const struct bkey *k) k->type == KEY_TYPE_inode_v3; } -int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ - .key_invalid = bch2_inode_generation_invalid, \ + .key_validate = bch2_inode_generation_validate, \ .val_to_text = bch2_inode_generation_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7a833a3f1c63..7664b68e6a15 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -332,7 +332,6 @@ static int journal_validate_key(struct bch_fs *c, { int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); - struct printbuf buf = PRINTBUF; int ret = 0; if (journal_entry_err_on(!k->k.u64s, @@ -368,34 +367,21 @@ static int journal_validate_key(struct bch_fs *c, bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf)) { - printbuf_reset(&buf); - journal_entry_err_msg(&buf, version, jset, entry); 
- prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - prt_newline(&buf); - bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf); - - mustfix_fsck_err(c, journal_entry_bkey_invalid, - "%s", buf.buf); - + ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write); + if (ret == -BCH_ERR_fsck_delete_bkey) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - - printbuf_exit(&buf); return FSCK_DELETED_KEY; } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: - printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 83b1586cb371..96f2f4f8c397 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -10,14 +10,13 @@ #include "recovery.h" /* KEY_TYPE_lru is obsolete: */ -int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err, - lru_entry_at_time_0, + bkey_fsck_err_on(!lru_pos_time(k.k->p), + c, lru_entry_at_time_0, "lru entry at time=0"); fsck_err: return ret; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 5bd8974a7f11..e6a7d8241bb8 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -33,14 +33,13 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); #define bch2_bkey_ops_lru ((struct bkey_ops) { \ - .key_invalid = bch2_lru_invalid, \ + .key_validate = bch2_lru_validate, \ .val_to_text = bch2_lru_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index a0cca8b70e0a..c32a05e252e2 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -59,13 +59,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { .to_text = bch2_sb_quota_to_text, }; -int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err, - quota_type_invalid, + bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, + c, quota_type_invalid, "invalid quota type (%llu >= %u)", k.k->p.inode, QTYP_NR); fsck_err: diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 02d37a332218..a62abcc5332a 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -8,12 +8,11 @@ enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ - .key_invalid = bch2_quota_invalid, \ + .key_validate = bch2_quota_validate, \ .val_to_text = 
bch2_quota_to_text, \ .min_val_size = 32, \ }) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 5f92715e1525..e59c0abb4772 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -29,15 +29,14 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ -int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), - c, err, reflink_p_front_pad_bad, + c, reflink_p_front_pad_bad, "idx < front_pad (%llu < %u)", le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); fsck_err: @@ -256,11 +255,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, /* indirect extents */ -int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { - return bch2_bkey_ptrs_invalid(c, k, flags, err); + return bch2_bkey_ptrs_validate(c, k, flags); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -311,9 +309,8 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, /* indirect inline data */ -int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index e894f3a2c67a..51afe11d8ed6 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -4,41 +4,37 @@ enum bch_validate_flags; -int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_p_invalid, \ + .key_validate = bch2_reflink_p_validate, \ .val_to_text = bch2_reflink_p_to_text, \ .key_merge = bch2_reflink_p_merge, \ .trigger = bch2_trigger_reflink_p, \ .min_val_size = 16, \ }) -int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_v_invalid, \ + .key_validate = bch2_reflink_v_validate, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, \ }) -int 
bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, @@ -47,7 +43,7 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ - .key_invalid = bch2_indirect_inline_data_invalid, \ + .key_validate = bch2_indirect_inline_data_validate, \ .val_to_text = bch2_indirect_inline_data_to_text, \ .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 96744b1a76f5..8b18a9b483a4 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -31,15 +31,14 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(t.v->root_snapshot)); } -int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_tree_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_tree_pos_bad, "bad pos"); fsck_err: return ret; @@ -225,55 +224,54 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->skip[2])); } -int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_snapshot s; u32 i, id; int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_pos_bad, "bad pos"); s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); - bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, - snapshot_parent_bad, + bkey_fsck_err_on(id && id <= k.k->p.offset, + c, snapshot_parent_bad, "bad parent node (%u <= %llu)", id, k.k->p.offset); - bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, - snapshot_children_not_normalized, + bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), + c, snapshot_children_not_normalized, "children not normalized"); - bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, - snapshot_child_duplicate, + bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], + c, snapshot_child_duplicate, "duplicate child nodes"); for (i = 0; i < 2; i++) { id = le32_to_cpu(s.v->children[i]); - bkey_fsck_err_on(id >= k.k->p.offset, c, err, - snapshot_child_bad, + bkey_fsck_err_on(id >= k.k->p.offset, + c, snapshot_child_bad, "bad child node (%u >= %llu)", id, k.k->p.offset); } if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err, - snapshot_skiplist_not_normalized, + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), + c, snapshot_skiplist_not_normalized, "skiplist not normalized"); for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { id = 
le32_to_cpu(s.v->skip[i]); - bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, - snapshot_skiplist_bad, + bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), + c, snapshot_skiplist_bad, "bad skiplist node %u", id); } } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 31b0ee03e962..eb5ef64221d6 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -5,11 +5,11 @@ enum bch_validate_flags; void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_tree_invalid, \ + .key_validate = bch2_snapshot_tree_validate, \ .val_to_text = bch2_snapshot_tree_to_text, \ .min_val_size = 8, \ }) @@ -19,14 +19,13 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_invalid, \ + .key_validate = bch2_snapshot_validate, \ .val_to_text = bch2_snapshot_to_text, \ .trigger = bch2_mark_snapshot, \ .min_val_size = 24, \ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index f56720b55862..dbe834cb349f 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -207,23 +207,23 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ -int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); int ret = 0; bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err, - subvol_pos_bad, + bkey_gt(k.k->p, SUBVOL_POS_MAX), + c, subvol_pos_bad, "invalid pos"); - bkey_fsck_err_on(!subvol.v->snapshot, c, err, - subvol_snapshot_bad, + bkey_fsck_err_on(!subvol.v->snapshot, + c, subvol_snapshot_bad, "invalid snapshot"); - bkey_fsck_err_on(!subvol.v->inode, c, err, - subvol_inode_bad, + bkey_fsck_err_on(!subvol.v->inode, + c, subvol_inode_bad, "invalid inode"); fsck_err: return ret; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index afa5e871efb2..a8299ba2cab2 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -10,15 +10,14 @@ enum bch_validate_flags; int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); -int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum 
btree_iter_update_trigger_flags); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ - .key_invalid = bch2_subvolume_invalid, \ + .key_validate = bch2_subvolume_validate, \ .val_to_text = bch2_subvolume_to_text, \ .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index c11bf6dacc2c..f2b4c17a0307 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -70,17 +70,16 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len)); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err, - xattr_val_size_too_small, + bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, + c, xattr_val_size_too_small, "value too small (%zu < %u)", bkey_val_u64s(k.k), val_u64s); @@ -88,17 +87,17 @@ int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len) + 4); - bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err, - xattr_val_size_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, + c, xattr_val_size_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), val_u64s); - bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err, - xattr_invalid_type, + bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), + c, xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err, - xattr_name_invalid_chars, + bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + c, xattr_name_invalid_chars, "xattr name has invalid characters"); fsck_err: return ret; diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 1574b9eb4c85..c188a5ad64ce 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,12 +6,11 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ - .key_invalid = bch2_xattr_invalid, \ + .key_validate = bch2_xattr_validate, \ .val_to_text = bch2_xattr_to_text, \ .min_val_size = 8, \ }) From 5132b99bb62664c02bd6c0dd62ad3fedc75294d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 02:35:10 -0400 Subject: [PATCH 0862/1052] bcachefs: Kill __bch2_accounting_mem_mod() The next patch will be adding a disk accounting counter type which is not kept in the in-memory eytzinger tree. As prep, fold __bch2_accounting_mem_mod() into bch2_accounting_mem_mod_locked() so that we can check for that counter type and bail out without calling bpos_to_disk_accounting_pos() twice. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 4 +- fs/bcachefs/disk_accounting.c | 4 +- fs/bcachefs/disk_accounting.h | 68 +++++++++++++++----------------- 3 files changed, 35 insertions(+), 41 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 1a1e9d2036da..a0101d9c5d83 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -712,7 +712,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, a->k.version = journal_pos_to_bversion(&trans->journal_res, (u64 *) entry - (u64 *) trans->journal_entries); BUG_ON(bversion_zero(a->k.version)); - ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false); + ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false); if (ret) goto revert_fs_usage; } @@ -798,7 +798,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, false); + bch2_accounting_mem_mod_locked(trans, a.c, false, false); bch2_accounting_neg(a); } percpu_up_read(&c->mark_lock); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index f059cbffdf23..e57b40623cd9 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -566,7 +566,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false); + bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -595,7 +595,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) return 0; percpu_down_read(&c->mark_lock); - int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k), false); + int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true); percpu_up_read(&c->mark_lock); if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index b92f8c2e3054..653090667aaa 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -106,8 +106,37 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); void bch2_accounting_mem_gc(struct bch_fs *); -static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +/* + * Update in memory counters so they match the btree update we're doing; called + * from transaction commit path + */ +static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read) { + struct bch_fs *c = trans->c; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + + if (!gc && !read) { + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, 
acc_k.dev_data_type.dev); + if (ca) { + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); + } + rcu_read_unlock(); + break; + } + } + struct bch_accounting_mem *acc = &c->accounting; unsigned idx; @@ -129,45 +158,10 @@ static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_ac return 0; } -/* - * Update in memory counters so they match the btree update we're doing; called - * from transaction commit path - */ -static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) -{ - struct bch_fs *c = trans->c; - - if (!gc) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); - if (ca) { - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); - } - rcu_read_unlock(); - break; - } - } - - return __bch2_accounting_mem_mod(c, a, gc); -} - static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, gc); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false); percpu_up_read(&trans->c->mark_lock); return ret; } From 58474f76a770bcc79d4b2d7232e4d6650e732b50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 02:27:36 -0400 Subject: [PATCH 0863/1052] bcachefs: bcachefs_metadata_version_disk_accounting_inum This adds another disk accounting counter to track usage per inode number (any snapshot ID). This will be used for a couple things: - It'll give us a way to tell the user how much space a given file is consuming in all snapshots; i.e. how much extra space it's consuming due to snapshot versioning. - It counts number of extents and total size of extents (both in btree keyspace sectors and actual disk usage), meaning it gives us average extent size: that is, it'll let us cheaply find fragmented files that should be defragmented. 
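To illustrate what the new counter enables: the extent trigger accumulates three values per inode number, and a consumer could derive an average extent size from them along the lines below. The variable names are placeholders, not part of the patch.

	u64 nr_extents   = d[0];	/* +1 / -1 per extent inserted or overwritten */
	u64 key_sectors  = d[1];	/* +/- k.k->size, i.e. btree keyspace sectors */
	u64 disk_sectors = d[2];	/* replicas-adjusted on-disk sectors          */
	u64 avg_extent   = nr_extents ? div64_u64(key_sectors, nr_extents) : 0;
	/* a small average extent size flags a file worth defragmenting */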
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/buckets.c | 14 ++++++++++++++ fs/bcachefs/disk_accounting.c | 3 +++ fs/bcachefs/disk_accounting.h | 3 +++ fs/bcachefs/disk_accounting_format.h | 8 +++++++- fs/bcachefs/sb-downgrade.c | 5 ++++- 6 files changed, 33 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b25f86356728..c75f2e0f32bb 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -676,7 +676,8 @@ struct bch_sb_field_ext { x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ x(disk_accounting_v2, BCH_VERSION(1, 9)) \ - x(disk_accounting_v3, BCH_VERSION(1, 10)) + x(disk_accounting_v3, BCH_VERSION(1, 10)) \ + x(disk_accounting_inum, BCH_VERSION(1, 11)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9f7004e941ce..b69ef4b3de6e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -810,6 +810,20 @@ static int __trigger_extent(struct btree_trans *trans, ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); if (ret) return ret; + } else { + bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct disk_accounting_pos acc_inum_key = { + .type = BCH_DISK_ACCOUNTING_inum, + .inum.inum = k.k->p.inode, + }; + s64 v[3] = { + insert ? 1 : -1, + insert ? k.k->size : -((s64) k.k->size), + replicas_sectors, + }; + ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + if (ret) + return ret; } if (bch2_bkey_rebalance_opts(k)) { diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index e57b40623cd9..e972e2bca546 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -768,6 +768,9 @@ void bch2_verify_accounting_clean(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) continue; + if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + continue; + bch2_accounting_mem_read(c, k.k->p, v, nr); if (memcmp(a.v->d, v, nr * sizeof(u64))) { diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 653090667aaa..f29fd0dd9581 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -116,6 +116,9 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, a.k->p); + if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + return 0; + if (!gc && !read) { switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index a93cf26ff4a9..7b6e6c97e6aa 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -103,7 +103,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(compression, 4) \ x(snapshot, 5) \ x(btree, 6) \ - x(rebalance_work, 7) + x(rebalance_work, 7) \ + x(inum, 8) enum disk_accounting_type { #define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, @@ -136,6 +137,10 @@ struct bch_acct_btree { __u32 id; } __packed; +struct bch_acct_inum { + __u64 inum; +} __packed; + struct bch_acct_rebalance_work { }; @@ -152,6 +157,7 @@ struct disk_accounting_pos { struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; + struct bch_acct_inum inum; } __packed; } __packed; struct bpos _pad; diff --git a/fs/bcachefs/sb-downgrade.c 
b/fs/bcachefs/sb-downgrade.c index 9f82d497d9e0..650a1f77ca40 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -72,7 +72,10 @@ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(disk_accounting_inum, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ From 63de936b513f7a9ce559194d3269ac291f4f4662 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 21 Jul 2024 17:38:40 +0200 Subject: [PATCH 0864/1052] media: atomisp: Fix streaming no longer working on BYT / ISP2400 devices Commit a0821ca14bb8 ("media: atomisp: Remove test pattern generator (TPG) support") broke BYT support because it removed a seemingly unused field from struct sh_css_sp_config and a seemingly unused value from enum ia_css_input_mode. But these are part of the ABI between the kernel and firmware on ISP2400 and this part of the TPG support removal changes broke ISP2400 support. ISP2401 support was not affected because on ISP2401 only a part of struct sh_css_sp_config is used. Restore the removed field and enum value to fix this. Fixes: a0821ca14bb8 ("media: atomisp: Remove test pattern generator (TPG) support") Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Signed-off-by: Hans Verkuil --- .../media/atomisp/pci/ia_css_stream_public.h | 8 ++++++-- .../media/atomisp/pci/sh_css_internal.h | 19 ++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/ia_css_stream_public.h b/drivers/staging/media/atomisp/pci/ia_css_stream_public.h index 961c61288083..aad860e54d3a 100644 --- a/drivers/staging/media/atomisp/pci/ia_css_stream_public.h +++ b/drivers/staging/media/atomisp/pci/ia_css_stream_public.h @@ -27,12 +27,16 @@ #include "ia_css_prbs.h" #include "ia_css_input_port.h" -/* Input modes, these enumerate all supported input modes. - * Note that not all ISP modes support all input modes. +/* + * Input modes, these enumerate all supported input modes. + * This enum is part of the atomisp firmware ABI and must + * NOT be changed! + * Note that not all ISP modes support all input modes. */ enum ia_css_input_mode { IA_CSS_INPUT_MODE_SENSOR, /** data from sensor */ IA_CSS_INPUT_MODE_FIFO, /** data from input-fifo */ + IA_CSS_INPUT_MODE_TPG, /** data from test-pattern generator */ IA_CSS_INPUT_MODE_PRBS, /** data from pseudo-random bit stream */ IA_CSS_INPUT_MODE_MEMORY, /** data from a frame in memory */ IA_CSS_INPUT_MODE_BUFFERED_SENSOR /** data is sent through mipi buffer */ diff --git a/drivers/staging/media/atomisp/pci/sh_css_internal.h b/drivers/staging/media/atomisp/pci/sh_css_internal.h index a2d972ea3fa0..959e7f549641 100644 --- a/drivers/staging/media/atomisp/pci/sh_css_internal.h +++ b/drivers/staging/media/atomisp/pci/sh_css_internal.h @@ -344,7 +344,14 @@ struct sh_css_sp_input_formatter_set { #define IA_CSS_MIPI_SIZE_CHECK_MAX_NOF_ENTRIES_PER_PORT (3) -/* SP configuration information */ +/* + * SP configuration information + * + * This struct is part of the atomisp firmware ABI and is directly copied + * to ISP DRAM by sh_css_store_sp_group_to_ddr() + * + * Do NOT change this struct's layout or remove seemingly unused fields! 
+ */ struct sh_css_sp_config { u8 no_isp_sync; /* Signal host immediately after start */ u8 enable_raw_pool_locking; /** Enable Raw Buffer Locking for HALv3 Support */ @@ -354,6 +361,10 @@ struct sh_css_sp_config { host (true) or when they are passed to the preview/video pipe (false). */ + /* + * Note the fields below are only used on the ISP2400 not on the ISP2401, + * sh_css_store_sp_group_to_ddr() skip copying these when run on the ISP2401. + */ struct { u8 a_changed; u8 b_changed; @@ -363,11 +374,13 @@ struct sh_css_sp_config { } input_formatter; sync_generator_cfg_t sync_gen; + tpg_cfg_t tpg; prbs_cfg_t prbs; input_system_cfg_t input_circuit; u8 input_circuit_cfg_changed; - u32 mipi_sizes_for_check[N_CSI_PORTS][IA_CSS_MIPI_SIZE_CHECK_MAX_NOF_ENTRIES_PER_PORT]; - u8 enable_isys_event_queue; + u32 mipi_sizes_for_check[N_CSI_PORTS][IA_CSS_MIPI_SIZE_CHECK_MAX_NOF_ENTRIES_PER_PORT]; + /* These last 2 fields are used on both the ISP2400 and the ISP2401 */ + u8 enable_isys_event_queue; u8 disable_cont_vf; }; From a2cbb1603943281a604f5adc48079a148db5cb0d Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 8 Aug 2024 16:06:40 -0700 Subject: [PATCH 0865/1052] tcp: Update window clamping condition This patch is based on the discussions between Neal Cardwell and Eric Dumazet in the link https://lore.kernel.org/netdev/20240726204105.1466841-1-quic_subashab@quicinc.com/ It was correctly pointed out that tp->window_clamp would not be updated in cases where net.ipv4.tcp_moderate_rcvbuf=0 or if (copied <= tp->rcvq_space.space). While it is expected for most setups to leave the sysctl enabled, the latter condition may not end up hitting depending on the TCP receive queue size and the pattern of arriving data. The updated check should be hit only on initial MSS update from TCP_MIN_MSS to measured MSS value and subsequently if there was an update to a larger value. Fixes: 05f76b2d634e ("tcp: Adjust clamping window for applications specifying SO_RCVBUF") Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e2b9583ed96a..e37488d3453f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -238,9 +238,14 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ if (unlikely(len != icsk->icsk_ack.rcv_mss)) { u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; + u8 old_ratio = tcp_sk(sk)->scaling_ratio; do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? 
val : 1; + + if (old_ratio != tcp_sk(sk)->scaling_ratio) + WRITE_ONCE(tcp_sk(sk)->window_clamp, + tcp_win_from_space(sk, sk->sk_rcvbuf)); } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); @@ -754,7 +759,8 @@ void tcp_rcv_space_adjust(struct sock *sk) * */ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)) { + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { u64 rcvwin, grow; int rcvbuf; @@ -770,22 +776,12 @@ void tcp_rcv_space_adjust(struct sock *sk) rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - if (rcvbuf > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - /* Make the window clamp follow along. */ - WRITE_ONCE(tp->window_clamp, - tcp_win_from_space(sk, rcvbuf)); - } - } else { - /* Make the window clamp follow along while being bounded - * by SO_RCVBUF. - */ - int clamp = tcp_win_from_space(sk, min(rcvbuf, sk->sk_rcvbuf)); - - if (clamp > tp->window_clamp) - WRITE_ONCE(tp->window_clamp, clamp); + /* Make the window clamp follow along. */ + WRITE_ONCE(tp->window_clamp, + tcp_win_from_space(sk, rcvbuf)); } } tp->rcvq_space.space = copied; From c286f204ce6ba7b48e3dcba53eda7df8eaa64dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Jos=C3=A9=20Arboleda?= Date: Tue, 13 Aug 2024 11:10:53 -0500 Subject: [PATCH 0866/1052] ALSA: usb-audio: Support Yamaha P-125 quirk entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a USB quirk for the Yamaha P-125 digital piano. Signed-off-by: Juan José Arboleda Cc: Link: https://patch.msgid.link/20240813161053.70256-1-soyjuanarbol@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/quirks-table.h | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index f13a8d63a019..aaa6a515d0f8 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -273,6 +273,7 @@ YAMAHA_DEVICE(0x105a, NULL), YAMAHA_DEVICE(0x105b, NULL), YAMAHA_DEVICE(0x105c, NULL), YAMAHA_DEVICE(0x105d, NULL), +YAMAHA_DEVICE(0x1718, "P-125"), { USB_DEVICE(0x0499, 0x1503), .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { From fa0db8e568787c665384430eaf2221b299b85367 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 13 Aug 2024 15:19:01 +0200 Subject: [PATCH 0867/1052] Revert "ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error" This reverts commit 28ab9769117ca944cb6eb537af5599aa436287a4. Sense data can be in either fixed format or descriptor format. SAT-6 revision 1, "10.4.6 Control mode page", defines the D_SENSE bit: "The SATL shall support this bit as defined in SPC-5 with the following exception: if the D_ SENSE bit is set to zero (i.e., fixed format sense data), then the SATL should return fixed format sense data for ATA PASS-THROUGH commands." The libata SATL has always kept D_SENSE set to zero by default. (It is however possible to change the value using a MODE SELECT SG_IO command.) Failed ATA PASS-THROUGH commands correctly respected the D_SENSE bit, however, successful ATA PASS-THROUGH commands incorrectly returned the sense data in descriptor format (regardless of the D_SENSE bit). Commit 28ab9769117c ("ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error") fixed this bug for successful ATA PASS-THROUGH commands. 
However, after commit 28ab9769117c ("ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error"), there were bug reports that hdparm, hddtemp, and udisks were no longer working as expected. These applications incorrectly assume the returned sense data is in descriptor format, without even looking at the RESPONSE CODE field in the returned sense data (to see which format the returned sense data is in). Considering that there will be broken versions of these applications around roughly forever, we are stuck with being bug compatible with older kernels. Cc: stable@vger.kernel.org # 4.19+ Reported-by: Stephan Eisvogel Reported-by: Christian Heusel Closes: https://lore.kernel.org/linux-ide/0bf3f2f0-0fc6-4ba5-a420-c0874ef82d64@heusel.eu/ Fixes: 28ab9769117c ("ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error") Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240813131900.1285842-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-scsi.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index d6f5e25e1ed8..473e00a58a8b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -951,8 +951,19 @@ static void ata_gen_passthru_sense(struct ata_queued_cmd *qc) &sense_key, &asc, &ascq); ata_scsi_set_sense(qc->dev, cmd, sense_key, asc, ascq); } else { - /* ATA PASS-THROUGH INFORMATION AVAILABLE */ - ata_scsi_set_sense(qc->dev, cmd, RECOVERED_ERROR, 0, 0x1D); + /* + * ATA PASS-THROUGH INFORMATION AVAILABLE + * + * Note: we are supposed to call ata_scsi_set_sense(), which + * respects the D_SENSE bit, instead of unconditionally + * generating the sense data in descriptor format. However, + * because hdparm, hddtemp, and udisks incorrectly assume sense + * data in descriptor format, without even looking at the + * RESPONSE CODE field in the returned sense data (to see which + * format the returned sense data is in), we are stuck with + * being bug compatible with older kernels. + */ + scsi_build_sense(cmd, 1, RECOVERED_ERROR, 0, 0x1D); } } From 829e2a23121fb36ee30ea5145c2a85199f68e2c8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 14 Aug 2024 12:04:59 +0200 Subject: [PATCH 0868/1052] ALSA: hda/tas2781: Use correct endian conversion The data conversion is done rather by a wrong function. We convert to BE32, not from BE32. Although the end result must be same, this was complained by the compiler. Fix the code again and align with another similar function tas2563_apply_calib() that does already right. 
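To illustrate the direction of each conversion (buf and off below are placeholders, not the driver's variables):

	u32 cal     = *(u32 *)&buf[off];	/* calibration word, already in CPU byte order */
	__be32 wire = cpu_to_be32(cal);		/* convert TO big-endian for the bulk write    */
	/*
	 * get_unaligned_be32(&buf[off]) instead treats the buffer as big-endian
	 * and converts FROM big-endian to CPU order. The stored bytes come out
	 * the same either way, but the conversion direction (and annotation)
	 * only matches the intent with cpu_to_be32().
	 */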
Fixes: 3beddef84d90 ("ALSA: hda/tas2781: fix wrong calibrated data order") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408141630.DiDUB8Z4-lkp@intel.com/ Link: https://patch.msgid.link/20240814100500.1944-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 7dbfc92d9d55..89d8235537cd 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -527,8 +527,8 @@ static void tas2781_apply_calib(struct tasdevice_priv *tas_priv) for (i = 0; i < tas_priv->ndev; i++) { for (j = 0; j < CALIB_MAX; j++) { - data = get_unaligned_be32( - &tas_priv->cali_data.data[offset]); + data = cpu_to_be32( + *(uint32_t *)&tas_priv->cali_data.data[offset]); rc = tasdevice_dev_bulk_write(tas_priv, i, TASDEVICE_REG(0, page_array[j], rgno_array[j]), (unsigned char *)&data, 4); From 73c34b0b85d46bf9c2c0b367aeaffa1e2481b136 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jul 2024 13:44:33 -0700 Subject: [PATCH 0869/1052] xfs: attr forks require attr, not attr2 It turns out that I misunderstood the difference between the attr and attr2 feature bits. "attr" means that at some point an attr fork was created somewhere in the filesystem. "attr2" means that inodes have variable-sized forks, but says nothing about whether or not there actually /are/ attr forks in the system. If we have an attr fork, we only need to check that attr is set. Fixes: 99d9d8d05da26 ("xfs: scrub inode block mappings") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/bmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 24a15bf784f1..5ab2ac53c920 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -938,7 +938,13 @@ xchk_bmap( } break; case XFS_ATTR_FORK: - if (!xfs_has_attr(mp) && !xfs_has_attr2(mp)) + /* + * "attr" means that an attr fork was created at some point in + * the life of this filesystem. "attr2" means that inodes have + * variable-sized data/attr fork areas. Hence we only check + * attr here. + */ + if (!xfs_has_attr(mp)) xchk_ino_set_corrupt(sc, sc->ip->i_ino); break; default: From 04d6dbb55301a29aeb9a197e6b0012cdc265f1e4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 4 Aug 2024 14:39:34 -0700 Subject: [PATCH 0870/1052] xfs: revert AIL TASK_KILLABLE threshold In commit 9adf40249e6c, we changed the behavior of the AIL thread to set its own task state to KILLABLE whenever the timeout value is nonzero. Unfortunately, this missed the fact that xfsaild_push will return 50ms (aka a longish sleep) when we reach the push target or the AIL becomes empty, so xfsaild goes to sleep for a long period of time in uninterruptible D state. This results in artificially high load averages because KILLABLE processes are UNINTERRUPTIBLE, which contributes to load average even though the AIL is asleep waiting for someone to interrupt it. It's not blocked on IOs or anything, but people scrap ps for processes that look like they're stuck in D state, so restore the previous threshold. Fixes: 9adf40249e6c ("xfs: AIL doesn't need manual pushing") Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_trans_ail.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 0fafcc9f3dbe..8ede9d099d1f 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -644,7 +644,12 @@ xfsaild( set_freezable(); while (1) { - if (tout) + /* + * Long waits of 50ms or more occur when we've run out of items + * to push, so we only want uninterruptible state if we're + * actually blocked on something. + */ + if (tout && tout <= 20) set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); From 8d16762047c627073955b7ed171a36addaf7b1ff Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 4 Aug 2024 14:39:57 -0700 Subject: [PATCH 0871/1052] xfs: conditionally allow FS_XFLAG_REALTIME changes if S_DAX is set If a file has the S_DAX flag (aka fsdax access mode) set, we cannot allow users to change the realtime flag unless the datadev and rtdev both support fsdax access modes. Even if there are no extents allocated to the file, the setattr thread could be racing with another thread that has already started down the write code paths. Fixes: ba23cba9b3bdc ("fs: allow per-device dax status checking for filesystems") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_ioctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4e933db75b12..6b13666d4e96 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -483,6 +483,17 @@ xfs_ioctl_setattr_xflags( /* Can't change realtime flag if any extents are allocated. */ if (ip->i_df.if_nextents || ip->i_delayed_blks) return -EINVAL; + + /* + * If S_DAX is enabled on this file, we can only switch the + * device if both support fsdax. We can't update S_DAX because + * there might be other threads walking down the access paths. + */ + if (IS_DAX(VFS_I(ip)) && + (mp->m_ddev_targp->bt_daxdev == NULL || + (mp->m_rtdev_targp && + mp->m_rtdev_targp->bt_daxdev == NULL))) + return -EINVAL; } if (rtflag) { From 66155de93bcf4f2967e602a4b3bf7ebe58f34b11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:02:58 -0700 Subject: [PATCH 0872/1052] KVM: x86: Disallow read-only memslots for SEV-ES and SEV-SNP (and TDX) Disallow read-only memslots for SEV-{ES,SNP} VM types, as KVM can't directly emulate instructions for ES/SNP, and instead the guest must explicitly request emulation. Unless the guest explicitly requests emulation without accessing memory, ES/SNP relies on KVM creating an MMIO SPTE, with the subsequent #NPF being reflected into the guest as a #VC. But for read-only memslots, KVM deliberately doesn't create MMIO SPTEs, because except for ES/SNP, doing so requires setting reserved bits in the SPTE, i.e. the SPTE can't be readable while also generating a #VC on writes. Because KVM never creates MMIO SPTEs and jumps directly to emulation, the guest never gets a #VC. And since KVM simply resumes the guest if ES/SNP guests trigger emulation, KVM effectively puts the vCPU into an infinite #NPF loop if the vCPU attempts to write read-only memory. Disallow read-only memory for all VMs with protected state, i.e. for upcoming TDX VMs as well as ES/SNP VMs. For TDX, it's actually possible to support read-only memory, as TDX uses EPT Violation #VE to reflect the fault into the guest, e.g. 
KVM could configure read-only SPTEs with RX protections and SUPPRESS_VE=0. But there is no strong use case for supporting read-only memslots on TDX, e.g. the main historical usage is to emulate option ROMs, but TDX disallows executing from shared memory. And if someone comes along with a legitimate, strong use case, the restriction can always be lifted for TDX. Don't bother trying to retroactively apply the restriction to SEV-ES VMs that are created as type KVM_X86_DEFAULT_VM. Read-only memslots can't possibly work for SEV-ES, i.e. disallowing such memslots is really just means reporting an error to userspace instead of silently hanging vCPUs. Trying to deal with the ordering between KVM_SEV_INIT and memslot creation isn't worth the marginal benefit it would provide userspace. Fixes: 26c44aa9e076 ("KVM: SEV: define VM types for SEV and SEV-ES") Fixes: 1dfe571c12cf ("KVM: SEV: Add initial SEV-SNP support") Cc: Peter Gonda Cc: Michael Roth Cc: Vishal Annapurve Cc: Ackerly Tng Signed-off-by: Sean Christopherson Message-ID: <20240809190319.1710470-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 5 ++--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 94e7b5a4fafe..4a68cb3eba78 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2192,6 +2192,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, #define kvm_arch_has_private_mem(kvm) false #endif +#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) + static inline u16 kvm_read_ldt(void) { u16 ldt; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 79a6b1a63027..b23c6d48392f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -715,6 +715,13 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) } #endif +#ifndef kvm_arch_has_readonly_mem +static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) +{ + return IS_ENABLED(CONFIG_HAVE_KVM_READONLY_MEM); +} +#endif + struct kvm_memslots { u64 generation; atomic_long_t last_used_slot; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 92901656a0d4..cb2b78e92910 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1578,15 +1578,14 @@ static int check_memory_region_flags(struct kvm *kvm, if (mem->flags & KVM_MEM_GUEST_MEMFD) valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; -#ifdef CONFIG_HAVE_KVM_READONLY_MEM /* * GUEST_MEMFD is incompatible with read-only memslots, as writes to * read-only memslots have emulated MMIO, not page fault, semantics, * and KVM doesn't allow emulated MMIO for private memory. */ - if (!(mem->flags & KVM_MEM_GUEST_MEMFD)) + if (kvm_arch_has_readonly_mem(kvm) && + !(mem->flags & KVM_MEM_GUEST_MEMFD)) valid_flags |= KVM_MEM_READONLY; -#endif if (mem->flags & ~valid_flags) return -EINVAL; From f94511df53bb792e505c98662971434c7995388a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 7 Aug 2024 11:37:31 +0100 Subject: [PATCH 0873/1052] arm64: uaccess: correct thinko in __get_mem_asm() In the CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y version of __get_mem_asm(), we incorrectly use _ASM_EXTABLE_##type##ACCESS_ERR() such that upon a fault the extable fixup handler writes -EFAULT into "%w0", which is the register containing 'x' (the result of the load). 
This was a thinko in commit: 86a6a68febfcf57b ("arm64: start using 'asm goto' for get_user() when available") Prior to that commit _ASM_EXTABLE_##type##ACCESS_ERR_ZERO() was used such that the extable fixup handler wrote -EFAULT into "%w0" (the register containing 'err'), and zero into "%w1" (the register containing 'x'). When the 'err' variable was removed, the extable entry was updated incorrectly. Writing -EFAULT to the value register is unnecessary but benign: * We never want -EFAULT in the value register, and previously this would have been zeroed in the extable fixup handler. * In __get_user_error() the value is overwritten with zero explicitly in the error path. * The asm goto outputs cannot be used when the goto label is taken, as older compilers (e.g. clang < 16.0.0) do not guarantee that asm goto outputs are usable in this path and may use a stale value rather than the value in an output register. Consequently, zeroing in the extable fixup handler is insufficient to ensure callers see zero in the error path. * The expected usage of unsafe_get_user() and get_kernel_nofault() requires that the value is not consumed in the error path. Some versions of GCC would mis-compile asm goto with outputs, and erroneously omit subsequent assignments, breaking the error path handling in __get_user_error(). This was discussed at: https://lore.kernel.org/lkml/ZpfxLrJAOF2YNqCk@J2N7QTR9R3.cambridge.arm.com/ ... and was fixed by removing support for asm goto with outputs on those broken compilers in commit: f2f6a8e887172503 ("init/Kconfig: remove CONFIG_GCC_ASM_GOTO_OUTPUT_WORKAROUND") With that out of the way, we can safely replace the usage of _ASM_EXTABLE_##type##ACCESS_ERR() with _ASM_EXTABLE_##type##ACCESS(), leaving the value register unchanged in the case a fault is taken, as was originally intended. This matches other architectures and matches our __put_mem_asm(). Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20240807103731.2498893-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 28f665e0975a..1aa4ecb73429 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -188,7 +188,7 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) #define __get_mem_asm(load, reg, x, addr, label, type) \ asm_goto_output( \ "1: " load " " reg "0, [%1]\n" \ - _ASM_EXTABLE_##type##ACCESS_ERR(1b, %l2, %w0) \ + _ASM_EXTABLE_##type##ACCESS(1b, %l2) \ : "=r" (x) \ : "r" (addr) : : label) #else From a21dcf0ea8566ebbe011c79d6ed08cdfea771de3 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Mon, 5 Aug 2024 11:30:24 +0800 Subject: [PATCH 0874/1052] arm64: ACPI: NUMA: initialize all values of acpi_early_node_map to NUMA_NO_NODE Currently, only acpi_early_node_map[0] was initialized to NUMA_NO_NODE. To ensure all the values were properly initialized, switch to initialize all of them to NUMA_NO_NODE. 
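As a plain C illustration of why this matters: a single-element brace initializer only covers index 0 and leaves the remaining elements zero-initialized, which here would alias node 0, while the GNU range designator covers the whole array (NUMA_NO_NODE is -1).

	int map_a[4] = { -1 };			/* becomes { -1, 0, 0, 0 }    */
	int map_b[4] = { [0 ... 3] = -1 };	/* becomes { -1, -1, -1, -1 } */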
Fixes: e18962491696 ("arm64: numa: rework ACPI NUMA initialization") Cc: # 4.19.x Reported-by: Andrew Jones Suggested-by: Andrew Jones Signed-off-by: Haibo Xu Reviewed-by: Anshuman Khandual Reviewed-by: Sunil V L Reviewed-by: Andrew Jones Acked-by: Catalin Marinas Acked-by: Lorenzo Pieralisi Reviewed-by: Hanjun Guo Link: https://lore.kernel.org/r/853d7f74aa243f6f5999e203246f0d1ae92d2b61.1722828421.git.haibo1.xu@intel.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/acpi_numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index 0c036a9a3c33..2465f291c7e1 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -27,7 +27,7 @@ #include -static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; +static int acpi_early_node_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE }; int __init acpi_numa_get_nid(unsigned int cpu) { From 1c0e5881691a787a9399a99bff4d56ead6e75e91 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Wed, 14 Aug 2024 10:31:13 +0200 Subject: [PATCH 0875/1052] KVM: SEV: uapi: fix typo in SEV_RET_INVALID_CONFIG "INVALID" is misspelt in "SEV_RET_INAVLID_CONFIG". Since this is part of the UAPI, keep the current definition and add a new one with the fix. Fix-suggested-by: Marc Zyngier Signed-off-by: Amit Shah Message-ID: <20240814083113.21622-1-amit@kernel.org> Signed-off-by: Paolo Bonzini --- include/uapi/linux/psp-sev.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 2289b7c76c59..832c15d9155b 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -51,6 +51,7 @@ typedef enum { SEV_RET_INVALID_PLATFORM_STATE, SEV_RET_INVALID_GUEST_STATE, SEV_RET_INAVLID_CONFIG, + SEV_RET_INVALID_CONFIG = SEV_RET_INAVLID_CONFIG, SEV_RET_INVALID_LEN, SEV_RET_ALREADY_OWNED, SEV_RET_INVALID_CERTIFICATE, From 71833e79a42178d8a50b5081c98c78ace9325628 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 14 Aug 2024 13:16:49 +0100 Subject: [PATCH 0876/1052] i2c: Use IS_REACHABLE() for substituting empty ACPI functions Replace IS_ENABLED() with IS_REACHABLE() to substitute empty stubs for: i2c_acpi_get_i2c_resource() i2c_acpi_client_count() i2c_acpi_find_bus_speed() i2c_acpi_new_device_by_fwnode() i2c_adapter *i2c_acpi_find_adapter_by_handle() i2c_acpi_waive_d0_probe() commit f17c06c6608a ("i2c: Fix conditional for substituting empty ACPI functions") partially fixed this conditional to depend on CONFIG_I2C, but used IS_ENABLED(), which is wrong since CONFIG_I2C is tristate. CONFIG_ACPI is boolean but let's also change it to use IS_REACHABLE() to future-proof it against becoming tristate. Somehow despite testing various combinations of CONFIG_I2C and CONFIG_ACPI we missed the combination CONFIG_I2C=m, CONFIG_ACPI=y. 
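For reference, IS_REACHABLE(CONFIG_FOO) is true when FOO is built-in, or when FOO is a module and the code referencing it is also being built as a module. An abridged sketch of the resulting pattern for one helper (the real header covers several more) looks like:

#if IS_REACHABLE(CONFIG_ACPI) && IS_REACHABLE(CONFIG_I2C)
int i2c_acpi_client_count(struct acpi_device *adev);
#else
static inline int i2c_acpi_client_count(struct acpi_device *adev)
{
	return 0;
}
#endif

With CONFIG_I2C=m and a built-in caller, the empty stub is now substituted instead of a declaration whose symbol is unreachable at link time.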
Signed-off-by: Richard Fitzgerald Fixes: f17c06c6608a ("i2c: Fix conditional for substituting empty ACPI functions") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408141333.gYnaitcV-lkp@intel.com/ Reviewed-by: Takashi Iwai Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 7eedd0c662da..377def497298 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1066,7 +1066,7 @@ static inline int of_i2c_get_board_info(struct device *dev, struct acpi_resource; struct acpi_resource_i2c_serialbus; -#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_I2C) +#if IS_REACHABLE(CONFIG_ACPI) && IS_REACHABLE(CONFIG_I2C) bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); int i2c_acpi_client_count(struct acpi_device *adev); From 3cd740b985963f874a1a094f1969e998b9d05554 Mon Sep 17 00:00:00 2001 From: Tom Hughes Date: Tue, 6 Aug 2024 12:40:52 +0100 Subject: [PATCH 0877/1052] netfilter: allow ipv6 fragments to arrive on different devices Commit 264640fc2c5f4 ("ipv6: distinguish frag queues by device for multicast and link-local packets") modified the ipv6 fragment reassembly logic to distinguish frag queues by device for multicast and link-local packets but in fact only the main reassembly code limits the use of the device to those address types and the netfilter reassembly code uses the device for all packets. This means that if fragments of a packet arrive on different interfaces then netfilter will fail to reassemble them and the fragments will be expired without going any further through the filters. Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units") Signed-off-by: Tom Hughes Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 6f0844c9315d..4120e67a8ce6 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -154,6 +154,10 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))) + key.iif = 0; + q = inet_frag_find(nf_frag->fqdir, &key); if (!q) return NULL; From 61119394631f219e23ce98bcc3eb993a64a8ea64 Mon Sep 17 00:00:00 2001 From: Celeste Liu Date: Thu, 27 Jun 2024 22:23:39 +0800 Subject: [PATCH 0878/1052] riscv: entry: always initialize regs->a0 to -ENOSYS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise when the tracer changes syscall number to -1, the kernel fails to initialize a0 with -ENOSYS and subsequently fails to return the error code of the failed syscall to userspace. For example, it will break strace syscall tampering. Fixes: 52449c17bdd1 ("riscv: entry: set a0 = -ENOSYS only when syscall != -1") Reported-by: "Dmitry V. 
Levin" Reviewed-by: Björn Töpel Cc: stable@vger.kernel.org Signed-off-by: Celeste Liu Link: https://lore.kernel.org/r/20240627142338.5114-2-CoelacanthusHex@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 05a16b1f0aee..51ebfd23e007 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -319,6 +319,7 @@ void do_trap_ecall_u(struct pt_regs *regs) regs->epc += 4; regs->orig_a0 = regs->a0; + regs->a0 = -ENOSYS; riscv_v_vstate_discard(regs); @@ -328,8 +329,7 @@ void do_trap_ecall_u(struct pt_regs *regs) if (syscall >= 0 && syscall < NR_syscalls) syscall_handler(regs, syscall); - else if (syscall != -1) - regs->a0 = -ENOSYS; + /* * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), * so the maximum stack offset is 1k bytes (10 bits). From 57d76bc51fd80824bcc0c84a5b5ec944f1b51edd Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 8 May 2024 21:19:17 +0200 Subject: [PATCH 0879/1052] riscv: change XIP's kernel_map.size to be size of the entire kernel With XIP kernel, kernel_map.size is set to be only the size of data part of the kernel. This is inconsistent with "normal" kernel, who sets it to be the size of the entire kernel. More importantly, XIP kernel fails to boot if CONFIG_DEBUG_VIRTUAL is enabled, because there are checks on virtual addresses with the assumption that kernel_map.size is the size of the entire kernel (these checks are in arch/riscv/mm/physaddr.c). Change XIP's kernel_map.size to be the size of the entire kernel. Signed-off-by: Nam Cao Cc: # v6.1+ Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240508191917.2892064-1-namcao@linutronix.de Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 8b698d9609e7..eb0649a61b4c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -927,7 +927,7 @@ static void __init create_kernel_page_table(pgd_t *pgdir, PMD_SIZE, PAGE_KERNEL_EXEC); /* Map the data in RAM */ - end_va = kernel_map.virt_addr + XIP_OFFSET + kernel_map.size; + end_va = kernel_map.virt_addr + kernel_map.size; for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += PMD_SIZE) create_pgd_mapping(pgdir, va, kernel_map.phys_addr + (va - (kernel_map.virt_addr + XIP_OFFSET)), @@ -1096,7 +1096,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) phys_ram_base = CONFIG_PHYS_RAM_BASE; kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE; - kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_sdata); + kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_start); kernel_map.va_kernel_xip_pa_offset = kernel_map.virt_addr - kernel_map.xiprom; #else From a445699879f989f6700df81f497b70bf94cc6163 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Mon, 5 Aug 2024 11:30:23 +0800 Subject: [PATCH 0880/1052] RISC-V: ACPI: NUMA: initialize all values of acpi_early_node_map to NUMA_NO_NODE Currently, only acpi_early_node_map[0] was initialized to NUMA_NO_NODE. To ensure all the values were properly initialized, switch to initialize all of them to NUMA_NO_NODE. 
Fixes: eabd9db64ea8 ("ACPI: RISCV: Add NUMA support based on SRAT and SLIT") Reported-by: Andrew Jones Suggested-by: Andrew Jones Signed-off-by: Haibo Xu Reviewed-by: Sunil V L Reviewed-by: Andrew Jones Reviewed-by: Hanjun Guo Link: https://lore.kernel.org/r/0d362a8ae50558b95685da4c821b2ae9e8cf78be.1722828421.git.haibo1.xu@intel.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/acpi_numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/acpi_numa.c b/arch/riscv/kernel/acpi_numa.c index 0231482d6946..ff95aeebee3e 100644 --- a/arch/riscv/kernel/acpi_numa.c +++ b/arch/riscv/kernel/acpi_numa.c @@ -28,7 +28,7 @@ #include -static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; +static int acpi_early_node_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE }; int __init acpi_numa_get_nid(unsigned int cpu) { From c42e2f076769c9c1bc5f3f0aa1c2032558e76647 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 9 Aug 2024 14:44:43 -0700 Subject: [PATCH 0881/1052] RISC-V: hwprobe: Add MISALIGNED_PERF key RISCV_HWPROBE_KEY_CPUPERF_0 was mistakenly flagged as a bitmask in hwprobe_key_is_bitmask(), when in reality it was an enum value. This causes problems when used in conjunction with RISCV_HWPROBE_WHICH_CPUS, since SLOW, FAST, and EMULATED have values whose bits overlap with each other. If the caller asked for the set of CPUs that was SLOW or EMULATED, the returned set would also include CPUs that were FAST. Introduce a new hwprobe key, RISCV_HWPROBE_KEY_MISALIGNED_PERF, which returns the same values in response to a direct query (with no flags), but is properly handled as an enumerated value. As a result, SLOW, FAST, and EMULATED are all correctly treated as distinct values under the new key when queried with the WHICH_CPUS flag. Leave the old key in place to avoid disturbing applications which may have already come to rely on the key, with or without its broken behavior with respect to the WHICH_CPUS flag. Fixes: e178bf146e4b ("RISC-V: hwprobe: Introduce which-cpus flag") Signed-off-by: Evan Green Reviewed-by: Charlie Jenkins Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20240809214444.3257596-2-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/arch/riscv/hwprobe.rst | 20 +++++++++++++------- arch/riscv/include/asm/hwprobe.h | 2 +- arch/riscv/include/uapi/asm/hwprobe.h | 1 + arch/riscv/kernel/sys_hwprobe.c | 1 + 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst index 3db60a0911df..a994eed75bde 100644 --- a/Documentation/arch/riscv/hwprobe.rst +++ b/Documentation/arch/riscv/hwprobe.rst @@ -239,8 +239,13 @@ The following keys are defined: ratified in commit 98918c844281 ("Merge pull request #1217 from riscv/zawrs") of riscv-isa-manual. -* :c:macro:`RISCV_HWPROBE_KEY_CPUPERF_0`: A bitmask that contains performance - information about the selected set of processors. +* :c:macro:`RISCV_HWPROBE_KEY_CPUPERF_0`: Deprecated. Returns similar values to + :c:macro:`RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF`, but the key was + mistakenly classified as a bitmask rather than a value. + +* :c:macro:`RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF`: An enum value describing + the performance of misaligned scalar native word accesses on the selected set + of processors. * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNKNOWN`: The performance of misaligned accesses is unknown. 
@@ -249,12 +254,13 @@ The following keys are defined: emulated via software, either in or below the kernel. These accesses are always extremely slow. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned accesses are slower - than equivalent byte accesses. Misaligned accesses may be supported - directly in hardware, or trapped and emulated by software. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned native word + sized accesses are slower than the equivalent quantity of byte accesses. + Misaligned accesses may be supported directly in hardware, or trapped and + emulated by software. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned accesses are faster - than equivalent byte accesses. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned native word + sized accesses are faster than the equivalent quantity of byte accesses. * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNSUPPORTED`: Misaligned accesses are not supported at all and will generate a misaligned address fault. diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index ef01c182af2b..ffb9484531af 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include -#define RISCV_HWPROBE_MAX_KEY 8 +#define RISCV_HWPROBE_MAX_KEY 9 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index b706c8e47b02..635753084275 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -82,6 +82,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 #define RISCV_HWPROBE_KEY_HIGHEST_VIRT_ADDRESS 7 #define RISCV_HWPROBE_KEY_TIME_CSR_FREQ 8 +#define RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF 9 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 8d1b5c35d2a7..2d0f4f6a32c3 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -225,6 +225,7 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, break; case RISCV_HWPROBE_KEY_CPUPERF_0: + case RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF: pair->value = hwprobe_misaligned(cpus); break; From 1f5288874de776412041022607513ffac74ae1a6 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 9 Aug 2024 14:44:44 -0700 Subject: [PATCH 0882/1052] RISC-V: hwprobe: Add SCALAR to misaligned perf defines In preparation for misaligned vector performance hwprobe keys, rename the hwprobe key values associated with misaligned scalar accesses to include the term SCALAR. Leave the old defines in place to maintain source compatibility. This change is intended to be a functional no-op. 
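To illustrate how userspace would consume the new key, a rough sketch (error handling trimmed; assumes a uapi header and kernel that already know the key):

#include <asm/hwprobe.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <unistd.h>

/* true if misaligned scalar accesses are fast on every online CPU */
static bool misaligned_scalar_fast(void)
{
	struct riscv_hwprobe pair = {
		.key = RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF,
	};

	/* cpusetsize == 0 and cpus == NULL queries all online CPUs */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) != 0)
		return false;

	return pair.value == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
}

On an older kernel the key is cleared to -1 with value 0, so the check degrades safely to "not fast".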
Signed-off-by: Evan Green Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240809214444.3257596-3-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/arch/riscv/hwprobe.rst | 28 ++++++++++++---------- arch/riscv/include/uapi/asm/hwprobe.h | 5 ++++ arch/riscv/kernel/sys_hwprobe.c | 10 ++++---- arch/riscv/kernel/traps_misaligned.c | 6 ++--- arch/riscv/kernel/unaligned_access_speed.c | 12 +++++----- 5 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst index a994eed75bde..85b709257918 100644 --- a/Documentation/arch/riscv/hwprobe.rst +++ b/Documentation/arch/riscv/hwprobe.rst @@ -247,23 +247,25 @@ The following keys are defined: the performance of misaligned scalar native word accesses on the selected set of processors. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNKNOWN`: The performance of misaligned - accesses is unknown. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN`: The performance of + misaligned scalar accesses is unknown. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_EMULATED`: Misaligned accesses are - emulated via software, either in or below the kernel. These accesses are - always extremely slow. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED`: Misaligned scalar + accesses are emulated via software, either in or below the kernel. These + accesses are always extremely slow. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned native word - sized accesses are slower than the equivalent quantity of byte accesses. - Misaligned accesses may be supported directly in hardware, or trapped and - emulated by software. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW`: Misaligned scalar native + word sized accesses are slower than the equivalent quantity of byte + accesses. Misaligned accesses may be supported directly in hardware, or + trapped and emulated by software. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned native word - sized accesses are faster than the equivalent quantity of byte accesses. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_FAST`: Misaligned scalar native + word sized accesses are faster than the equivalent quantity of byte + accesses. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNSUPPORTED`: Misaligned accesses are - not supported at all and will generate a misaligned address fault. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED`: Misaligned scalar + accesses are not supported at all and will generate a misaligned address + fault. * :c:macro:`RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE`: An unsigned int which represents the size of the Zicboz block in bytes. diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 635753084275..1e153cda57db 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -83,6 +83,11 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_HIGHEST_VIRT_ADDRESS 7 #define RISCV_HWPROBE_KEY_TIME_CSR_FREQ 8 #define RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF 9 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN 0 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED 1 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW 2 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_FAST 3 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED 4 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. 
*/ /* Flags */ diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 2d0f4f6a32c3..cea0ca2bf2a2 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -178,13 +178,13 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) perf = this_perf; if (perf != this_perf) { - perf = RISCV_HWPROBE_MISALIGNED_UNKNOWN; + perf = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; break; } } if (perf == -1ULL) - return RISCV_HWPROBE_MISALIGNED_UNKNOWN; + return RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; return perf; } @@ -192,12 +192,12 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) static u64 hwprobe_misaligned(const struct cpumask *cpus) { if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS)) - return RISCV_HWPROBE_MISALIGNED_FAST; + return RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; if (IS_ENABLED(CONFIG_RISCV_EMULATED_UNALIGNED_ACCESS) && unaligned_ctl_available()) - return RISCV_HWPROBE_MISALIGNED_EMULATED; + return RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED; - return RISCV_HWPROBE_MISALIGNED_SLOW; + return RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; } #endif diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index b62d5a2f4541..192cd5603e95 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -338,7 +338,7 @@ int handle_misaligned_load(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS - *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_EMULATED; + *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED; #endif if (!unaligned_enabled) @@ -532,13 +532,13 @@ static bool check_unaligned_access_emulated(int cpu) unsigned long tmp_var, tmp_val; bool misaligned_emu_detected; - *mas_ptr = RISCV_HWPROBE_MISALIGNED_UNKNOWN; + *mas_ptr = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; __asm__ __volatile__ ( " "REG_L" %[tmp], 1(%[ptr])\n" : [tmp] "=r" (tmp_val) : [ptr] "r" (&tmp_var) : "memory"); - misaligned_emu_detected = (*mas_ptr == RISCV_HWPROBE_MISALIGNED_EMULATED); + misaligned_emu_detected = (*mas_ptr == RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED); /* * If unaligned_ctl is already set, this means that we detected that all * CPUS uses emulated misaligned access at boot time. If that changed diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c index a9a6bcb02acf..160628a2116d 100644 --- a/arch/riscv/kernel/unaligned_access_speed.c +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -34,9 +34,9 @@ static int check_unaligned_access(void *param) struct page *page = param; void *dst; void *src; - long speed = RISCV_HWPROBE_MISALIGNED_SLOW; + long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; - if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) return 0; /* Make an unaligned destination buffer. */ @@ -95,14 +95,14 @@ static int check_unaligned_access(void *param) } if (word_cycles < byte_cycles) - speed = RISCV_HWPROBE_MISALIGNED_FAST; + speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; ratio = div_u64((byte_cycles * 100), word_cycles); pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n", cpu, ratio / 100, ratio % 100, - (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow"); + (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? 
"fast" : "slow"); per_cpu(misaligned_access_speed, cpu) = speed; @@ -110,7 +110,7 @@ static int check_unaligned_access(void *param) * Set the value of fast_misaligned_access of a CPU. These operations * are atomic to avoid race conditions. */ - if (speed == RISCV_HWPROBE_MISALIGNED_FAST) + if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) cpumask_set_cpu(cpu, &fast_misaligned_access); else cpumask_clear_cpu(cpu, &fast_misaligned_access); @@ -188,7 +188,7 @@ static int riscv_online_cpu(unsigned int cpu) static struct page *buf; /* We are already set since the last check */ - if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) goto exit; buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); From d1a7b382a9d3f0f3e5a80e0be2991c075fa4f618 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Tue, 6 Aug 2024 16:43:24 +0100 Subject: [PATCH 0883/1052] netfilter: nfnetlink: Initialise extack before use in ACKs Add missing extack initialisation when ACKing BATCH_BEGIN and BATCH_END. Fixes: bf2ac490d28c ("netfilter: nfnetlink: Handle ACK flags for batch messages") Signed-off-by: Donald Hunter Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4abf660c7baf..932b3ddb34f1 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -427,8 +427,10 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, nfnl_unlock(subsys_id); - if (nlh->nlmsg_flags & NLM_F_ACK) + if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); nfnl_err_add(&err_list, nlh, 0, &extack); + } while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -577,6 +579,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } else if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); nfnl_err_add(&err_list, nlh, 0, &extack); } } else { From e9767137308daf906496613fd879808a07f006a2 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Tue, 6 Aug 2024 17:16:37 +0100 Subject: [PATCH 0884/1052] netfilter: flowtable: initialise extack before use Fix missing initialisation of extack in flow offload. 
Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Donald Hunter Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index ff1a4e36c2b5..e06bc36f49fe 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -841,8 +841,8 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, struct list_head *block_cb_list) { struct flow_cls_offload cls_flow = {}; + struct netlink_ext_ack extack = {}; struct flow_block_cb *block_cb; - struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; int err, i = 0; From 7d8dc1c7be8d3509e8f5164dd5df64c8e34d7eeb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Aug 2024 21:28:41 +0200 Subject: [PATCH 0885/1052] netfilter: nf_queue: drop packets with cloned unconfirmed conntracks Conntrack assumes an unconfirmed entry (not yet committed to global hash table) has a refcount of 1 and is not visible to other cores. With multicast forwarding this assumption breaks down because such skbs get cloned after being picked up, i.e. ct->use refcount is > 1. Likewise, bridge netfilter will clone broad/mutlicast frames and all frames in case they need to be flood-forwarded during learning phase. For ip multicast forwarding or plain bridge flood-forward this will "work" because packets don't leave softirq and are implicitly serialized. With nfqueue this no longer holds true, the packets get queued and can be reinjected in arbitrary ways. Disable this feature, I see no other solution. After this patch, nfqueue cannot queue packets except the last multicast/broadcast packet. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 6 +++++- net/netfilter/nfnetlink_queue.c | 35 +++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 09f6a773a708..8f9c19d992ac 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -622,8 +622,12 @@ static unsigned int br_nf_local_in(void *priv, if (likely(nf_ct_is_confirmed(ct))) return NF_ACCEPT; + if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + WARN_ON_ONCE(skb_shared(skb)); - WARN_ON_ONCE(refcount_read(&nfct->use) != 1); /* We can't call nf_confirm here, it would create a dependency * on nf_conntrack module. 
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 55e28e1da66e..e0716da256bf 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -820,10 +820,41 @@ static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; - const struct nf_conn *ct = (void *)skb_nfct(entry->skb); + struct nf_conn *ct = (void *)skb_nfct(entry->skb); + unsigned long status; + unsigned int use; - if (ct && ((ct->status & flags) == IPS_DYING)) + if (!ct) + return false; + + status = READ_ONCE(ct->status); + if ((status & flags) == IPS_DYING) return true; + + if (status & IPS_CONFIRMED) + return false; + + /* in some cases skb_clone() can occur after initial conntrack + * pickup, but conntrack assumes exclusive skb->_nfct ownership for + * unconfirmed entries. + * + * This happens for br_netfilter and with ip multicast routing. + * We can't be solved with serialization here because one clone could + * have been queued for local delivery. + */ + use = refcount_read(&ct->ct_general.use); + if (likely(use == 1)) + return false; + + /* Can't decrement further? Exclusive ownership. */ + if (!refcount_dec_not_one(&ct->ct_general.use)) + return false; + + skb_set_nfct(entry->skb, 0); + /* No nf_ct_put(): we already decremented .use and it cannot + * drop down to 0. + */ + return true; #endif return false; } From ea2306f0330c59ac8cd6ba13193497f0a6a02684 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 8 Aug 2024 23:14:43 +0200 Subject: [PATCH 0886/1052] selftests: netfilter: add test for br_netfilter+conntrack+queue combination Trigger cloned skbs leaving softirq protection. This triggers splat without the preceeding change ("netfilter: nf_queue: drop packets with cloned unconfirmed conntracks"): WARNING: at net/netfilter/nf_conntrack_core.c:1198 __nf_conntrack_confirm.. because local delivery and forwarding will race for confirmation. Based on a reproducer script from Yi Chen. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- .../testing/selftests/net/netfilter/Makefile | 1 + .../net/netfilter/br_netfilter_queue.sh | 78 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100755 tools/testing/selftests/net/netfilter/br_netfilter_queue.sh diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index 47945b2b3f92..d13fb5ea3e89 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -7,6 +7,7 @@ MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) TEST_PROGS := br_netfilter.sh bridge_brouter.sh +TEST_PROGS += br_netfilter_queue.sh TEST_PROGS += conntrack_icmp_related.sh TEST_PROGS += conntrack_ipip_mtu.sh TEST_PROGS += conntrack_tcp_unreplied.sh diff --git a/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh new file mode 100755 index 000000000000..6a764d70ab06 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +source lib.sh + +checktool "nft --version" "run test without nft tool" + +cleanup() { + cleanup_all_ns +} + +setup_ns c1 c2 c3 sender + +trap cleanup EXIT + +nf_queue_wait() +{ + grep -q "^ *$1 " "/proc/self/net/netfilter/nfnetlink_queue" +} + +port_add() { + ns="$1" + dev="$2" + a="$3" + + ip link add name "$dev" type veth peer name "$dev" netns "$ns" + + ip -net "$ns" addr add 192.168.1."$a"/24 dev "$dev" + ip -net "$ns" link set "$dev" up + + ip link set "$dev" master br0 + ip link set "$dev" up +} + +[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } + +ip link add br0 type bridge +ip addr add 192.168.1.254/24 dev br0 + +port_add "$c1" "c1" 1 +port_add "$c2" "c2" 2 +port_add "$c3" "c3" 3 +port_add "$sender" "sender" 253 + +ip link set br0 up + +modprobe -q br_netfilter + +sysctl net.bridge.bridge-nf-call-iptables=1 || exit 1 + +ip netns exec "$sender" ping -I sender -c1 192.168.1.1 || exit 1 +ip netns exec "$sender" ping -I sender -c1 192.168.1.2 || exit 2 +ip netns exec "$sender" ping -I sender -c1 192.168.1.3 || exit 3 + +nft -f /dev/stdin < /dev/null & + +busywait 5000 nf_queue_wait + +for i in $(seq 1 5); do conntrack -F > /dev/null 2> /dev/null; sleep 0.1 ; done & +ip netns exec "$sender" ping -I sender -f -c 50 -b 192.168.1.255 + +read t < /proc/sys/kernel/tainted +if [ "$t" -eq 0 ];then + echo PASS: kernel not tainted +else + echo ERROR: kernel is tainted + exit 1 +fi + +exit 0 From e0b6648b0446e59522819c75ba1dcb09e68d3e94 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 9 Aug 2024 15:07:30 +0200 Subject: [PATCH 0887/1052] netfilter: nf_tables: Audit log dump reset after the fact In theory, dumpreset may fail and invalidate the preceeding log message. Fix this and use the occasion to prepare for object reset locking, which benefits from a few unrelated changes: * Add an early call to nfnetlink_unicast if not resetting which effectively skips the audit logging but also unindents it. * Extract the table's name from the netlink attribute (which is verified via earlier table lookup) to not rely upon validity of the looked up table pointer. * Do not use local variable family, it will vanish. 
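The table-name extraction mentioned in the list above leans on the printf-style "%.*s" conversion: kasprintf() prints at most nla_len() bytes of nla_data(), so the audit string is built from the already validated attribute rather than from the looked-up table pointer. A small userspace sketch of the same idiom, with made-up payload and sequence-number values:

#include <stdio.h>

int main(void)
{
	/* Attribute payload as netlink describes it: length-delimited. */
	const char attr_payload[6] = { 'f', 'i', 'l', 't', 'e', 'r' };
	int attr_len = sizeof(attr_payload);	/* what nla_len() would report */
	unsigned int base_seq = 42;		/* stand-in for nft_net->base_seq */
	char buf[64];

	/* Same shape as kasprintf(GFP_ATOMIC, "%.*s:%u", ...) in the patch. */
	snprintf(buf, sizeof(buf), "%.*s:%u", attr_len, attr_payload, base_seq);
	printf("audit string: %s\n", buf);
	return 0;
}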
Fixes: 8e6cf365e1d5 ("audit: log nftables configuration change events") Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 481ee78e77bc..4fa132715fcc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8055,6 +8055,7 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { + const struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; @@ -8064,6 +8065,7 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, struct sk_buff *skb2; bool reset = false; u32 objtype; + char *buf; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { @@ -8102,27 +8104,23 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) reset = true; - if (reset) { - const struct nftables_pernet *nft_net; - char *buf; - - nft_net = nft_pernet(net); - buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq); - - audit_log_nfcfg(buf, - family, - 1, - AUDIT_NFT_OP_OBJ_RESET, - GFP_ATOMIC); - kfree(buf); - } - err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) goto err_fill_obj_info; + if (!reset) + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + + buf = kasprintf(GFP_ATOMIC, "%.*s:%u", + nla_len(nla[NFTA_OBJ_TABLE]), + (char *)nla_data(nla[NFTA_OBJ_TABLE]), + nft_net->base_seq); + audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, + AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); + kfree(buf); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_obj_info: From 69fc3e9e90f1afc11f4015e6b75d18ab9acee348 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 9 Aug 2024 15:07:31 +0200 Subject: [PATCH 0888/1052] netfilter: nf_tables: Introduce nf_tables_getobj_single Outsource the reply skb preparation for non-dump getrule requests into a distinct function. Prep work for object reset locking. 
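The refactor also changes the calling convention: rather than filling a reply skb and returning an int, the new helper hands back either a valid sk_buff pointer or an errno encoded with ERR_PTR(), which callers test with IS_ERR() and unpack with PTR_ERR(). Below is a rough userspace re-implementation of that kernel idiom, purely to illustrate the pattern; all names in it are invented:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct reply {
	int payload;
};

static struct reply one_reply = { .payload = 7 };

/* Hypothetical single-object lookup: succeeds only for id 0. */
static struct reply *get_single(int id)
{
	if (id != 0)
		return ERR_PTR(-ENOENT);
	return &one_reply;
}

int main(void)
{
	struct reply *r = get_single(1);

	if (IS_ERR(r))
		printf("lookup failed: %ld\n", PTR_ERR(r));

	r = get_single(0);
	if (!IS_ERR(r))
		printf("payload: %d\n", r->payload);
	return 0;
}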
Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 85 ++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4fa132715fcc..c12c9cae784d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8052,10 +8052,10 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, - const struct nlattr * const nla[]) +static struct sk_buff * +nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, + const struct nlattr * const nla[], bool reset) { - const struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; @@ -8063,11 +8063,51 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, struct net *net = info->net; struct nft_object *obj; struct sk_buff *skb2; - bool reset = false; u32 objtype; - char *buf; int err; + if (!nla[NFTA_OBJ_NAME] || + !nla[NFTA_OBJ_TYPE]) + return ERR_PTR(-EINVAL); + + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); + return ERR_CAST(table); + } + + objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (IS_ERR(obj)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); + return ERR_CAST(obj); + } + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb2) + return ERR_PTR(-ENOMEM); + + err = nf_tables_fill_obj_info(skb2, net, portid, + info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, + family, table, obj, reset); + if (err < 0) { + kfree_skb(skb2); + return ERR_PTR(err); + } + + return skb2; +} + +static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + bool reset = false; + char *buf; + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_obj_start, @@ -8080,35 +8120,12 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - if (!nla[NFTA_OBJ_NAME] || - !nla[NFTA_OBJ_TYPE]) - return -EINVAL; - - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0); - if (IS_ERR(table)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); - return PTR_ERR(table); - } - - objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); - if (IS_ERR(obj)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); - return PTR_ERR(obj); - } - - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) reset = true; - err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, - info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, - family, table, obj, reset); - if (err < 0) - goto err_fill_obj_info; + skb2 = nf_tables_getobj_single(portid, info, nla, reset); + if (IS_ERR(skb2)) + return PTR_ERR(skb2); if (!reset) return nfnetlink_unicast(skb2, 
net, NETLINK_CB(skb).portid); @@ -8121,11 +8138,7 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); - return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); - -err_fill_obj_info: - kfree_skb(skb2); - return err; + return nfnetlink_unicast(skb2, net, portid); } static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) From bd662c4218f9648e888bebde9468146965f3f8a0 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 9 Aug 2024 15:07:32 +0200 Subject: [PATCH 0889/1052] netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests Objects' dump callbacks are not concurrency-safe per-se with reset bit set. If two CPUs perform a reset at the same time, at least counter and quota objects suffer from value underrun. Prevent this by introducing dedicated locking callbacks for nfnetlink and the asynchronous dump handling to serialize access. Fixes: 43da04a593d8 ("netfilter: nf_tables: atomic dump and reset for stateful objects") Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 74 ++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c12c9cae784d..0a2f79346958 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8020,6 +8020,19 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int nf_tables_dumpreset_obj(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + int ret; + + mutex_lock(&nft_net->commit_mutex); + ret = nf_tables_dump_obj(skb, cb); + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_obj_start(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; @@ -8036,12 +8049,18 @@ static int nf_tables_dump_obj_start(struct netlink_callback *cb) if (nla[NFTA_OBJ_TYPE]) ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) - ctx->reset = true; - return 0; } +static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb) +{ + struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; + + ctx->reset = true; + + return nf_tables_dump_obj_start(cb); +} + static int nf_tables_dump_obj_done(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; @@ -8101,12 +8120,8 @@ nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - struct nftables_pernet *nft_net = nft_pernet(info->net); u32 portid = NETLINK_CB(skb).portid; - struct net *net = info->net; struct sk_buff *skb2; - bool reset = false; - char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -8120,15 +8135,46 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) - reset = true; - - skb2 = nf_tables_getobj_single(portid, info, nla, reset); + skb2 = nf_tables_getobj_single(portid, info, nla, false); if (IS_ERR(skb2)) return PTR_ERR(skb2); - if (!reset) - return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + return 
nfnetlink_unicast(skb2, info->net, portid); +} + +static int nf_tables_getobj_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + char *buf; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dumpreset_obj_start, + .dump = nf_tables_dumpreset_obj, + .done = nf_tables_dump_obj_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + skb2 = nf_tables_getobj_single(portid, info, nla, true); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); + + if (IS_ERR(skb2)) + return PTR_ERR(skb2); buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), @@ -9421,7 +9467,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call = nf_tables_getobj, + .call = nf_tables_getobj_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, From 14d069d92951a3e150c0a81f2ca3b93e54da913b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 13 Aug 2024 09:12:53 -0700 Subject: [PATCH 0890/1052] i2c: tegra: Do not mark ACPI devices as irq safe On ACPI machines, the tegra i2c module encounters an issue due to a mutex being called inside a spinlock. This leads to the following bug: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:585 ... Call trace: __might_sleep __mutex_lock_common mutex_lock_nested acpi_subsys_runtime_resume rpm_resume tegra_i2c_xfer The problem arises because during __pm_runtime_resume(), the spinlock &dev->power.lock is acquired before rpm_resume() is called. Later, rpm_resume() invokes acpi_subsys_runtime_resume(), which relies on mutexes, triggering the error. To address this issue, devices on ACPI are now marked as not IRQ-safe, considering the dependency of acpi_subsys_runtime_resume() on mutexes. Fixes: bd2fdedbf2ba ("i2c: tegra: Add the ACPI support") Cc: # v5.17+ Co-developed-by: Michael van der Westhuizen Signed-off-by: Michael van der Westhuizen Signed-off-by: Breno Leitao Reviewed-by: Dmitry Osipenko Reviewed-by: Andy Shevchenko Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-tegra.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 85b31edc558d..1df5b4204142 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1802,9 +1802,9 @@ static int tegra_i2c_probe(struct platform_device *pdev) * domain. * * VI I2C device shouldn't be marked as IRQ-safe because VI I2C won't - * be used for atomic transfers. + * be used for atomic transfers. ACPI device is not IRQ safe also. 
*/ - if (!IS_VI(i2c_dev)) + if (!IS_VI(i2c_dev) && !has_acpi_companion(i2c_dev->dev)) pm_runtime_irq_safe(i2c_dev->dev); pm_runtime_enable(i2c_dev->dev); From a4ae5c31e0f28d2c737583a06d4b29ffd6bbd622 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Tue, 13 Aug 2024 14:45:05 +0200 Subject: [PATCH 0891/1052] selftests/bpf: convert get_current_cgroup_id_user to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_current_cgroup_id_user allows testing for bpf_get_current_cgroup_id() bpf API but is not integrated into test_progs, and so is not tested automatically in CI. Convert it to the test_progs framework to allow running it automatically. The most notable differences with the old test are the following: - the new test relies on autoattach instead of manually hooking/enabling the targeted tracepoint through perf_event, which reduces quite a lot the test code size - it also accesses bpf prog data through global variables instead of maps - sleep duration passed to nanosleep syscall has been reduced to its minimum to not impact overall CI duration (we only care about the syscall being properly triggered, not about the passed duration) Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-1-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/get_cgroup_id_user.c | 151 ------------------ .../prog_tests/cgroup_get_current_cgroup_id.c | 46 ++++++ .../selftests/bpf/progs/get_cgroup_id_kern.c | 26 +-- 5 files changed, 51 insertions(+), 176 deletions(-) delete mode 100644 tools/testing/selftests/bpf/get_cgroup_id_user.c create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 8f14d8faeb0b..7d4d5d3e0210 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -19,7 +19,6 @@ test_sock urandom_read test_sockmap test_lirc_mode2_user -get_cgroup_id_user test_skb_cgroup_id_user test_cgroup_storage test_flow_dissector diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f54185e96a95..8beade034caf 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -67,7 +67,7 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_sock test_sockmap get_cgroup_id_user \ + test_sock test_sockmap \ test_cgroup_storage \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 @@ -295,7 +295,6 @@ $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) -$(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_cgroup_storage: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c deleted file mode 100644 index aefd83ebdcd7..000000000000 --- 
a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "cgroup_helpers.h" -#include "testing_helpers.h" - -#define CHECK(condition, tag, format...) ({ \ - int __ret = !!(condition); \ - if (__ret) { \ - printf("%s:FAIL:%s ", __func__, tag); \ - printf(format); \ - } else { \ - printf("%s:PASS:%s\n", __func__, tag); \ - } \ - __ret; \ -}) - -static int bpf_find_map(const char *test, struct bpf_object *obj, - const char *name) -{ - struct bpf_map *map; - - map = bpf_object__find_map_by_name(obj, name); - if (!map) - return -1; - return bpf_map__fd(map); -} - -#define TEST_CGROUP "/test-bpf-get-cgroup-id/" - -int main(int argc, char **argv) -{ - const char *probe_name = "syscalls/sys_enter_nanosleep"; - const char *file = "get_cgroup_id_kern.bpf.o"; - int err, bytes, efd, prog_fd, pmu_fd; - int cgroup_fd, cgidmap_fd, pidmap_fd; - struct perf_event_attr attr = {}; - struct bpf_object *obj; - __u64 kcgid = 0, ucgid; - __u32 key = 0, pid; - int exit_code = 1; - char buf[256]; - const struct timespec req = { - .tv_sec = 1, - .tv_nsec = 0, - }; - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - if (CHECK(cgroup_fd < 0, "cgroup_setup_and_join", "err %d errno %d\n", cgroup_fd, errno)) - return 1; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); - if (CHECK(err, "bpf_prog_test_load", "err %d errno %d\n", err, errno)) - goto cleanup_cgroup_env; - - cgidmap_fd = bpf_find_map(__func__, obj, "cg_ids"); - if (CHECK(cgidmap_fd < 0, "bpf_find_map", "err %d errno %d\n", - cgidmap_fd, errno)) - goto close_prog; - - pidmap_fd = bpf_find_map(__func__, obj, "pidmap"); - if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n", - pidmap_fd, errno)) - goto close_prog; - - pid = getpid(); - bpf_map_update_elem(pidmap_fd, &key, &pid, 0); - - if (access("/sys/kernel/tracing/trace", F_OK) == 0) { - snprintf(buf, sizeof(buf), - "/sys/kernel/tracing/events/%s/id", probe_name); - } else { - snprintf(buf, sizeof(buf), - "/sys/kernel/debug/tracing/events/%s/id", probe_name); - } - efd = open(buf, O_RDONLY, 0); - if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) - goto close_prog; - bytes = read(efd, buf, sizeof(buf)); - close(efd); - if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read", - "bytes %d errno %d\n", bytes, errno)) - goto close_prog; - - attr.config = strtol(buf, NULL, 0); - attr.type = PERF_TYPE_TRACEPOINT; - attr.sample_type = PERF_SAMPLE_RAW; - attr.sample_period = 1; - attr.wakeup_events = 1; - - /* attach to this pid so the all bpf invocations will be in the - * cgroup associated with this pid. 
- */ - pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0); - if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd, - errno)) - goto close_prog; - - err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); - if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err, - errno)) - goto close_pmu; - - err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); - if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err, - errno)) - goto close_pmu; - - /* trigger some syscalls */ - syscall(__NR_nanosleep, &req, NULL); - - err = bpf_map_lookup_elem(cgidmap_fd, &key, &kcgid); - if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno)) - goto close_pmu; - - ucgid = get_cgroup_id(TEST_CGROUP); - if (CHECK(kcgid != ucgid, "compare_cgroup_id", - "kern cgid %llx user cgid %llx", kcgid, ucgid)) - goto close_pmu; - - exit_code = 0; - printf("%s:PASS\n", argv[0]); - -close_pmu: - close(pmu_fd); -close_prog: - bpf_object__close(obj); -cleanup_cgroup_env: - cleanup_cgroup_environment(); - return exit_code; -} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c b/tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c new file mode 100644 index 000000000000..7a1643b03bf3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "test_progs.h" +#include "cgroup_helpers.h" +#include "get_cgroup_id_kern.skel.h" + +#define TEST_CGROUP "/test-bpf-get-cgroup-id/" + +void test_cgroup_get_current_cgroup_id(void) +{ + struct get_cgroup_id_kern *skel; + const struct timespec req = { + .tv_sec = 0, + .tv_nsec = 1, + }; + int cgroup_fd; + __u64 ucgid; + + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup switch")) + return; + + skel = get_cgroup_id_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "load program")) + goto cleanup_cgroup; + + if (!ASSERT_OK(get_cgroup_id_kern__attach(skel), "attach bpf program")) + goto cleanup_progs; + + skel->bss->expected_pid = getpid(); + /* trigger the syscall on which is attached the tested prog */ + if (!ASSERT_OK(syscall(__NR_nanosleep, &req, NULL), "nanosleep")) + goto cleanup_progs; + + ucgid = get_cgroup_id(TEST_CGROUP); + + ASSERT_EQ(skel->bss->cg_id, ucgid, "compare cgroup ids"); + +cleanup_progs: + get_cgroup_id_kern__destroy(skel); +cleanup_cgroup: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c index 68587b1de34e..30fd504856c7 100644 --- a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c +++ b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c @@ -4,34 +4,16 @@ #include #include -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, __u32); - __type(value, __u64); -} cg_ids SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, __u32); - __type(value, __u32); -} pidmap SEC(".maps"); +__u64 cg_id; +__u64 expected_pid; SEC("tracepoint/syscalls/sys_enter_nanosleep") int trace(void *ctx) { __u32 pid = bpf_get_current_pid_tgid(); - __u32 key = 0, *expected_pid; - __u64 *val; - expected_pid = bpf_map_lookup_elem(&pidmap, &key); - if (!expected_pid || *expected_pid != pid) - return 0; - - val = bpf_map_lookup_elem(&cg_ids, &key); - if (val) - *val = bpf_get_current_cgroup_id(); + if (expected_pid == pid) + cg_id = 
bpf_get_current_cgroup_id(); return 0; } From 37a14cfd667a22662867b986925f1ad4205c17df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Tue, 13 Aug 2024 14:45:06 +0200 Subject: [PATCH 0892/1052] selftests/bpf: convert test_cgroup_storage to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_cgroup_storage is currently a standalone program which is not run when executing test_progs. Convert it to the test_progs framework so it can be automatically executed in CI. The conversion led to the following changes: - converted the raw bpf program in the userspace test file into a dedicated test program in progs/ dir - reduced the scope of cgroup_storage test: the content from this test overlaps with some other tests already present in test_progs, most notably netcnt and cgroup_storage_multi*. Those tests already check extensively local storage, per-cpu local storage, cgroups interaction, etc. So the new test only keep the part testing that the program return code (based on map content) properly leads to packet being passed or dropped. Reviewed-by: Alan Maguire Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-2-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 2 - .../selftests/bpf/prog_tests/cgroup_storage.c | 96 ++++++++++ .../selftests/bpf/progs/cgroup_storage.c | 24 +++ .../selftests/bpf/test_cgroup_storage.c | 174 ------------------ 5 files changed, 120 insertions(+), 177 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_storage.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_storage.c delete mode 100644 tools/testing/selftests/bpf/test_cgroup_storage.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 7d4d5d3e0210..fd7ae37024e2 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -20,7 +20,6 @@ urandom_read test_sockmap test_lirc_mode2_user test_skb_cgroup_id_user -test_cgroup_storage test_flow_dissector flow_dissector_load test_tcpnotify_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8beade034caf..99f8d6329694 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -68,7 +68,6 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_sock test_sockmap \ - test_cgroup_storage \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 TEST_INST_SUBDIRS := no_alu32 @@ -295,7 +294,6 @@ $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) -$(OUTPUT)/test_cgroup_storage: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tag: $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_storage.c b/tools/testing/selftests/bpf/prog_tests/cgroup_storage.c new file mode 100644 index 000000000000..cf395715ced4 --- /dev/null +++ 
b/tools/testing/selftests/bpf/prog_tests/cgroup_storage.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "cgroup_helpers.h" +#include "network_helpers.h" +#include "cgroup_storage.skel.h" + +#define TEST_CGROUP "/test-bpf-cgroup-storage-buf/" +#define TEST_NS "cgroup_storage_ns" +#define PING_CMD "ping localhost -c 1 -W 1 -q" + +static int setup_network(struct nstoken **token) +{ + SYS(fail, "ip netns add %s", TEST_NS); + *token = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(*token, "open netns")) + goto cleanup_ns; + SYS(cleanup_ns, "ip link set lo up"); + + return 0; + +cleanup_ns: + SYS_NOFAIL("ip netns del %s", TEST_NS); +fail: + return -1; +} + +static void cleanup_network(struct nstoken *ns) +{ + close_netns(ns); + SYS_NOFAIL("ip netns del %s", TEST_NS); +} + +void test_cgroup_storage(void) +{ + struct bpf_cgroup_storage_key key; + struct cgroup_storage *skel; + struct nstoken *ns = NULL; + unsigned long long value; + int cgroup_fd; + int err; + + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (!ASSERT_OK_FD(cgroup_fd, "create cgroup")) + return; + + if (!ASSERT_OK(setup_network(&ns), "setup network")) + goto cleanup_cgroup; + + skel = cgroup_storage__open_and_load(); + if (!ASSERT_OK_PTR(skel, "load program")) + goto cleanup_network; + + skel->links.bpf_prog = + bpf_program__attach_cgroup(skel->progs.bpf_prog, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.bpf_prog, "attach program")) + goto cleanup_progs; + + /* Check that one out of every two packets is dropped */ + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "first ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_NEQ(err, 0, "second ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "third ping"); + + err = bpf_map__get_next_key(skel->maps.cgroup_storage, NULL, &key, + sizeof(key)); + if (!ASSERT_OK(err, "get first key")) + goto cleanup_progs; + err = bpf_map__lookup_elem(skel->maps.cgroup_storage, &key, sizeof(key), + &value, sizeof(value), 0); + if (!ASSERT_OK(err, "first packet count read")) + goto cleanup_progs; + + /* Add one to the packet counter, check again packet filtering */ + value++; + err = bpf_map__update_elem(skel->maps.cgroup_storage, &key, sizeof(key), + &value, sizeof(value), 0); + if (!ASSERT_OK(err, "increment packet counter")) + goto cleanup_progs; + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "fourth ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_NEQ(err, 0, "fifth ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "sixth ping"); + +cleanup_progs: + cgroup_storage__destroy(skel); +cleanup_network: + cleanup_network(ns); +cleanup_cgroup: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_storage.c b/tools/testing/selftests/bpf/progs/cgroup_storage.c new file mode 100644 index 000000000000..db1e4d2d3281 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_storage.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, __u64); +} cgroup_storage SEC(".maps"); + +SEC("cgroup_skb/egress") +int bpf_prog(struct __sk_buff *skb) +{ + __u64 *counter; + + counter = bpf_get_local_storage(&cgroup_storage, 0); + __sync_fetch_and_add(counter, 1); + + /* Drop one out of every two packets */ + return (*counter & 1); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c deleted 
file mode 100644 index 0861ea60dcdd..000000000000 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -#include "bpf_util.h" -#include "cgroup_helpers.h" -#include "testing_helpers.h" - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -#define TEST_CGROUP "/test-bpf-cgroup-storage-buf/" - -int main(int argc, char **argv) -{ - struct bpf_insn prog[] = { - BPF_LD_MAP_FD(BPF_REG_1, 0), /* percpu map fd */ - BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - BPF_FUNC_get_local_storage), - BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1), - BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), - - BPF_LD_MAP_FD(BPF_REG_1, 0), /* map fd */ - BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - BPF_FUNC_get_local_storage), - BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), - BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), - BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1), - BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), - BPF_EXIT_INSN(), - }; - size_t insns_cnt = ARRAY_SIZE(prog); - int error = EXIT_FAILURE; - int map_fd, percpu_map_fd, prog_fd, cgroup_fd; - struct bpf_cgroup_storage_key key; - unsigned long long value; - unsigned long long *percpu_value; - int cpu, nproc; - - nproc = bpf_num_possible_cpus(); - percpu_value = malloc(sizeof(*percpu_value) * nproc); - if (!percpu_value) { - printf("Not enough memory for per-cpu area (%d cpus)\n", nproc); - goto err; - } - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - map_fd = bpf_map_create(BPF_MAP_TYPE_CGROUP_STORAGE, NULL, sizeof(key), - sizeof(value), 0, NULL); - if (map_fd < 0) { - printf("Failed to create map: %s\n", strerror(errno)); - goto out; - } - - percpu_map_fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, NULL, - sizeof(key), sizeof(value), 0, NULL); - if (percpu_map_fd < 0) { - printf("Failed to create map: %s\n", strerror(errno)); - goto out; - } - - prog[0].imm = percpu_map_fd; - prog[7].imm = map_fd; - prog_fd = bpf_test_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); - if (prog_fd < 0) { - printf("Failed to load bpf program: %s\n", bpf_log_buf); - goto out; - } - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - - /* Attach the bpf program */ - if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) { - printf("Failed to attach bpf program\n"); - goto err; - } - - if (bpf_map_get_next_key(map_fd, NULL, &key)) { - printf("Failed to get the first key in cgroup storage\n"); - goto err; - } - - if (bpf_map_lookup_elem(map_fd, &key, &value)) { - printf("Failed to lookup cgroup storage 0\n"); - goto err; - } - - for (cpu = 0; cpu < nproc; cpu++) - percpu_value[cpu] = 1000; - - if (bpf_map_update_elem(percpu_map_fd, &key, percpu_value, 0)) { - printf("Failed to update the data in the cgroup storage\n"); - goto err; - } - - /* Every second packet should be dropped */ - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null")); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - - /* Check the counter in the cgroup local storage */ - if (bpf_map_lookup_elem(map_fd, &key, &value)) { - printf("Failed to lookup cgroup storage\n"); - goto err; - } - - if (value != 3) { - printf("Unexpected data 
in the cgroup storage: %llu\n", value); - goto err; - } - - /* Bump the counter in the cgroup local storage */ - value++; - if (bpf_map_update_elem(map_fd, &key, &value, 0)) { - printf("Failed to update the data in the cgroup storage\n"); - goto err; - } - - /* Every second packet should be dropped */ - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null")); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - - /* Check the final value of the counter in the cgroup local storage */ - if (bpf_map_lookup_elem(map_fd, &key, &value)) { - printf("Failed to lookup the cgroup storage\n"); - goto err; - } - - if (value != 7) { - printf("Unexpected data in the cgroup storage: %llu\n", value); - goto err; - } - - /* Check the final value of the counter in the percpu local storage */ - - for (cpu = 0; cpu < nproc; cpu++) - percpu_value[cpu] = 0; - - if (bpf_map_lookup_elem(percpu_map_fd, &key, percpu_value)) { - printf("Failed to lookup the per-cpu cgroup storage\n"); - goto err; - } - - value = 0; - for (cpu = 0; cpu < nproc; cpu++) - value += percpu_value[cpu]; - - if (value != nproc * 1000 + 6) { - printf("Unexpected data in the per-cpu cgroup storage\n"); - goto err; - } - - error = 0; - printf("test_cgroup_storage:PASS\n"); - -err: - cleanup_cgroup_environment(); - free(percpu_value); - -out: - return error; -} From 7b4400a0a69b8255a2afed0bead0ce2c2c457545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Tue, 13 Aug 2024 14:45:07 +0200 Subject: [PATCH 0893/1052] selftests/bpf: add proper section name to bpf prog and rename it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_skb_cgroup_id_kern.c is currently involved in a manual test. In its current form, it can not be used with the auto-generated skeleton APIs, because the section name is not valid to allow libbpf to deduce the program type. Update section name to allow skeleton APIs usage. Also rename the program name to make it shorter and more straighforward regarding the API it is testing. 
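For context, a recognised section prefix is all libbpf needs to deduce the program type when a skeleton is opened, so no manual bpf_program__set_type() call is required. A minimal, hypothetical scheduling-class program (not the converted test itself) along these lines is enough:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

/* "tc" is a section name libbpf maps to BPF_PROG_TYPE_SCHED_CLS;
 * an arbitrary name such as "cgroup_id_logger" is not. */
SEC("tc")
int minimal_tc_prog(struct __sk_buff *skb)
{
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";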
While doing so, make sure that test_skb_cgroup_id.sh passes to get a working reference before converting it to test_progs - update the obj name - fix loading issue (verifier rejecting the program when loaded through tc, because of map not found), by preloading the whole obj with bpftool Reviewed-by: Alan Maguire Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-3-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- .../{test_skb_cgroup_id_kern.c => cgroup_ancestor.c} | 2 +- tools/testing/selftests/bpf/test_skb_cgroup_id.sh | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) rename tools/testing/selftests/bpf/progs/{test_skb_cgroup_id_kern.c => cgroup_ancestor.c} (97%) diff --git a/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c similarity index 97% rename from tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c rename to tools/testing/selftests/bpf/progs/cgroup_ancestor.c index 37aacc66cd68..4879645f5827 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c +++ b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c @@ -28,7 +28,7 @@ static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) bpf_map_update_elem(&cgroup_ids, &level, &id, 0); } -SEC("cgroup_id_logger") +SEC("tc") int log_cgroup_id(struct __sk_buff *skb) { /* Loop unroll can't be used here due to [1]. Unrolling manually. diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh index 515c2eafc97f..d7dad49175c2 100755 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh @@ -30,8 +30,10 @@ setup() wait_for_ip tc qdisc add dev ${TEST_IF} clsact - tc filter add dev ${TEST_IF} egress bpf obj ${BPF_PROG_OBJ} \ - sec ${BPF_PROG_SECTION} da + mkdir -p /sys/fs/bpf/${BPF_PROG_PIN} + bpftool prog loadall ${BPF_PROG_OBJ} /sys/fs/bpf/${BPF_PROG_PIN} type tc + tc filter add dev ${TEST_IF} egress bpf da object-pinned \ + /sys/fs/bpf/${BPF_PROG_PIN}/${BPF_PROG_NAME} BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ awk '/ id / {sub(/.* id /, "", $0); print($1)}') @@ -41,6 +43,7 @@ cleanup() { ip link del ${TEST_IF} 2>/dev/null || : ip link del ${TEST_IF_PEER} 2>/dev/null || : + rm -rf /sys/fs/bpf/${BPF_PROG_PIN} } main() @@ -54,8 +57,9 @@ DIR=$(dirname $0) TEST_IF="test_cgid_1" TEST_IF_PEER="test_cgid_2" MAX_PING_TRIES=5 -BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.bpf.o" -BPF_PROG_SECTION="cgroup_id_logger" +BPF_PROG_PIN="cgroup_ancestor" +BPF_PROG_OBJ="${DIR}/${BPF_PROG_PIN}.bpf.o" +BPF_PROG_NAME="log_cgroup_id" BPF_PROG_ID=0 PROG="${DIR}/test_skb_cgroup_id_user" type ping6 >/dev/null 2>&1 && PING6="ping6" || PING6="ping -6" From f957c230e173cf227566686015016d05f7102d27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Tue, 13 Aug 2024 14:45:08 +0200 Subject: [PATCH 0894/1052] selftests/bpf: convert test_skb_cgroup_id_user to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_skb_cgroup_id_user allows testing skb cgroup id retrieval at different levels, but is not integrated in test_progs, so it is not run automatically in CI. 
The test overlaps a bit with cgroup_skb_sk_lookup_kern, which is integrated in test_progs and test extensively skb cgroup helpers, but there is still one major difference between the two tests which justifies the conversion: cgroup_skb_sk_lookup_kern deals with a BPF_PROG_TYPE_CGROUP_SKB (attached on a cgroup), while test_skb_cgroup_id_user deals with a BPF_PROG_TYPE_SCHED_CLS (attached on a qdisc) Convert test_skb_cgroup_id_user into test_progs framework in order to run it automatically in CI. The main differences with the original test are the following: - rename the test to make it shorter and more straightforward regarding tested feature - the wrapping shell script has been dropped since every setup step is now handled in the main C test file - the test has been renamed for a shorter name and reflecting the tested API - add dedicated assert log per level to ease test failure debugging - use global variables instead of maps to access bpf prog data Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-4-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 +- .../bpf/prog_tests/cgroup_ancestor.c | 141 ++++++++++++++ .../selftests/bpf/progs/cgroup_ancestor.c | 41 ++-- .../selftests/bpf/test_skb_cgroup_id.sh | 67 ------- .../selftests/bpf/test_skb_cgroup_id_user.c | 183 ------------------ 6 files changed, 160 insertions(+), 276 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c delete mode 100755 tools/testing/selftests/bpf/test_skb_cgroup_id.sh delete mode 100644 tools/testing/selftests/bpf/test_skb_cgroup_id_user.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index fd7ae37024e2..99ffea1fa5c6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -19,7 +19,6 @@ test_sock urandom_read test_sockmap test_lirc_mode2_user -test_skb_cgroup_id_user test_flow_dissector flow_dissector_load test_tcpnotify_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 99f8d6329694..163b11014158 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -137,7 +137,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_skb_cgroup_id_user \ +TEST_GEN_PROGS_EXTENDED = \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \ @@ -290,7 +290,6 @@ JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o NETWORK_HELPERS := $(OUTPUT)/network_helpers.o -$(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c b/tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c new file mode 100644 index 000000000000..9250a1e9f9af --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "test_progs.h" +#include "network_helpers.h" 
+#include "cgroup_helpers.h" +#include "cgroup_ancestor.skel.h" + +#define CGROUP_PATH "/skb_cgroup_test" +#define TEST_NS "cgroup_ancestor_ns" +#define NUM_CGROUP_LEVELS 4 +#define WAIT_AUTO_IP_MAX_ATTEMPT 10 +#define DST_ADDR "::1" +#define DST_PORT 1234 +#define MAX_ASSERT_NAME 32 + +struct test_data { + struct cgroup_ancestor *skel; + struct bpf_tc_hook qdisc; + struct bpf_tc_opts tc_attach; + struct nstoken *ns; +}; + +static int send_datagram(void) +{ + unsigned char buf[] = "some random test data"; + struct sockaddr_in6 addr = { .sin6_family = AF_INET6, + .sin6_port = htons(DST_PORT), }; + int sock, n; + + if (!ASSERT_EQ(inet_pton(AF_INET6, DST_ADDR, &addr.sin6_addr), 1, + "inet_pton")) + return -1; + + sock = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_OK_FD(sock, "create socket")) + return sock; + + if (!ASSERT_OK(connect(sock, &addr, sizeof(addr)), "connect")) { + close(sock); + return -1; + } + + n = sendto(sock, buf, sizeof(buf), 0, (const struct sockaddr *)&addr, + sizeof(addr)); + close(sock); + return ASSERT_EQ(n, sizeof(buf), "send data") ? 0 : -1; +} + +static int setup_network(struct test_data *t) +{ + SYS(fail, "ip netns add %s", TEST_NS); + t->ns = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(t->ns, "open netns")) + goto cleanup_ns; + + SYS(close_ns, "ip link set lo up"); + + memset(&t->qdisc, 0, sizeof(t->qdisc)); + t->qdisc.sz = sizeof(t->qdisc); + t->qdisc.attach_point = BPF_TC_EGRESS; + t->qdisc.ifindex = if_nametoindex("lo"); + if (!ASSERT_NEQ(t->qdisc.ifindex, 0, "if_nametoindex")) + goto close_ns; + if (!ASSERT_OK(bpf_tc_hook_create(&t->qdisc), "qdisc add")) + goto close_ns; + + memset(&t->tc_attach, 0, sizeof(t->tc_attach)); + t->tc_attach.sz = sizeof(t->tc_attach); + t->tc_attach.prog_fd = bpf_program__fd(t->skel->progs.log_cgroup_id); + if (!ASSERT_OK(bpf_tc_attach(&t->qdisc, &t->tc_attach), "filter add")) + goto cleanup_qdisc; + + return 0; + +cleanup_qdisc: + bpf_tc_hook_destroy(&t->qdisc); +close_ns: + close_netns(t->ns); +cleanup_ns: + SYS_NOFAIL("ip netns del %s", TEST_NS); +fail: + return 1; +} + +static void cleanup_network(struct test_data *t) +{ + bpf_tc_detach(&t->qdisc, &t->tc_attach); + bpf_tc_hook_destroy(&t->qdisc); + close_netns(t->ns); + SYS_NOFAIL("ip netns del %s", TEST_NS); +} + +static void check_ancestors_ids(struct test_data *t) +{ + __u64 expected_ids[NUM_CGROUP_LEVELS]; + char assert_name[MAX_ASSERT_NAME]; + __u32 level; + + expected_ids[0] = get_cgroup_id("/.."); /* root cgroup */ + expected_ids[1] = get_cgroup_id(""); + expected_ids[2] = get_cgroup_id(CGROUP_PATH); + expected_ids[3] = 0; /* non-existent cgroup */ + + for (level = 0; level < NUM_CGROUP_LEVELS; level++) { + snprintf(assert_name, MAX_ASSERT_NAME, + "ancestor id at level %d", level); + ASSERT_EQ(t->skel->bss->cgroup_ids[level], expected_ids[level], + assert_name); + } +} + +void test_cgroup_ancestor(void) +{ + struct test_data t; + int cgroup_fd; + + t.skel = cgroup_ancestor__open_and_load(); + if (!ASSERT_OK_PTR(t.skel, "open and load")) + return; + + t.skel->bss->dport = htons(DST_PORT); + cgroup_fd = cgroup_setup_and_join(CGROUP_PATH); + if (cgroup_fd < 0) + goto cleanup_progs; + + if (setup_network(&t)) + goto cleanup_cgroups; + + if (send_datagram()) + goto cleanup_network; + + check_ancestors_ids(&t); + +cleanup_network: + cleanup_network(&t); +cleanup_cgroups: + close(cgroup_fd); + cleanup_cgroup_environment(); +cleanup_progs: + cgroup_ancestor__destroy(t.skel); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_ancestor.c 
b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c index 4879645f5827..8c2deb4fc493 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_ancestor.c +++ b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c @@ -1,43 +1,38 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2018 Facebook -#include -#include - -#include - +#include #include - +#include +#include "bpf_tracing_net.h" #define NUM_CGROUP_LEVELS 4 -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, __u32); - __type(value, __u64); - __uint(max_entries, NUM_CGROUP_LEVELS); -} cgroup_ids SEC(".maps"); +__u64 cgroup_ids[NUM_CGROUP_LEVELS]; +__u16 dport; static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) { - __u64 id; - /* [1] &level passed to external function that may change it, it's * incompatible with loop unroll. */ - id = bpf_skb_ancestor_cgroup_id(skb, level); - bpf_map_update_elem(&cgroup_ids, &level, &id, 0); + cgroup_ids[level] = bpf_skb_ancestor_cgroup_id(skb, level); } SEC("tc") int log_cgroup_id(struct __sk_buff *skb) { - /* Loop unroll can't be used here due to [1]. Unrolling manually. - * Number of calls should be in sync with NUM_CGROUP_LEVELS. - */ - log_nth_level(skb, 0); - log_nth_level(skb, 1); - log_nth_level(skb, 2); - log_nth_level(skb, 3); + struct sock *sk = (void *)skb->sk; + + if (!sk) + return TC_ACT_OK; + + sk = bpf_core_cast(sk, struct sock); + if (sk->sk_protocol == IPPROTO_UDP && sk->sk_dport == dport) { + log_nth_level(skb, 0); + log_nth_level(skb, 1); + log_nth_level(skb, 2); + log_nth_level(skb, 3); + } return TC_ACT_OK; } diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh deleted file mode 100755 index d7dad49175c2..000000000000 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Copyright (c) 2018 Facebook - -set -eu - -wait_for_ip() -{ - local _i - echo -n "Wait for testing link-local IP to become available " - for _i in $(seq ${MAX_PING_TRIES}); do - echo -n "." - if $PING6 -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then - echo " OK" - return - fi - sleep 1 - done - echo 1>&2 "ERROR: Timeout waiting for test IP to become available." - exit 1 -} - -setup() -{ - # Create testing interfaces not to interfere with current environment. 
- ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} - ip link set ${TEST_IF} up - ip link set ${TEST_IF_PEER} up - - wait_for_ip - - tc qdisc add dev ${TEST_IF} clsact - mkdir -p /sys/fs/bpf/${BPF_PROG_PIN} - bpftool prog loadall ${BPF_PROG_OBJ} /sys/fs/bpf/${BPF_PROG_PIN} type tc - tc filter add dev ${TEST_IF} egress bpf da object-pinned \ - /sys/fs/bpf/${BPF_PROG_PIN}/${BPF_PROG_NAME} - - BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ - awk '/ id / {sub(/.* id /, "", $0); print($1)}') -} - -cleanup() -{ - ip link del ${TEST_IF} 2>/dev/null || : - ip link del ${TEST_IF_PEER} 2>/dev/null || : - rm -rf /sys/fs/bpf/${BPF_PROG_PIN} -} - -main() -{ - trap cleanup EXIT 2 3 6 15 - setup - ${PROG} ${TEST_IF} ${BPF_PROG_ID} -} - -DIR=$(dirname $0) -TEST_IF="test_cgid_1" -TEST_IF_PEER="test_cgid_2" -MAX_PING_TRIES=5 -BPF_PROG_PIN="cgroup_ancestor" -BPF_PROG_OBJ="${DIR}/${BPF_PROG_PIN}.bpf.o" -BPF_PROG_NAME="log_cgroup_id" -BPF_PROG_ID=0 -PROG="${DIR}/test_skb_cgroup_id_user" -type ping6 >/dev/null 2>&1 && PING6="ping6" || PING6="ping -6" - -main diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c deleted file mode 100644 index ed518d075d1d..000000000000 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c +++ /dev/null @@ -1,183 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include -#include -#include - -#include -#include -#include -#include -#include - - -#include -#include - -#include "cgroup_helpers.h" - -#define CGROUP_PATH "/skb_cgroup_test" -#define NUM_CGROUP_LEVELS 4 - -/* RFC 4291, Section 2.7.1 */ -#define LINKLOCAL_MULTICAST "ff02::1" - -static int mk_dst_addr(const char *ip, const char *iface, - struct sockaddr_in6 *dst) -{ - memset(dst, 0, sizeof(*dst)); - - dst->sin6_family = AF_INET6; - dst->sin6_port = htons(1025); - - if (inet_pton(AF_INET6, ip, &dst->sin6_addr) != 1) { - log_err("Invalid IPv6: %s", ip); - return -1; - } - - dst->sin6_scope_id = if_nametoindex(iface); - if (!dst->sin6_scope_id) { - log_err("Failed to get index of iface: %s", iface); - return -1; - } - - return 0; -} - -static int send_packet(const char *iface) -{ - struct sockaddr_in6 dst; - char msg[] = "msg"; - int err = 0; - int fd = -1; - - if (mk_dst_addr(LINKLOCAL_MULTICAST, iface, &dst)) - goto err; - - fd = socket(AF_INET6, SOCK_DGRAM, 0); - if (fd == -1) { - log_err("Failed to create UDP socket"); - goto err; - } - - if (sendto(fd, &msg, sizeof(msg), 0, (const struct sockaddr *)&dst, - sizeof(dst)) == -1) { - log_err("Failed to send datagram"); - goto err; - } - - goto out; -err: - err = -1; -out: - if (fd >= 0) - close(fd); - return err; -} - -int get_map_fd_by_prog_id(int prog_id) -{ - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - __u32 map_ids[1]; - int prog_fd = -1; - int map_fd = -1; - - prog_fd = bpf_prog_get_fd_by_id(prog_id); - if (prog_fd < 0) { - log_err("Failed to get fd by prog id %d", prog_id); - goto err; - } - - info.nr_map_ids = 1; - info.map_ids = (__u64) (unsigned long) map_ids; - - if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len)) { - log_err("Failed to get info by prog fd %d", prog_fd); - goto err; - } - - if (!info.nr_map_ids) { - log_err("No maps found for prog fd %d", prog_fd); - goto err; - } - - map_fd = bpf_map_get_fd_by_id(map_ids[0]); - if (map_fd < 0) - log_err("Failed to get fd by map id %d", map_ids[0]); -err: - if (prog_fd >= 0) - close(prog_fd); - return map_fd; -} - -int check_ancestor_cgroup_ids(int prog_id) -{ - 
__u64 actual_ids[NUM_CGROUP_LEVELS], expected_ids[NUM_CGROUP_LEVELS]; - __u32 level; - int err = 0; - int map_fd; - - expected_ids[0] = get_cgroup_id("/.."); /* root cgroup */ - expected_ids[1] = get_cgroup_id(""); - expected_ids[2] = get_cgroup_id(CGROUP_PATH); - expected_ids[3] = 0; /* non-existent cgroup */ - - map_fd = get_map_fd_by_prog_id(prog_id); - if (map_fd < 0) - goto err; - - for (level = 0; level < NUM_CGROUP_LEVELS; ++level) { - if (bpf_map_lookup_elem(map_fd, &level, &actual_ids[level])) { - log_err("Failed to lookup key %d", level); - goto err; - } - if (actual_ids[level] != expected_ids[level]) { - log_err("%llx (actual) != %llx (expected), level: %u\n", - actual_ids[level], expected_ids[level], level); - goto err; - } - } - - goto out; -err: - err = -1; -out: - if (map_fd >= 0) - close(map_fd); - return err; -} - -int main(int argc, char **argv) -{ - int cgfd = -1; - int err = 0; - - if (argc < 3) { - fprintf(stderr, "Usage: %s iface prog_id\n", argv[0]); - exit(EXIT_FAILURE); - } - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - cgfd = cgroup_setup_and_join(CGROUP_PATH); - if (cgfd < 0) - goto err; - - if (send_packet(argv[1])) - goto err; - - if (check_ancestor_cgroup_ids(atoi(argv[2]))) - goto err; - - goto out; -err: - err = -1; -out: - close(cgfd); - cleanup_cgroup_environment(); - printf("[%s]\n", err ? "FAIL" : "PASS"); - return err; -} From 6c569b77f0300f8a9960277c7094fa0f128eb811 Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Wed, 14 Aug 2024 13:37:43 +0530 Subject: [PATCH 0895/1052] selftest: af_unix: Fix kselftest compilation warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change expected_buf from (const void *) to (const char *) in function __recvpair(). 
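For illustration, here is a minimal stand-alone sketch (not part of the selftest; printf() stands in for the harness' TH_LOG()) of the type mismatch behind the warnings quoted below: '%s' requires a pointer to char, so handing it a const void * argument makes the compiler complain, while the const char * variant is clean.

```
/* Illustrative only. Build with: gcc -Wall -Wformat fmt.c */
#include <stdio.h>

/* Old shape of the parameter: '%s' with a 'const void *' argument
 * produces a -Wformat warning, just like the TH_LOG() calls did.
 */
static void log_expected_old(const void *expected_buf)
{
	printf("Expected '%s'\n", expected_buf);	/* warns */
}

/* New shape of the parameter: the argument type now matches '%s'. */
static void log_expected_new(const char *expected_buf)
{
	printf("Expected '%s'\n", expected_buf);	/* clean */
}

int main(void)
{
	log_expected_old("hello");
	log_expected_new("world");
	return 0;
}
```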
This change fixes the below warnings during test compilation: ``` In file included from msg_oob.c:14: msg_oob.c: In function ‘__recvpair’: ../../kselftest_harness.h:106:40: warning: format ‘%s’ expects argument of type ‘char *’,but argument 6 has type ‘const void *’ [-Wformat=] ../../kselftest_harness.h:101:17: note: in expansion of macro ‘__TH_LOG’ msg_oob.c:235:17: note: in expansion of macro ‘TH_LOG’ ../../kselftest_harness.h:106:40: warning: format ‘%s’ expects argument of type ‘char *’,but argument 6 has type ‘const void *’ [-Wformat=] ../../kselftest_harness.h:101:17: note: in expansion of macro ‘__TH_LOG’ msg_oob.c:259:25: note: in expansion of macro ‘TH_LOG’ ``` Fixes: d098d77232c3 ("selftest: af_unix: Add msg_oob.c.") Signed-off-by: Abhinav Jain Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240814080743.1156166-1-jain.abhinav177@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/af_unix/msg_oob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 16d0c172eaeb..535eb2c3d7d1 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -209,7 +209,7 @@ static void __sendpair(struct __test_metadata *_metadata, static void __recvpair(struct __test_metadata *_metadata, FIXTURE_DATA(msg_oob) *self, - const void *expected_buf, int expected_len, + const char *expected_buf, int expected_len, int buf_len, int flags) { int i, ret[2], recv_errno[2], expected_errno = 0; From 0863bffda1131fd2fa9c05b653ad9ee3d8db127e Mon Sep 17 00:00:00 2001 From: Griffin Kroah-Hartman Date: Wed, 14 Aug 2024 13:17:47 +0200 Subject: [PATCH 0896/1052] Revert "serial: 8250_omap: Set the console genpd always on if no console suspend" This reverts commit 68e6939ea9ec3d6579eadeab16060339cdeaf940. Kevin reported that this causes a crash during suspend on platforms that dont use PM domains. Link: https://lore.kernel.org/r/7ha5hgpchq.fsf@baylibre.com Cc: Thomas Richard Fixes: 68e6939ea9ec ("serial: 8250_omap: Set the console genpd always on if no console suspend") Cc: stable Reported-by: Kevin Hilman Signed-off-by: Griffin Kroah-Hartman Link: https://lore.kernel.org/r/20240814111747.82371-1-griffin@kroah.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_omap.c | 33 +++++------------------------ 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index 1af9aed99c65..afef1dd4ddf4 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -27,7 +27,6 @@ #include #include #include -#include #include "8250.h" @@ -119,12 +118,6 @@ #define UART_OMAP_TO_L 0x26 #define UART_OMAP_TO_H 0x27 -/* - * Copy of the genpd flags for the console. 
- * Only used if console suspend is disabled - */ -static unsigned int genpd_flags_console; - struct omap8250_priv { void __iomem *membase; int line; @@ -1655,7 +1648,6 @@ static int omap8250_suspend(struct device *dev) { struct omap8250_priv *priv = dev_get_drvdata(dev); struct uart_8250_port *up = serial8250_get_port(priv->line); - struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain); int err = 0; serial8250_suspend_port(priv->line); @@ -1666,19 +1658,8 @@ static int omap8250_suspend(struct device *dev) if (!device_may_wakeup(dev)) priv->wer = 0; serial_out(up, UART_OMAP_WER, priv->wer); - if (uart_console(&up->port)) { - if (console_suspend_enabled) - err = pm_runtime_force_suspend(dev); - else { - /* - * The pd shall not be powered-off (no console suspend). - * Make copy of genpd flags before to set it always on. - * The original value is restored during the resume. - */ - genpd_flags_console = genpd->flags; - genpd->flags |= GENPD_FLAG_ALWAYS_ON; - } - } + if (uart_console(&up->port) && console_suspend_enabled) + err = pm_runtime_force_suspend(dev); flush_work(&priv->qos_work); return err; @@ -1688,16 +1669,12 @@ static int omap8250_resume(struct device *dev) { struct omap8250_priv *priv = dev_get_drvdata(dev); struct uart_8250_port *up = serial8250_get_port(priv->line); - struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain); int err; if (uart_console(&up->port) && console_suspend_enabled) { - if (console_suspend_enabled) { - err = pm_runtime_force_resume(dev); - if (err) - return err; - } else - genpd->flags = genpd_flags_console; + err = pm_runtime_force_resume(dev); + if (err) + return err; } serial8250_resume_port(priv->line); From f75c235565f90c4a17b125e47f1c68ef6b8c2bce Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 14 Aug 2024 02:09:53 -0700 Subject: [PATCH 0897/1052] arm64: Fix KASAN random tag seed initialization Currently, kasan_init_sw_tags() is called before setup_per_cpu_areas(), so per_cpu(prng_state, cpu) accesses the same address regardless of the value of "cpu", and the same seed value gets copied to the percpu area for every CPU. Fix this by moving the call to smp_prepare_boot_cpu(), which is the first architecture hook after setup_per_cpu_areas(). Fixes: 3c9e3aa11094 ("kasan: add tag related helper functions") Fixes: 3f41b6093823 ("kasan: fix random seed generation for tag-based mode") Signed-off-by: Samuel Holland Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/r/20240814091005.969756-1-samuel.holland@sifive.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/setup.c | 3 --- arch/arm64/kernel/smp.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index a096e2451044..b22d28ec8028 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -355,9 +355,6 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); - /* Init percpu seeds for random tags after cpus are set up. */ - kasan_init_sw_tags(); - #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 5e18fbcee9a2..f01f0fd7b7fe 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -467,6 +467,8 @@ void __init smp_prepare_boot_cpu(void) init_gic_priority_masking(); kasan_init_hw_tags(); + /* Init percpu seeds for random tags after cpus are set up. 
*/ + kasan_init_sw_tags(); } /* From 69139d2919dd4aa9a553c8245e7c63e82613e3fc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 11 Aug 2024 19:21:53 -0700 Subject: [PATCH 0898/1052] vsock: fix recursive ->recvmsg calls After a vsock socket has been added to a BPF sockmap, its prot->recvmsg has been replaced with vsock_bpf_recvmsg(). Thus the following recursiion could happen: vsock_bpf_recvmsg() -> __vsock_recvmsg() -> vsock_connectible_recvmsg() -> prot->recvmsg() -> vsock_bpf_recvmsg() again We need to fix it by calling the original ->recvmsg() without any BPF sockmap logic in __vsock_recvmsg(). Fixes: 634f1a7110b4 ("vsock: support sockmap") Reported-by: syzbot+bdb4bd87b5e22058e2a4@syzkaller.appspotmail.com Tested-by: syzbot+bdb4bd87b5e22058e2a4@syzkaller.appspotmail.com Cc: Bobby Eshleman Cc: Michael S. Tsirkin Cc: Stefano Garzarella Signed-off-by: Cong Wang Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20240812022153.86512-1-xiyou.wangcong@gmail.com Signed-off-by: Paolo Abeni --- include/net/af_vsock.h | 4 ++++ net/vmw_vsock/af_vsock.c | 50 +++++++++++++++++++++++---------------- net/vmw_vsock/vsock_bpf.c | 4 ++-- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 535701efc1e5..24d970f7a4fa 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -230,8 +230,12 @@ struct vsock_tap { int vsock_add_tap(struct vsock_tap *vt); int vsock_remove_tap(struct vsock_tap *vt); void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); +int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags); int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); +int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags); int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 4b040285aa78..0ff9b2dd86ba 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1270,25 +1270,28 @@ static int vsock_dgram_connect(struct socket *sock, return err; } +int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct vsock_sock *vsk = vsock_sk(sk); + + return vsk->transport->dgram_dequeue(vsk, msg, len, flags); +} + int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { #ifdef CONFIG_BPF_SYSCALL + struct sock *sk = sock->sk; const struct proto *prot; -#endif - struct vsock_sock *vsk; - struct sock *sk; - sk = sock->sk; - vsk = vsock_sk(sk); - -#ifdef CONFIG_BPF_SYSCALL prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) return prot->recvmsg(sk, msg, len, flags, NULL); #endif - return vsk->transport->dgram_dequeue(vsk, msg, len, flags); + return __vsock_dgram_recvmsg(sock, msg, len, flags); } EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); @@ -2174,15 +2177,12 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, } int -vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +__vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; -#ifdef CONFIG_BPF_SYSCALL - const struct proto *prot; -#endif int err; sk = sock->sk; @@ -2233,14 +2233,6 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, 
size_t len, goto out; } -#ifdef CONFIG_BPF_SYSCALL - prot = READ_ONCE(sk->sk_prot); - if (prot != &vsock_proto) { - release_sock(sk); - return prot->recvmsg(sk, msg, len, flags, NULL); - } -#endif - if (sk->sk_type == SOCK_STREAM) err = __vsock_stream_recvmsg(sk, msg, len, flags); else @@ -2250,6 +2242,22 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, release_sock(sk); return err; } + +int +vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ +#ifdef CONFIG_BPF_SYSCALL + struct sock *sk = sock->sk; + const struct proto *prot; + + prot = READ_ONCE(sk->sk_prot); + if (prot != &vsock_proto) + return prot->recvmsg(sk, msg, len, flags, NULL); +#endif + + return __vsock_connectible_recvmsg(sock, msg, len, flags); +} EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg); static int vsock_set_rcvlowat(struct sock *sk, int val) diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index a3c97546ab84..c42c5cc18f32 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -64,9 +64,9 @@ static int __vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int int err; if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) - err = vsock_connectible_recvmsg(sock, msg, len, flags); + err = __vsock_connectible_recvmsg(sock, msg, len, flags); else if (sk->sk_type == SOCK_DGRAM) - err = vsock_dgram_recvmsg(sock, msg, len, flags); + err = __vsock_dgram_recvmsg(sock, msg, len, flags); else err = -EPROTOTYPE; From fde25c20f51807db340b875953cfd1cedaa392fc Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Mon, 12 Aug 2024 17:08:24 +0300 Subject: [PATCH 0899/1052] net: ethtool: Allow write mechanism of LPL and both LPL and EPL CMIS 5.2 standard section 9.4.2 defines four types of firmware update supported mechanism: None, only LPL, only EPL, both LPL and EPL. Currently, only LPL (Local Payload) type of write firmware block is supported. However, if the module supports both LPL and EPL the flashing process wrongly fails for no supporting LPL. Fix that, by allowing the write mechanism to be LPL or both LPL and EPL. 
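For illustration, a small user-space sketch (not the ethtool code; the two constants are the ones introduced by this patch) of the relaxed check: any advertised mechanism that includes LPL is acceptable.

```
/* Illustrative only. */
#include <stdbool.h>
#include <stdio.h>

#define CMIS_CDB_FW_WRITE_MECHANISM_LPL		0x01
#define CMIS_CDB_FW_WRITE_MECHANISM_BOTH	0x11

static bool write_mechanism_supported(unsigned char write_mechanism)
{
	/* The old check accepted only LPL (0x01) and therefore rejected
	 * modules advertising both LPL and EPL (0x11).
	 */
	return write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL ||
	       write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_BOTH;
}

int main(void)
{
	printf("0x01 (LPL only) : %d\n", write_mechanism_supported(0x01));	/* 1 */
	printf("0x10 (no LPL)   : %d\n", write_mechanism_supported(0x10));	/* 0 */
	printf("0x11 (LPL + EPL): %d\n", write_mechanism_supported(0x11));	/* 1 */
	return 0;
}
```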
Fixes: c4f78134d45c ("ethtool: cmis_fw_update: add a layer for supporting firmware update using CDB") Reported-by: Vladyslav Mykhaliuk Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Link: https://patch.msgid.link/20240812140824.3718826-1-danieller@nvidia.com Signed-off-by: Paolo Abeni --- net/ethtool/cmis_fw_update.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c index ae4b4b28a601..655ff5224ffa 100644 --- a/net/ethtool/cmis_fw_update.c +++ b/net/ethtool/cmis_fw_update.c @@ -35,7 +35,10 @@ struct cmis_cdb_fw_mng_features_rpl { __be16 resv7; }; -#define CMIS_CDB_FW_WRITE_MECHANISM_LPL 0x01 +enum cmis_cdb_fw_write_mechanism { + CMIS_CDB_FW_WRITE_MECHANISM_LPL = 0x01, + CMIS_CDB_FW_WRITE_MECHANISM_BOTH = 0x11, +}; static int cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, @@ -64,7 +67,8 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, } rpl = (struct cmis_cdb_fw_mng_features_rpl *)args.req.payload; - if (!(rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL)) { + if (!(rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL || + rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_BOTH)) { ethnl_module_fw_flash_ntf_err(dev, ntf_params, "Write LPL is not supported", NULL); From 1f1b194284093d619c9fbc7e9e38b2c68d0408e8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 12 Aug 2024 15:13:22 +0100 Subject: [PATCH 0900/1052] net: thunder_bgx: Fix netdev structure allocation Commit 94833addfaba ("net: thunderx: Unembed netdev structure") had a go at dynamically allocating the netdev structures for the thunderx_bgx driver. This change results in my ThunderX box catching fire (to be fair, it is what it does best). The issues with this change are that: - bgx_lmac_enable() is called *after* bgx_acpi_register_phy() and bgx_init_of_phy(), both expecting netdev to be a valid pointer. - bgx_init_of_phy() populates the MAC addresses for *all* LMACs attached to a given BGX instance, and thus needs netdev for each of them to have been allocated. There is a few things to be said about how the driver mixes LMAC and BGX states which leads to this sorry state, but that's beside the point. To address this, go back to a situation where all netdev structures are allocated before the driver starts relying on them, and move the freeing of these structures to driver removal. Someone brave enough can always go and restructure the driver if they want. Fixes: 94833addfaba ("net: thunderx: Unembed netdev structure") Signed-off-by: Marc Zyngier Cc: Breno Leitao Cc: Sunil Goutham Cc: "David S. 
Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Reviewed-by: Simon Horman Reviewed-by: Breno Leitao Link: https://patch.msgid.link/20240812141322.1742918-1-maz@kernel.org Signed-off-by: Paolo Abeni --- .../net/ethernet/cavium/thunder/thunder_bgx.c | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index a40c266c37f2..608cc6af5af1 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -1054,18 +1054,12 @@ static int phy_interface_mode(u8 lmac_type) static int bgx_lmac_enable(struct bgx *bgx, u8 lmacid) { - struct lmac *lmac, **priv; + struct lmac *lmac; u64 cfg; lmac = &bgx->lmac[lmacid]; lmac->bgx = bgx; - lmac->netdev = alloc_netdev_dummy(sizeof(struct lmac *)); - if (!lmac->netdev) - return -ENOMEM; - priv = netdev_priv(lmac->netdev); - *priv = lmac; - if ((lmac->lmac_type == BGX_MODE_SGMII) || (lmac->lmac_type == BGX_MODE_QSGMII) || (lmac->lmac_type == BGX_MODE_RGMII)) { @@ -1191,7 +1185,6 @@ static void bgx_lmac_disable(struct bgx *bgx, u8 lmacid) (lmac->lmac_type != BGX_MODE_10G_KR) && lmac->phydev) phy_disconnect(lmac->phydev); - free_netdev(lmac->netdev); lmac->phydev = NULL; } @@ -1653,6 +1646,23 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) bgx_get_qlm_mode(bgx); + for (lmac = 0; lmac < bgx->lmac_count; lmac++) { + struct lmac *lmacp, **priv; + + lmacp = &bgx->lmac[lmac]; + lmacp->netdev = alloc_netdev_dummy(sizeof(struct lmac *)); + + if (!lmacp->netdev) { + for (int i = 0; i < lmac; i++) + free_netdev(bgx->lmac[i].netdev); + err = -ENOMEM; + goto err_enable; + } + + priv = netdev_priv(lmacp->netdev); + *priv = lmacp; + } + err = bgx_init_phy(bgx); if (err) goto err_enable; @@ -1692,8 +1702,10 @@ static void bgx_remove(struct pci_dev *pdev) u8 lmac; /* Disable all LMACs */ - for (lmac = 0; lmac < bgx->lmac_count; lmac++) + for (lmac = 0; lmac < bgx->lmac_count; lmac++) { bgx_lmac_disable(bgx, lmac); + free_netdev(bgx->lmac[lmac].netdev); + } pci_free_irq(pdev, GMPX_GMI_TX_INT, bgx); From fd45cc614b8acca5bb435ba37fe9b3f9a17fab84 Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Mon, 5 Aug 2024 13:08:56 +0200 Subject: [PATCH 0901/1052] drm/rockchip: inno-hdmi: Fix infoframe upload HDMI analyser shows that the AVI infoframe is no being longer send. The switch to the HDMI connector api should have used the frame content which is now given in the buffer parameter, but instead still uses the (now) empty and superfluous packed_frame variable. Fix it. 
Fixes: 65548c8ff0ab ("drm/rockchip: inno_hdmi: Switch to HDMI connector") Signed-off-by: Alex Bee Acked-by: Maxime Ripard Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20240805110855.274140-2-knaerzche@gmail.com --- drivers/gpu/drm/rockchip/inno_hdmi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/rockchip/inno_hdmi.c b/drivers/gpu/drm/rockchip/inno_hdmi.c index 2241e53a2946..dec6913cec5b 100644 --- a/drivers/gpu/drm/rockchip/inno_hdmi.c +++ b/drivers/gpu/drm/rockchip/inno_hdmi.c @@ -279,7 +279,6 @@ static int inno_hdmi_upload_frame(struct drm_connector *connector, const u8 *buffer, size_t len) { struct inno_hdmi *hdmi = connector_to_inno_hdmi(connector); - u8 packed_frame[HDMI_MAXIMUM_INFO_FRAME_SIZE]; ssize_t i; if (type != HDMI_INFOFRAME_TYPE_AVI) { @@ -291,8 +290,7 @@ static int inno_hdmi_upload_frame(struct drm_connector *connector, inno_hdmi_disable_frame(connector, type); for (i = 0; i < len; i++) - hdmi_writeb(hdmi, HDMI_CONTROL_PACKET_ADDR + i, - packed_frame[i]); + hdmi_writeb(hdmi, HDMI_CONTROL_PACKET_ADDR + i, buffer[i]); return 0; } From cdc90f75387c42d64a0ed1ba03550ea9447249d4 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 13 Aug 2024 09:37:19 +0200 Subject: [PATCH 0902/1052] pse-core: Conditionally set current limit during PI regulator registration Fix an issue where `devm_regulator_register()` would fail for PSE controllers that do not support current limit control, such as simple GPIO-based controllers like the podl-pse-regulator. The `REGULATOR_CHANGE_CURRENT` flag and `max_uA` constraint are now conditionally set only if the `pi_set_current_limit` operation is supported. This change prevents the regulator registration routine from attempting to call `pse_pi_set_current_limit()`, which would return `-EOPNOTSUPP` and cause the registration to fail. Fixes: 4a83abcef5f4f ("net: pse-pd: Add new power limit get and set c33 features") Signed-off-by: Oleksij Rempel Reviewed-by: Kory Maincent Tested-by: Kyle Swenson Link: https://patch.msgid.link/20240813073719.2304633-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/pse-pd/pse_core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c index ec20953e0f82..4f032b16a8a0 100644 --- a/drivers/net/pse-pd/pse_core.c +++ b/drivers/net/pse-pd/pse_core.c @@ -401,9 +401,14 @@ devm_pse_pi_regulator_register(struct pse_controller_dev *pcdev, rdesc->ops = &pse_pi_ops; rdesc->owner = pcdev->owner; - rinit_data->constraints.valid_ops_mask = REGULATOR_CHANGE_STATUS | - REGULATOR_CHANGE_CURRENT; - rinit_data->constraints.max_uA = MAX_PI_CURRENT; + rinit_data->constraints.valid_ops_mask = REGULATOR_CHANGE_STATUS; + + if (pcdev->ops->pi_set_current_limit) { + rinit_data->constraints.valid_ops_mask |= + REGULATOR_CHANGE_CURRENT; + rinit_data->constraints.max_uA = MAX_PI_CURRENT; + } + rinit_data->supply_regulator = "vpwr"; rconfig.dev = pcdev->dev; From 7965a7f32a53d9ad807ce2c53bdda69ba104974f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 13 Aug 2024 15:39:34 +0200 Subject: [PATCH 0903/1052] selftests: net: lib: kill PIDs before del netns When deleting netns, it is possible to still have some tasks running, e.g. background tasks like tcpdump running in the background, not stopped because the test has been interrupted. Before deleting the netns, it is then safer to kill all attached PIDs, if any. 
That should reduce some noise after the end of some tests, and help with
the debugging of some issues. That's why this modification is seen as a
"fix".

Fixes: 25ae948b4478 ("selftests/net: add lib.sh")
Acked-by: Mat Martineau
Signed-off-by: Matthieu Baerts (NGI0)
Acked-by: Florian Westphal
Reviewed-by: Hangbin Liu
Link: https://patch.msgid.link/20240813-upstream-net-20240813-selftests-net-lib-kill-v1-1-27b689b248b8@kernel.org
Signed-off-by: Paolo Abeni
---
 tools/testing/selftests/net/lib.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index d0219032f773..8ee4489238ca 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -146,6 +146,7 @@ cleanup_ns()
 
 	for ns in "$@"; do
 		[ -z "${ns}" ] && continue
+		ip netns pids "${ns}" 2> /dev/null | xargs -r kill || true
 		ip netns delete "${ns}" &> /dev/null || true
 		if ! busywait $BUSYWAIT_TIMEOUT ip netns list \| grep -vq "^$ns$" &> /dev/null; then
 			echo "Warn: Failed to remove namespace $ns"

From 8445d9d3c03101859663d34fda747f6a50947556 Mon Sep 17 00:00:00 2001
From: Jie Wang
Date: Tue, 13 Aug 2024 22:10:20 +0800
Subject: [PATCH 0904/1052] net: hns3: fix wrong use of semaphore up

Currently, if the hns3 PF or VF FLR reset has failed after five retries,
the reset done process will directly release the semaphore, which has
already been released in hclge_reset_prepare_general. This will cause the
down operation to fail.

So this patch fixes it by adding a reset state check. The up operation is
only called after a successful PF FLR reset.

Fixes: 8627bdedc435 ("net: hns3: refactor the precedure of PF FLR")
Fixes: f28368bb4542 ("net: hns3: refactor the procedure of VF FLR")
Signed-off-by: Jie Wang
Signed-off-by: Jijie Shao
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++--
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 82574ce0194f..125e04434611 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -11516,8 +11516,8 @@ static void hclge_reset_done(struct hnae3_ae_dev *ae_dev)
 		dev_err(&hdev->pdev->dev, "fail to rebuild, ret=%d\n", ret);
 
 	hdev->reset_type = HNAE3_NONE_RESET;
-	clear_bit(HCLGE_STATE_RST_HANDLING, &hdev->state);
-	up(&hdev->reset_sem);
+	if (test_and_clear_bit(HCLGE_STATE_RST_HANDLING, &hdev->state))
+		up(&hdev->reset_sem);
 }
 
 static void hclge_clear_resetting_state(struct hclge_dev *hdev)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 3735d2fed11f..094a7c7b5592 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1747,8 +1747,8 @@ static void hclgevf_reset_done(struct hnae3_ae_dev *ae_dev)
 			 ret);
 
 	hdev->reset_type = HNAE3_NONE_RESET;
-	clear_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state);
-	up(&hdev->reset_sem);
+	if (test_and_clear_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state))
+		up(&hdev->reset_sem);
 }
 
 static u32 hclgevf_get_fw_version(struct hnae3_handle *handle)

From 30545e17eac1f50c5ef49644daf6af205100a965 Mon Sep 17 00:00:00 2001
From: Peiyang Wang
Date: Tue, 13 Aug 2024 22:10:21 +0800
Subject: [PATCH 0905/1052] net: hns3: use the user's cfg after reset
Consider the followed case that the user change speed and reset the net interface. Before the hw change speed successfully, the driver get old old speed from hw by timer task. After reset, the previous speed is config to hw. As a result, the new speed is configed successfully but lost after PF reset. The followed pictured shows more dirrectly. +------+ +----+ +----+ | USER | | PF | | HW | +---+--+ +-+--+ +-+--+ | ethtool -s 100G | | +------------------>| set speed 100G | | +--------------------->| | | set successfully | | |<---------------------+---+ | |query cfg (timer task)| | | +--------------------->| | handle speed | | return 200G | | changing event | ethtool --reset |<---------------------+ | (100G) +------------------>| cfg previous speed |<--+ | | after reset (200G) | | +--------------------->| | | +---+ | |query cfg (timer task)| | | +--------------------->| | handle speed | | return 100G | | changing event | |<---------------------+ | (200G) | | |<--+ | |query cfg (timer task)| | +--------------------->| | | return 200G | | |<---------------------+ | | | v v v This patch save new speed if hw change speed successfully, which will be used after reset successfully. Fixes: 2d03eacc0b7e ("net: hns3: Only update mac configuation when necessary") Signed-off-by: Peiyang Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni --- .../hisilicon/hns3/hns3pf/hclge_main.c | 24 ++++++++++++++----- .../hisilicon/hns3/hns3pf/hclge_mdio.c | 3 +++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 125e04434611..465f0d582283 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2653,8 +2653,17 @@ static int hclge_cfg_mac_speed_dup_h(struct hnae3_handle *handle, int speed, { struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_dev *hdev = vport->back; + int ret; - return hclge_cfg_mac_speed_dup(hdev, speed, duplex, lane_num); + ret = hclge_cfg_mac_speed_dup(hdev, speed, duplex, lane_num); + + if (ret) + return ret; + + hdev->hw.mac.req_speed = speed; + hdev->hw.mac.req_duplex = duplex; + + return 0; } static int hclge_set_autoneg_en(struct hclge_dev *hdev, bool enable) @@ -2956,17 +2965,20 @@ static int hclge_mac_init(struct hclge_dev *hdev) if (!test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state)) hdev->hw.mac.duplex = HCLGE_MAC_FULL; - ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.speed, - hdev->hw.mac.duplex, hdev->hw.mac.lane_num); - if (ret) - return ret; - if (hdev->hw.mac.support_autoneg) { ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.autoneg); if (ret) return ret; } + if (!hdev->hw.mac.autoneg) { + ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.req_speed, + hdev->hw.mac.req_duplex, + hdev->hw.mac.lane_num); + if (ret) + return ret; + } + mac->link = 0; if (mac->user_fec_mode & BIT(HNAE3_FEC_USER_DEF)) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c index 85fb11de43a1..80079657afeb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c @@ -191,6 +191,9 @@ static void hclge_mac_adjust_link(struct net_device *netdev) if (ret) netdev_err(netdev, "failed to adjust link.\n"); + hdev->hw.mac.req_speed = (u32)speed; + hdev->hw.mac.req_duplex = (u8)duplex; + ret = hclge_cfg_flowctrl(hdev); if (ret) 
netdev_err(netdev, "failed to configure flow control.\n"); From be5e816d00a506719e9dbb1a9c861c5ced30a109 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Tue, 13 Aug 2024 22:10:22 +0800 Subject: [PATCH 0906/1052] net: hns3: fix a deadlock problem when config TC during resetting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When config TC during the reset process, may cause a deadlock, the flow is as below: pf reset start │ ▼ ...... setup tc │ │ ▼ ▼ DOWN: napi_disable() napi_disable()(skip) │ │ │ ▼ ▼ ...... ...... │ │ ▼ │ napi_enable() │ ▼ UINIT: netif_napi_del() │ ▼ ...... │ ▼ INIT: netif_napi_add() │ ▼ ...... global reset start │ │ ▼ ▼ UP: napi_enable()(skip) ...... │ │ ▼ ▼ ...... napi_disable() In reset process, the driver will DOWN the port and then UINIT, in this case, the setup tc process will UP the port before UINIT, so cause the problem. Adds a DOWN process in UINIT to fix it. Fixes: bb6b94a896d4 ("net: hns3: Add reset interface implementation in client") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index a5fc0209d628..4cbc4d069a1f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -5724,6 +5724,9 @@ static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) struct net_device *netdev = handle->kinfo.netdev; struct hns3_nic_priv *priv = netdev_priv(netdev); + if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) + hns3_nic_net_stop(netdev); + if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { netdev_warn(netdev, "already uninitialized\n"); return 0; From 86db7bfb06704ef17340eeae71c832f21cfce35c Mon Sep 17 00:00:00 2001 From: Peiyang Wang Date: Tue, 13 Aug 2024 22:10:23 +0800 Subject: [PATCH 0907/1052] net: hns3: void array out of bound when loop tnl_num When query reg inf of SSU, it loops tnl_num times. However, tnl_num comes from hardware and the length of array is a fixed value. 
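The pattern behind the fix, as a stand-alone user-space sketch (hypothetical names, not the hns3 code): clamp a device-provided count to the capacity of the local array before using it as a loop bound.

```
/* Illustrative only. */
#include <stdio.h>

#define PARA_ARRAY_MAX_SIZE	8

static unsigned int min_u32(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

static void fill_loop_para(unsigned int tnl_num_from_hw)
{
	unsigned int loop_para[PARA_ARRAY_MAX_SIZE] = {0};
	unsigned int loop_time, j;

	/* Never trust a count reported by hardware as an index bound. */
	loop_time = min_u32(tnl_num_from_hw, PARA_ARRAY_MAX_SIZE);

	for (j = 0; j < loop_time; j++)
		loop_para[j] = j + 1;

	printf("hw reported %u, looped %u times\n", tnl_num_from_hw, loop_time);
}

int main(void)
{
	fill_loop_para(3);	/* within bounds */
	fill_loop_para(32);	/* would overflow the array without the clamp */
	return 0;
}
```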
To void array out of bound, make sure the loop time is not greater than the length of array Signed-off-by: Peiyang Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index e132c2f09560..cc7f46c0b35f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -1598,8 +1598,7 @@ static void hclge_query_reg_info_of_ssu(struct hclge_dev *hdev) { u32 loop_para[HCLGE_MOD_MSG_PARA_ARRAY_MAX_SIZE] = {0}; struct hclge_mod_reg_common_msg msg; - u8 i, j, num; - u32 loop_time; + u8 i, j, num, loop_time; num = ARRAY_SIZE(hclge_ssu_reg_common_msg); for (i = 0; i < num; i++) { @@ -1609,7 +1608,8 @@ static void hclge_query_reg_info_of_ssu(struct hclge_dev *hdev) loop_time = 1; loop_para[0] = 0; if (msg.need_para) { - loop_time = hdev->ae_dev->dev_specs.tnl_num; + loop_time = min(hdev->ae_dev->dev_specs.tnl_num, + HCLGE_MOD_MSG_PARA_ARRAY_MAX_SIZE); for (j = 0; j < loop_time; j++) loop_para[j] = j + 1; } From 7660833d217528c8f2385528951ab820a031e4e3 Mon Sep 17 00:00:00 2001 From: Peiyang Wang Date: Tue, 13 Aug 2024 22:10:24 +0800 Subject: [PATCH 0908/1052] net: hns3: use correct release function during uninitialization pci_request_regions is called to apply for PCI I/O and memory resources when the driver is initialized, Therefore, when the driver is uninstalled, pci_release_regions should be used to release PCI I/O and memory resources instead of pci_release_mem_regions is used to release memory reasouces only. Signed-off-by: Peiyang Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 465f0d582283..6c33195a1168 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11456,7 +11456,7 @@ static void hclge_pci_uninit(struct hclge_dev *hdev) pcim_iounmap(pdev, hdev->hw.hw.io_base); pci_free_irq_vectors(pdev); - pci_release_mem_regions(pdev); + pci_release_regions(pdev); pci_disable_device(pdev); } From ddeb7989a98faf8da67ac613731a0eee32667b7d Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Wed, 17 Jul 2024 07:04:28 -0700 Subject: [PATCH 0909/1052] drm/xe: Validate user fence during creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fail invalid addresses during user fence creation. 
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240717140429.1396820-1-matthew.brost@intel.com (cherry picked from commit 0fde907da2d5fd4da68845e96c6842497159c858) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_sync.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c index c4e018aa2982..e8d31e010860 100644 --- a/drivers/gpu/drm/xe/xe_sync.c +++ b/drivers/gpu/drm/xe/xe_sync.c @@ -53,14 +53,18 @@ static struct xe_user_fence *user_fence_create(struct xe_device *xe, u64 addr, u64 value) { struct xe_user_fence *ufence; + u64 __user *ptr = u64_to_user_ptr(addr); + + if (!access_ok(ptr, sizeof(ptr))) + return ERR_PTR(-EFAULT); ufence = kmalloc(sizeof(*ufence), GFP_KERNEL); if (!ufence) - return NULL; + return ERR_PTR(-ENOMEM); ufence->xe = xe; kref_init(&ufence->refcount); - ufence->addr = u64_to_user_ptr(addr); + ufence->addr = ptr; ufence->value = value; ufence->mm = current->mm; mmgrab(ufence->mm); @@ -183,8 +187,8 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, } else { sync->ufence = user_fence_create(xe, sync_in.addr, sync_in.timeline_value); - if (XE_IOCTL_DBG(xe, !sync->ufence)) - return -ENOMEM; + if (XE_IOCTL_DBG(xe, IS_ERR(sync->ufence))) + return PTR_ERR(sync->ufence); } break; From e98a032c0340d45c199f4eb536359f5762a8748f Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 18 Jul 2024 14:05:45 -0700 Subject: [PATCH 0910/1052] drm/xe: Move part of xe_file cleanup to a helper In order to make xe_file ref counted, move destruction of xe_file members to a helper. 
v2: Move xe_vm_close_and_put back into xe_file_close (Matt) Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240718210548.3580382-2-umesh.nerlige.ramappa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 3d0c4a62cc553c6ffde4cb11620eba991e770665) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 76109415eba6..452dbc1b495b 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -90,9 +90,25 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file) return 0; } +static void xe_file_destroy(struct xe_file *xef) +{ + struct xe_device *xe = xef->xe; + + xa_destroy(&xef->exec_queue.xa); + mutex_destroy(&xef->exec_queue.lock); + xa_destroy(&xef->vm.xa); + mutex_destroy(&xef->vm.lock); + + spin_lock(&xe->clients.lock); + xe->clients.count--; + spin_unlock(&xe->clients.lock); + + xe_drm_client_put(xef->client); + kfree(xef); +} + static void xe_file_close(struct drm_device *dev, struct drm_file *file) { - struct xe_device *xe = to_xe_device(dev); struct xe_file *xef = file->driver_priv; struct xe_vm *vm; struct xe_exec_queue *q; @@ -108,21 +124,12 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file) xe_exec_queue_kill(q); xe_exec_queue_put(q); } - xa_destroy(&xef->exec_queue.xa); - mutex_destroy(&xef->exec_queue.lock); mutex_lock(&xef->vm.lock); xa_for_each(&xef->vm.xa, idx, vm) xe_vm_close_and_put(vm); mutex_unlock(&xef->vm.lock); - xa_destroy(&xef->vm.xa); - mutex_destroy(&xef->vm.lock); - spin_lock(&xe->clients.lock); - xe->clients.count--; - spin_unlock(&xe->clients.lock); - - xe_drm_client_put(xef->client); - kfree(xef); + xe_file_destroy(xef); } static const struct drm_ioctl_desc xe_ioctls[] = { From d28bb0120f360e772458a7cf295d6d0ae3dc18a4 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 18 Jul 2024 14:05:46 -0700 Subject: [PATCH 0911/1052] drm/xe: Add ref counting for xe_file Add ref counting for xe_file. 
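As a rough analogue (a single-threaded user-space sketch; the kernel's kref uses atomic counters), the get/put scheme introduced here means any holder of a pointer can pin the object past file close, and the last put runs the destructor.

```
/* Illustrative only: minimal get/put reference counting. */
#include <stdio.h>
#include <stdlib.h>

struct xfile {
	int refcount;		/* starts at 1, like kref_init() */
	/* ... per-file state ... */
};

static struct xfile *xfile_create(void)
{
	struct xfile *f = calloc(1, sizeof(*f));

	if (f)
		f->refcount = 1;
	return f;
}

static struct xfile *xfile_get(struct xfile *f)
{
	f->refcount++;
	return f;
}

static void xfile_put(struct xfile *f)
{
	if (--f->refcount == 0) {	/* last reference destroys the object */
		printf("destroying file object\n");
		free(f);
	}
}

int main(void)
{
	struct xfile *f = xfile_create();
	struct xfile *vm_ref = xfile_get(f);	/* e.g. a VM keeps its own ref */

	xfile_put(f);		/* "file close": object stays alive */
	xfile_put(vm_ref);	/* last user drops it: destructor runs here */
	return 0;
}
```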
v2: - Add kernel doc for exported functions (Matt) - Instead of xe_file_destroy, export the get/put helpers (Lucas) v3: Fixup the kernel-doc format and description (Matt, Lucas) Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240718210548.3580382-3-umesh.nerlige.ramappa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit ce8c161cbad43f4056451e541f7ae3471d0cca12) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 33 ++++++++++++++++++++++++++-- drivers/gpu/drm/xe/xe_device.h | 3 +++ drivers/gpu/drm/xe/xe_device_types.h | 3 +++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 452dbc1b495b..4178bd05701e 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -87,11 +87,14 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file) spin_unlock(&xe->clients.lock); file->driver_priv = xef; + kref_init(&xef->refcount); + return 0; } -static void xe_file_destroy(struct xe_file *xef) +static void xe_file_destroy(struct kref *ref) { + struct xe_file *xef = container_of(ref, struct xe_file, refcount); struct xe_device *xe = xef->xe; xa_destroy(&xef->exec_queue.xa); @@ -107,6 +110,32 @@ static void xe_file_destroy(struct xe_file *xef) kfree(xef); } +/** + * xe_file_get() - Take a reference to the xe file object + * @xef: Pointer to the xe file + * + * Anyone with a pointer to xef must take a reference to the xe file + * object using this call. + * + * Return: xe file pointer + */ +struct xe_file *xe_file_get(struct xe_file *xef) +{ + kref_get(&xef->refcount); + return xef; +} + +/** + * xe_file_put() - Drop a reference to the xe file object + * @xef: Pointer to the xe file + * + * Used to drop reference to the xef object + */ +void xe_file_put(struct xe_file *xef) +{ + kref_put(&xef->refcount, xe_file_destroy); +} + static void xe_file_close(struct drm_device *dev, struct drm_file *file) { struct xe_file *xef = file->driver_priv; @@ -129,7 +158,7 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file) xe_vm_close_and_put(vm); mutex_unlock(&xef->vm.lock); - xe_file_destroy(xef); + xe_file_put(xef); } static const struct drm_ioctl_desc xe_ioctls[] = { diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index bb07f5669dbb..b3952718b3c1 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -170,4 +170,7 @@ static inline bool xe_device_wedged(struct xe_device *xe) void xe_device_declare_wedged(struct xe_device *xe); +struct xe_file *xe_file_get(struct xe_file *xef); +void xe_file_put(struct xe_file *xef); + #endif diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 3bca6d344744..cbc582bcc90a 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -566,6 +566,9 @@ struct xe_file { /** @client: drm client */ struct xe_drm_client *client; + + /** @refcount: ref count of this xe file */ + struct kref refcount; }; #endif From 6309f9b1fc4de2daa1293fe12a488d765e60507d Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 18 Jul 2024 14:05:47 -0700 Subject: [PATCH 0912/1052] drm/xe: Take a ref to xe file when user creates a VM Take a reference to xef when user creates the VM and put the reference when user destroys the VM. 
Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240718210548.3580382-4-umesh.nerlige.ramappa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit a2387e69493df3de706f14e4573ee123d23d5d34) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 5b166fa03684..6bfcbd4e778a 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1601,6 +1601,10 @@ static void vm_destroy_work_func(struct work_struct *w) XE_WARN_ON(vm->pt_root[id]); trace_xe_vm_free(vm); + + if (vm->xef) + xe_file_put(vm->xef); + kfree(vm); } @@ -1916,7 +1920,7 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data, } args->vm_id = id; - vm->xef = xef; + vm->xef = xe_file_get(xef); /* Record BO memory for VM pagetable created against client */ for_each_tile(tile, xe, id) From 817c70e2ba278e9d5360833b1137ef8855ac1728 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 18 Jul 2024 14:05:48 -0700 Subject: [PATCH 0913/1052] drm/xe: Fix use after free when client stats are captured xe_file_close triggers an asynchronous queue cleanup and then frees up the xef object. Since queue cleanup flushes all pending jobs and the KMD stores client usage stats into the xef object after jobs are flushed, we see a use-after-free for the xef object. Resolve this by taking a reference to xef from xe_exec_queue. While at it, revert an earlier change that contained a partial work around for this issue. v2: - Take a ref to xef even for the VM bind queue (Matt) - Squash patches relevant to that fix and work around (Lucas) v3: Fix typo (Lucas) Fixes: ce62827bc294 ("drm/xe: Do not access xe file when updating exec queue run_ticks") Fixes: 6109f24f87d7 ("drm/xe: Add helper to accumulate exec queue runtime") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/issues/1908 Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240718210548.3580382-5-umesh.nerlige.ramappa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 2149ded63079449b8dddf9da38392632f155e6b5) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_drm_client.c | 5 +---- drivers/gpu/drm/xe/xe_exec_queue.c | 10 +++++++++- drivers/gpu/drm/xe/xe_exec_queue_types.h | 7 +++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c index 6a26923fa10e..7ddd59908334 100644 --- a/drivers/gpu/drm/xe/xe_drm_client.c +++ b/drivers/gpu/drm/xe/xe_drm_client.c @@ -251,11 +251,8 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file) /* Accumulate all the exec queues from this client */ mutex_lock(&xef->exec_queue.lock); - xa_for_each(&xef->exec_queue.xa, i, q) { + xa_for_each(&xef->exec_queue.xa, i, q) xe_exec_queue_update_run_ticks(q); - xef->run_ticks[q->class] += q->run_ticks - q->old_run_ticks; - q->old_run_ticks = q->run_ticks; - } mutex_unlock(&xef->exec_queue.lock); /* Get the total GPU cycles */ diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 0ba37835849b..a39384bb9553 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -37,6 +37,10 @@ static void __xe_exec_queue_free(struct xe_exec_queue *q) { if (q->vm) xe_vm_put(q->vm); + + 
if (q->xef) + xe_file_put(q->xef); + kfree(q); } @@ -649,6 +653,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data, goto kill_exec_queue; args->exec_queue_id = id; + q->xef = xe_file_get(xef); return 0; @@ -762,6 +767,7 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) */ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) { + struct xe_file *xef; struct xe_lrc *lrc; u32 old_ts, new_ts; @@ -773,6 +779,8 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) if (!q->vm || !q->vm->xef) return; + xef = q->vm->xef; + /* * Only sample the first LRC. For parallel submission, all of them are * scheduled together and we compensate that below by multiplying by @@ -783,7 +791,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) */ lrc = q->lrc[0]; new_ts = xe_lrc_update_timestamp(lrc, &old_ts); - q->run_ticks += (new_ts - old_ts) * q->width; + xef->run_ticks[q->class] += (new_ts - old_ts) * q->width; } void xe_exec_queue_kill(struct xe_exec_queue *q) diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 201588ec33c3..a35ce24c9798 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -38,6 +38,9 @@ enum xe_exec_queue_priority { * a kernel object. */ struct xe_exec_queue { + /** @xef: Back pointer to xe file if this is user created exec queue */ + struct xe_file *xef; + /** @gt: graphics tile this exec queue can submit to */ struct xe_gt *gt; /** @@ -139,10 +142,6 @@ struct xe_exec_queue { * Protected by @vm's resv. Unused if @vm == NULL. */ u64 tlb_flush_seqno; - /** @old_run_ticks: prior hw engine class run time in ticks for this exec queue */ - u64 old_run_ticks; - /** @run_ticks: hw engine class run time in ticks for this exec queue */ - u64 run_ticks; /** @lrc: logical ring context for this exec queue */ struct xe_lrc *lrc[]; }; From 64da63cd3f7d771bf8f240e72203da1f72aa3728 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 18 Jul 2024 22:31:55 +0200 Subject: [PATCH 0914/1052] drm/xe/vf: Fix register value lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should use the number of actual entries stored in the runtime register buffer, not the maximum number of entries that this buffer can hold, otherwise bsearch() may fail and we may miss the data and wrongly report unexpected access to some registers. 
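A user-space sketch of the failure mode (not the driver code): bsearch() assumes the whole range it is given is sorted, so searching over the buffer capacity instead of the number of populated entries lets the unused tail break the ordering and the lookup can come back empty.

```
/* Illustrative only. */
#include <stdio.h>
#include <stdlib.h>

struct runtime_reg {
	unsigned int offset;
	unsigned int value;
};

static int reg_cmp(const void *a, const void *b)
{
	const struct runtime_reg *ra = a, *rb = b;

	return ra->offset < rb->offset ? -1 : ra->offset > rb->offset;
}

int main(void)
{
	/* Capacity of 8 entries, but only the first 3 are valid and sorted. */
	struct runtime_reg regs[8] = {
		{ .offset = 0x1000, .value = 1 },
		{ .offset = 0x2000, .value = 2 },
		{ .offset = 0x3000, .value = 3 },
	};
	struct runtime_reg key = { .offset = 0x3000 };
	void *hit;

	/* Search over the number of populated entries: reliable. */
	hit = bsearch(&key, regs, 3, sizeof(regs[0]), reg_cmp);
	printf("nmemb = num entries: %s\n", hit ? "found" : "missed");

	/* Search over the capacity: the zeroed tail is not sorted data,
	 * so the result is unreliable and typically misses.
	 */
	hit = bsearch(&key, regs, 8, sizeof(regs[0]), reg_cmp);
	printf("nmemb = capacity   : %s\n", hit ? "found (by luck)" : "missed");
	return 0;
}
```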
Fixes: 4edadc41a3a4 ("drm/xe/vf: Use register values obtained from the PF") Signed-off-by: Michal Wajdeczko Cc: Piotr Piórkowski Cc: Matt Roper Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240718203155.486-1-michal.wajdeczko@intel.com (cherry picked from commit ad16682db18f4414e53bba1ce0db75b08bdc4dff) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c index 41e46a00c01e..8892d6c2291e 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c @@ -850,7 +850,7 @@ static struct vf_runtime_reg *vf_lookup_reg(struct xe_gt *gt, u32 addr) xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); - return bsearch(&key, runtime->regs, runtime->regs_size, sizeof(key), + return bsearch(&key, runtime->regs, runtime->num_regs, sizeof(key), vf_runtime_reg_cmp); } From 55ea73aacfb9a92def840a7110a468c5a76caeb5 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 19 Jul 2024 10:29:05 -0700 Subject: [PATCH 0915/1052] drm/xe: Build PM into GuC CT layer Take PM ref when any G2H are outstanding, drop when none are outstanding. To safely ensure we have PM ref when in the GuC CT layer, a PM ref needs to be held when scheduler messages are pending too. v2: - Add outer PM protections to xe_file_close (CI) v3: - Only take PM ref 0->1 and drop on 1->0 (Matthew Auld) v4: - Add assert to G2H increment function v5: - Rebase v6: - Declare xe as local variable in xe_file_close (CI) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Matthew Auld Cc: Rodrigo Vivi Cc: Nirmoy Das Signed-off-by: Matthew Brost Reviewed-by: Matthew Auld Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240719172905.1527927-5-matthew.brost@intel.com (cherry picked from commit d930c19fdff3109e97b610fa10943b7602efcabd) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 5 +++++ drivers/gpu/drm/xe/xe_guc_ct.c | 10 +++++++++- drivers/gpu/drm/xe/xe_guc_submit.c | 4 ++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 4178bd05701e..f2f1d8ddb221 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -138,11 +138,14 @@ void xe_file_put(struct xe_file *xef) static void xe_file_close(struct drm_device *dev, struct drm_file *file) { + struct xe_device *xe = to_xe_device(dev); struct xe_file *xef = file->driver_priv; struct xe_vm *vm; struct xe_exec_queue *q; unsigned long idx; + xe_pm_runtime_get(xe); + /* * No need for exec_queue.lock here as there is no contention for it * when FD is closing as IOCTLs presumably can't be modifying the @@ -159,6 +162,8 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file) mutex_unlock(&xef->vm.lock); xe_file_put(xef); + + xe_pm_runtime_put(xe); } static const struct drm_ioctl_desc xe_ioctls[] = { diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 7d2e937da1d8..64afc90ad2c5 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -327,6 +327,8 @@ static void xe_guc_ct_set_state(struct xe_guc_ct *ct, xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding == 0 || state == XE_GUC_CT_STATE_STOPPED); + if (ct->g2h_outstanding) + xe_pm_runtime_put(ct_to_xe(ct)); ct->g2h_outstanding = 0; ct->state = state; @@ -495,10 +497,15 @@ static void h2g_reserve_space(struct 
xe_guc_ct *ct, u32 cmd_len) static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h) { xe_gt_assert(ct_to_gt(ct), g2h_len <= ct->ctbs.g2h.info.space); + xe_gt_assert(ct_to_gt(ct), (!g2h_len && !num_g2h) || + (g2h_len && num_g2h)); if (g2h_len) { lockdep_assert_held(&ct->fast_lock); + if (!ct->g2h_outstanding) + xe_pm_runtime_get_noresume(ct_to_xe(ct)); + ct->ctbs.g2h.info.space -= g2h_len; ct->g2h_outstanding += num_g2h; } @@ -511,7 +518,8 @@ static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space); ct->ctbs.g2h.info.space += g2h_len; - --ct->g2h_outstanding; + if (!--ct->g2h_outstanding) + xe_pm_runtime_put(ct_to_xe(ct)); } static void g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 8d7e7f4bbff7..6398629e6b4e 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1393,6 +1393,8 @@ static void guc_exec_queue_process_msg(struct xe_sched_msg *msg) default: XE_WARN_ON("Unknown message type"); } + + xe_pm_runtime_put(guc_to_xe(exec_queue_to_guc(msg->private_data))); } static const struct drm_sched_backend_ops drm_sched_ops = { @@ -1482,6 +1484,8 @@ static void guc_exec_queue_kill(struct xe_exec_queue *q) static void guc_exec_queue_add_msg(struct xe_exec_queue *q, struct xe_sched_msg *msg, u32 opcode) { + xe_pm_runtime_get_noresume(guc_to_xe(exec_queue_to_guc(q))); + INIT_LIST_HEAD(&msg->link); msg->opcode = opcode; msg->private_data = q; From 4f7652dcd339aca6678084d42fda999ecb19b624 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Tue, 6 Aug 2024 20:05:16 +0200 Subject: [PATCH 0916/1052] drm/xe/pf: Fix VF config validation on multi-GT platforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When validating VF config on the media GT, we may wrongly report that VF is already partially configured on it, as we consider GGTT and LMEM provisioning done on the primary GT (since both GGTT and LMEM are tile-level resources, not a GT-level). This will cause skipping a VF auto-provisioning on the media-GT and in result will block a VF from successfully initialize that GT. Fix that by considering GGTT and LMEM configurations only when checking if a VF provisioning is complete, and omit GGTT and LMEM when reporting empty/partial provisioning. 
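To make the intended behaviour explicit, here is a compact stand-alone sketch (LMEM omitted for brevity, not the driver code) of how a tile-level resource such as GGTT is meant to count towards "fully provisioned" on every GT, but towards "partially provisioned" only on the primary GT that actually owns it.

```
/* Illustrative only: mirrors the valid_any/valid_all logic of the fix. */
#include <stdbool.h>
#include <stdio.h>

struct vf_cfg {
	bool ggtt;	/* tile-level, provisioned through the primary GT */
	bool ctxs;	/* GT-level */
	bool dbs;	/* GT-level, optional */
};

static void classify(const struct vf_cfg *c, bool is_primary)
{
	/* GT-level resources always count. */
	bool valid_any = c->ctxs || c->dbs;
	bool valid_all = c->ctxs;

	/* GGTT is still required everywhere for "complete", but it only
	 * signals "partial" on the primary GT.
	 */
	valid_all = valid_all && c->ggtt;
	valid_any = valid_any || (c->ggtt && is_primary);

	printf("%s GT: %s\n", is_primary ? "primary" : "media  ",
	       valid_all ? "complete" : valid_any ? "partial" : "empty");
}

int main(void)
{
	/* GGTT already provisioned on the tile, nothing GT-specific yet. */
	struct vf_cfg cfg = { .ggtt = true, .ctxs = false, .dbs = false };

	classify(&cfg, true);	/* primary: partial */
	classify(&cfg, false);	/* media: empty, so auto-provisioning may proceed */
	return 0;
}
```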
Fixes: 234670cea9a2 ("drm/xe/pf: Skip fair VFs provisioning if already provisioned") Signed-off-by: Michal Wajdeczko Cc: Piotr Piórkowski Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240806180516.618-1-michal.wajdeczko@intel.com (cherry picked from commit 5bdacb0907c1f531995b6ba47b832ac3a0182ae9) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 4699b7836001..b6f0a7299c03 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -1927,6 +1927,7 @@ static int pf_validate_vf_config(struct xe_gt *gt, unsigned int vfid) { struct xe_gt *primary_gt = gt_to_tile(gt)->primary_gt; struct xe_device *xe = gt_to_xe(gt); + bool is_primary = !xe_gt_is_media_type(gt); bool valid_ggtt, valid_ctxs, valid_dbs; bool valid_any, valid_all; @@ -1935,13 +1936,17 @@ static int pf_validate_vf_config(struct xe_gt *gt, unsigned int vfid) valid_dbs = pf_get_vf_config_dbs(gt, vfid); /* note that GuC doorbells are optional */ - valid_any = valid_ggtt || valid_ctxs || valid_dbs; - valid_all = valid_ggtt && valid_ctxs; + valid_any = valid_ctxs || valid_dbs; + valid_all = valid_ctxs; + + /* and GGTT/LMEM is configured on primary GT only */ + valid_all = valid_all && valid_ggtt; + valid_any = valid_any || (valid_ggtt && is_primary); if (IS_DGFX(xe)) { bool valid_lmem = pf_get_vf_config_ggtt(primary_gt, vfid); - valid_any = valid_any || valid_lmem; + valid_any = valid_any || (valid_lmem && is_primary); valid_all = valid_all && valid_lmem; } From 90be4cc6f7674a1478c4c750beeee3edd14aee38 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 19 Jul 2024 10:29:02 -0700 Subject: [PATCH 0917/1052] drm/xe: Add xe_gt_tlb_invalidation_fence_init helper Other layers should not be touching struct xe_gt_tlb_invalidation_fence directly, add helper for initialization. 
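On the caller side the change amounts to the pattern sketched below. This is a hand-written illustration, not code lifted from the driver: the allocation is shown with kzalloc() for brevity and error handling around the later send/wait steps is elided.

	struct invalidation_fence *ifence;

	ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
	if (!ifence)
		return -ENOMEM;

	/* one call now covers dma_fence_init() under the GT's TLB
	 * invalidation lock, the list-head init and the reference
	 * previously taken open-coded on behalf of the caller */
	xe_gt_tlb_invalidation_fence_init(gt, &ifence->base);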
v2: - Add dma_fence_get and list init to xe_gt_tlb_invalidation_fence_init Signed-off-by: Matthew Brost Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240719172905.1527927-2-matthew.brost@intel.com (cherry picked from commit a522b285c6b4b611406d59612a8d7241714d2e31) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 36 +++++++++++++++++++++ drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 3 ++ drivers/gpu/drm/xe/xe_pt.c | 26 +-------------- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index d9359976ab8b..92a18a0e4acd 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -508,3 +508,39 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len) return 0; } + +static const char * +invalidation_fence_get_driver_name(struct dma_fence *dma_fence) +{ + return "xe"; +} + +static const char * +invalidation_fence_get_timeline_name(struct dma_fence *dma_fence) +{ + return "invalidation_fence"; +} + +static const struct dma_fence_ops invalidation_fence_ops = { + .get_driver_name = invalidation_fence_get_driver_name, + .get_timeline_name = invalidation_fence_get_timeline_name, +}; + +/** + * xe_gt_tlb_invalidation_fence_init - Initialize TLB invalidation fence + * @gt: GT + * @fence: TLB invalidation fence to initialize + * + * Initialize TLB invalidation fence for use + */ +void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, + struct xe_gt_tlb_invalidation_fence *fence) +{ + spin_lock_irq(>->tlb_invalidation.lock); + dma_fence_init(&fence->base, &invalidation_fence_ops, + >->tlb_invalidation.lock, + dma_fence_context_alloc(1), 1); + spin_unlock_irq(>->tlb_invalidation.lock); + INIT_LIST_HEAD(&fence->link); + dma_fence_get(&fence->base); +} diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h index bf3bebd9f985..948f4a2f5214 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h @@ -26,4 +26,7 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt, int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno); int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len); +void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, + struct xe_gt_tlb_invalidation_fence *fence); + #endif /* _XE_GT_TLB_INVALIDATION_ */ diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index ade9e7a3a0ad..4c17a1ec8f8b 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1115,23 +1115,6 @@ struct invalidation_fence { u32 asid; }; -static const char * -invalidation_fence_get_driver_name(struct dma_fence *dma_fence) -{ - return "xe"; -} - -static const char * -invalidation_fence_get_timeline_name(struct dma_fence *dma_fence) -{ - return "invalidation_fence"; -} - -static const struct dma_fence_ops invalidation_fence_ops = { - .get_driver_name = invalidation_fence_get_driver_name, - .get_timeline_name = invalidation_fence_get_timeline_name, -}; - static void invalidation_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb) { @@ -1170,15 +1153,8 @@ static int invalidation_fence_init(struct xe_gt *gt, trace_xe_gt_tlb_invalidation_fence_create(gt_to_xe(gt), &ifence->base); - spin_lock_irq(>->tlb_invalidation.lock); - dma_fence_init(&ifence->base.base, &invalidation_fence_ops, - >->tlb_invalidation.lock, - 
dma_fence_context_alloc(1), 1); - spin_unlock_irq(>->tlb_invalidation.lock); + xe_gt_tlb_invalidation_fence_init(gt, &ifence->base); - INIT_LIST_HEAD(&ifence->base.link); - - dma_fence_get(&ifence->base.base); /* Ref for caller */ ifence->fence = fence; ifence->gt = gt; ifence->start = start; From 58bfe6674467f4c037e89111e6007f25b34d8bb3 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 19 Jul 2024 10:29:03 -0700 Subject: [PATCH 0918/1052] drm/xe: Drop xe_gt_tlb_invalidation_wait Having two methods to wait on GT TLB invalidations is not ideal. Remove xe_gt_tlb_invalidation_wait and only use GT TLB invalidation fences. In addition to two methods being less than ideal, once GT TLB invalidations are coalesced the seqno cannot be assigned during xe_gt_tlb_invalidation_ggtt/range. Thus xe_gt_tlb_invalidation_wait would not have a seqno to wait one. A fence however can be armed and later signaled. v3: - Add explaination about coalescing to commit message v4: - Don't put dma fence if defined on stack (CI) v5: - Initialize ret to zero (CI) v6: - Use invalidation_fence_signal helper in tlb timeout (Matthew Auld) Signed-off-by: Matthew Brost Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240719172905.1527927-3-matthew.brost@intel.com (cherry picked from commit 61ac035361ae555ee5a17a7667fe96afdde3d59a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 148 ++++++++------------ drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 10 +- drivers/gpu/drm/xe/xe_pt.c | 2 +- drivers/gpu/drm/xe/xe_vm.c | 30 ++-- 4 files changed, 80 insertions(+), 110 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index 92a18a0e4acd..c3419d4412ce 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -17,6 +17,8 @@ #include "xe_trace.h" #include "regs/xe_guc_regs.h" +#define FENCE_STACK_BIT DMA_FENCE_FLAG_USER_BITS + /* * TLB inval depends on pending commands in the CT queue and then the real * invalidation time. 
Double up the time to process full CT queue @@ -33,6 +35,23 @@ static long tlb_timeout_jiffies(struct xe_gt *gt) return hw_tlb_timeout + 2 * delay; } +static void +__invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence) +{ + bool stack = test_bit(FENCE_STACK_BIT, &fence->base.flags); + + trace_xe_gt_tlb_invalidation_fence_signal(xe, fence); + dma_fence_signal(&fence->base); + if (!stack) + dma_fence_put(&fence->base); +} + +static void +invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence) +{ + list_del(&fence->link); + __invalidation_fence_signal(xe, fence); +} static void xe_gt_tlb_fence_timeout(struct work_struct *work) { @@ -54,10 +73,8 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work) xe_gt_err(gt, "TLB invalidation fence timeout, seqno=%d recv=%d", fence->seqno, gt->tlb_invalidation.seqno_recv); - list_del(&fence->link); fence->base.error = -ETIME; - dma_fence_signal(&fence->base); - dma_fence_put(&fence->base); + invalidation_fence_signal(xe, fence); } if (!list_empty(>->tlb_invalidation.pending_fences)) queue_delayed_work(system_wq, @@ -87,21 +104,6 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt) return 0; } -static void -__invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence) -{ - trace_xe_gt_tlb_invalidation_fence_signal(xe, fence); - dma_fence_signal(&fence->base); - dma_fence_put(&fence->base); -} - -static void -invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence) -{ - list_del(&fence->link); - __invalidation_fence_signal(xe, fence); -} - /** * xe_gt_tlb_invalidation_reset - Initialize GT TLB invalidation reset * @gt: graphics tile @@ -111,7 +113,6 @@ invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fe void xe_gt_tlb_invalidation_reset(struct xe_gt *gt) { struct xe_gt_tlb_invalidation_fence *fence, *next; - struct xe_guc *guc = >->uc.guc; int pending_seqno; /* @@ -134,7 +135,6 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt) else pending_seqno = gt->tlb_invalidation.seqno - 1; WRITE_ONCE(gt->tlb_invalidation.seqno_recv, pending_seqno); - wake_up_all(&guc->ct.wq); list_for_each_entry_safe(fence, next, >->tlb_invalidation.pending_fences, link) @@ -165,6 +165,8 @@ static int send_tlb_invalidation(struct xe_guc *guc, int seqno; int ret; + xe_gt_assert(gt, fence); + /* * XXX: The seqno algorithm relies on TLB invalidation being processed * in order which they currently are, if that changes the algorithm will @@ -173,10 +175,8 @@ static int send_tlb_invalidation(struct xe_guc *guc, mutex_lock(&guc->ct.lock); seqno = gt->tlb_invalidation.seqno; - if (fence) { - fence->seqno = seqno; - trace_xe_gt_tlb_invalidation_fence_send(xe, fence); - } + fence->seqno = seqno; + trace_xe_gt_tlb_invalidation_fence_send(xe, fence); action[1] = seqno; ret = xe_guc_ct_send_locked(&guc->ct, action, len, G2H_LEN_DW_TLB_INVALIDATE, 1); @@ -209,7 +209,6 @@ static int send_tlb_invalidation(struct xe_guc *guc, TLB_INVALIDATION_SEQNO_MAX; if (!gt->tlb_invalidation.seqno) gt->tlb_invalidation.seqno = 1; - ret = seqno; } mutex_unlock(&guc->ct.lock); @@ -223,14 +222,16 @@ static int send_tlb_invalidation(struct xe_guc *guc, /** * xe_gt_tlb_invalidation_guc - Issue a TLB invalidation on this GT for the GuC * @gt: graphics tile + * @fence: invalidation fence which will be signal on TLB invalidation + * completion * * Issue a TLB invalidation for the GuC. 
Completion of TLB is asynchronous and - * caller can use seqno + xe_gt_tlb_invalidation_wait to wait for completion. + * caller can use the invalidation fence to wait for completion. * - * Return: Seqno which can be passed to xe_gt_tlb_invalidation_wait on success, - * negative error code on error. + * Return: 0 on success, negative error code on error */ -static int xe_gt_tlb_invalidation_guc(struct xe_gt *gt) +static int xe_gt_tlb_invalidation_guc(struct xe_gt *gt, + struct xe_gt_tlb_invalidation_fence *fence) { u32 action[] = { XE_GUC_ACTION_TLB_INVALIDATION, @@ -238,7 +239,7 @@ static int xe_gt_tlb_invalidation_guc(struct xe_gt *gt) MAKE_INVAL_OP(XE_GUC_TLB_INVAL_GUC), }; - return send_tlb_invalidation(>->uc.guc, NULL, action, + return send_tlb_invalidation(>->uc.guc, fence, action, ARRAY_SIZE(action)); } @@ -257,13 +258,15 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt) if (xe_guc_ct_enabled(>->uc.guc.ct) && gt->uc.guc.submission_state.enabled) { - int seqno; + struct xe_gt_tlb_invalidation_fence fence; + int ret; - seqno = xe_gt_tlb_invalidation_guc(gt); - if (seqno <= 0) - return seqno; + xe_gt_tlb_invalidation_fence_init(gt, &fence, true); + ret = xe_gt_tlb_invalidation_guc(gt, &fence); + if (ret < 0) + return ret; - xe_gt_tlb_invalidation_wait(gt, seqno); + xe_gt_tlb_invalidation_fence_wait(&fence); } else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) { if (IS_SRIOV_VF(xe)) return 0; @@ -290,18 +293,16 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt) * * @gt: graphics tile * @fence: invalidation fence which will be signal on TLB invalidation - * completion, can be NULL + * completion * @start: start address * @end: end address * @asid: address space id * * Issue a range based TLB invalidation if supported, if not fallback to a full - * TLB invalidation. Completion of TLB is asynchronous and caller can either use - * the invalidation fence or seqno + xe_gt_tlb_invalidation_wait to wait for - * completion. + * TLB invalidation. Completion of TLB is asynchronous and caller can use + * the invalidation fence to wait for completion. * - * Return: Seqno which can be passed to xe_gt_tlb_invalidation_wait on success, - * negative error code on error. + * Return: Negative error code on error, 0 on success */ int xe_gt_tlb_invalidation_range(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, @@ -312,11 +313,11 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt, u32 action[MAX_TLB_INVALIDATION_LEN]; int len = 0; + xe_gt_assert(gt, fence); + /* Execlists not supported */ if (gt_to_xe(gt)->info.force_execlist) { - if (fence) - __invalidation_fence_signal(xe, fence); - + __invalidation_fence_signal(xe, fence); return 0; } @@ -382,12 +383,10 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt, * @vma: VMA to invalidate * * Issue a range based TLB invalidation if supported, if not fallback to a full - * TLB invalidation. Completion of TLB is asynchronous and caller can either use - * the invalidation fence or seqno + xe_gt_tlb_invalidation_wait to wait for - * completion. + * TLB invalidation. Completion of TLB is asynchronous and caller can use + * the invalidation fence to wait for completion. * - * Return: Seqno which can be passed to xe_gt_tlb_invalidation_wait on success, - * negative error code on error. 
+ * Return: Negative error code on error, 0 on success */ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, @@ -400,43 +399,6 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt, xe_vma_vm(vma)->usm.asid); } -/** - * xe_gt_tlb_invalidation_wait - Wait for TLB to complete - * @gt: graphics tile - * @seqno: seqno to wait which was returned from xe_gt_tlb_invalidation - * - * Wait for tlb_timeout_jiffies() for a TLB invalidation to complete. - * - * Return: 0 on success, -ETIME on TLB invalidation timeout - */ -int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno) -{ - struct xe_guc *guc = >->uc.guc; - int ret; - - /* Execlists not supported */ - if (gt_to_xe(gt)->info.force_execlist) - return 0; - - /* - * XXX: See above, this algorithm only works if seqno are always in - * order - */ - ret = wait_event_timeout(guc->ct.wq, - tlb_invalidation_seqno_past(gt, seqno), - tlb_timeout_jiffies(gt)); - if (!ret) { - struct drm_printer p = xe_gt_err_printer(gt); - - xe_gt_err(gt, "TLB invalidation time'd out, seqno=%d, recv=%d\n", - seqno, gt->tlb_invalidation.seqno_recv); - xe_guc_ct_print(&guc->ct, &p, true); - return -ETIME; - } - - return 0; -} - /** * xe_guc_tlb_invalidation_done_handler - TLB invalidation done handler * @guc: guc @@ -480,12 +442,7 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len) return 0; } - /* - * wake_up_all() and wait_event_timeout() already have the correct - * barriers. - */ WRITE_ONCE(gt->tlb_invalidation.seqno_recv, msg[0]); - wake_up_all(&guc->ct.wq); list_for_each_entry_safe(fence, next, >->tlb_invalidation.pending_fences, link) { @@ -530,11 +487,13 @@ static const struct dma_fence_ops invalidation_fence_ops = { * xe_gt_tlb_invalidation_fence_init - Initialize TLB invalidation fence * @gt: GT * @fence: TLB invalidation fence to initialize + * @stack: fence is stack variable * * Initialize TLB invalidation fence for use */ void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, - struct xe_gt_tlb_invalidation_fence *fence) + struct xe_gt_tlb_invalidation_fence *fence, + bool stack) { spin_lock_irq(>->tlb_invalidation.lock); dma_fence_init(&fence->base, &invalidation_fence_ops, @@ -542,5 +501,8 @@ void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, dma_fence_context_alloc(1), 1); spin_unlock_irq(>->tlb_invalidation.lock); INIT_LIST_HEAD(&fence->link); - dma_fence_get(&fence->base); + if (stack) + set_bit(FENCE_STACK_BIT, &fence->base.flags); + else + dma_fence_get(&fence->base); } diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h index 948f4a2f5214..f430d5797af7 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h @@ -23,10 +23,16 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt, int xe_gt_tlb_invalidation_range(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, u64 start, u64 end, u32 asid); -int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno); int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len); void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, - struct xe_gt_tlb_invalidation_fence *fence); + struct xe_gt_tlb_invalidation_fence *fence, + bool stack); + +static inline void +xe_gt_tlb_invalidation_fence_wait(struct xe_gt_tlb_invalidation_fence *fence) +{ + dma_fence_wait(&fence->base, false); +} #endif /* _XE_GT_TLB_INVALIDATION_ */ diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 
4c17a1ec8f8b..31a751a5de3f 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1153,7 +1153,7 @@ static int invalidation_fence_init(struct xe_gt *gt, trace_xe_gt_tlb_invalidation_fence_create(gt_to_xe(gt), &ifence->base); - xe_gt_tlb_invalidation_fence_init(gt, &ifence->base); + xe_gt_tlb_invalidation_fence_init(gt, &ifence->base, false); ifence->fence = fence; ifence->gt = gt; diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 6bfcbd4e778a..931935ec33db 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3341,10 +3341,10 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) { struct xe_device *xe = xe_vma_vm(vma)->xe; struct xe_tile *tile; + struct xe_gt_tlb_invalidation_fence fence[XE_MAX_TILES_PER_DEVICE]; u32 tile_needs_invalidate = 0; - int seqno[XE_MAX_TILES_PER_DEVICE]; u8 id; - int ret; + int ret = 0; xe_assert(xe, !xe_vma_is_null(vma)); trace_xe_vma_invalidate(vma); @@ -3369,29 +3369,31 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) for_each_tile(tile, xe, id) { if (xe_pt_zap_ptes(tile, vma)) { - tile_needs_invalidate |= BIT(id); xe_device_wmb(xe); + xe_gt_tlb_invalidation_fence_init(tile->primary_gt, + &fence[id], true); + /* * FIXME: We potentially need to invalidate multiple * GTs within the tile */ - seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma); - if (seqno[id] < 0) - return seqno[id]; + ret = xe_gt_tlb_invalidation_vma(tile->primary_gt, + &fence[id], vma); + if (ret < 0) + goto wait; + + tile_needs_invalidate |= BIT(id); } } - for_each_tile(tile, xe, id) { - if (tile_needs_invalidate & BIT(id)) { - ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]); - if (ret < 0) - return ret; - } - } +wait: + for_each_tile(tile, xe, id) + if (tile_needs_invalidate & BIT(id)) + xe_gt_tlb_invalidation_fence_wait(&fence[id]); vma->tile_invalidated = vma->tile_mask; - return 0; + return ret; } struct xe_vm_snapshot { From f002702290fccbd473f5bb94e52f25c96917fff2 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 19 Jul 2024 10:29:04 -0700 Subject: [PATCH 0919/1052] drm/xe: Hold a PM ref when GT TLB invalidations are inflight Avoid GT TLB invalidation timeouts by holding a PM ref when invalidations are inflight. 
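The resulting fence lifetime is easiest to read from the caller's side. The following is a rough sketch modeled on the GGTT-invalidation path touched below, with details unrelated to the PM reference dropped:

	struct xe_gt_tlb_invalidation_fence fence;
	int ret;

	xe_gt_tlb_invalidation_fence_init(gt, &fence, true);	/* takes a PM ref */

	ret = xe_gt_tlb_invalidation_guc(gt, &fence);
	if (ret < 0) {
		/* never handed to the GuC: drop the PM ref explicitly */
		xe_gt_tlb_invalidation_fence_fini(&fence);
		return ret;
	}

	/* on success the signal path drops the ref in __invalidation_fence_signal() */
	xe_gt_tlb_invalidation_fence_wait(&fence);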
v2: - Drop PM ref before signaling fence (CI) v3: - Move invalidation_fence_signal helper in tlb timeout to previous patch (Matthew Auld) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Rodrigo Vivi Cc: Nirmoy Das Signed-off-by: Matthew Brost Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240719172905.1527927-4-matthew.brost@intel.com (cherry picked from commit 0a382f9bc5dc4744a33970a5ed4df8f9c702ee9e) Requires: 46209ce5287b ("drm/xe: Add xe_gt_tlb_invalidation_fence_init helper") Requires: 0e414ab036e0 ("drm/xe: Drop xe_gt_tlb_invalidation_wait") Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 23 +++++++++++++++++-- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 1 + .../gpu/drm/xe/xe_gt_tlb_invalidation_types.h | 4 ++++ drivers/gpu/drm/xe/xe_vm.c | 4 +++- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index c3419d4412ce..481d83d07367 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -13,6 +13,7 @@ #include "xe_guc.h" #include "xe_guc_ct.h" #include "xe_mmio.h" +#include "xe_pm.h" #include "xe_sriov.h" #include "xe_trace.h" #include "regs/xe_guc_regs.h" @@ -41,6 +42,7 @@ __invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_ bool stack = test_bit(FENCE_STACK_BIT, &fence->base.flags); trace_xe_gt_tlb_invalidation_fence_signal(xe, fence); + xe_gt_tlb_invalidation_fence_fini(fence); dma_fence_signal(&fence->base); if (!stack) dma_fence_put(&fence->base); @@ -263,8 +265,10 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt) xe_gt_tlb_invalidation_fence_init(gt, &fence, true); ret = xe_gt_tlb_invalidation_guc(gt, &fence); - if (ret < 0) + if (ret < 0) { + xe_gt_tlb_invalidation_fence_fini(&fence); return ret; + } xe_gt_tlb_invalidation_fence_wait(&fence); } else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) { @@ -489,12 +493,15 @@ static const struct dma_fence_ops invalidation_fence_ops = { * @fence: TLB invalidation fence to initialize * @stack: fence is stack variable * - * Initialize TLB invalidation fence for use + * Initialize TLB invalidation fence for use. xe_gt_tlb_invalidation_fence_fini + * must be called if fence is not signaled. */ void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, bool stack) { + xe_pm_runtime_get_noresume(gt_to_xe(gt)); + spin_lock_irq(>->tlb_invalidation.lock); dma_fence_init(&fence->base, &invalidation_fence_ops, >->tlb_invalidation.lock, @@ -505,4 +512,16 @@ void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, set_bit(FENCE_STACK_BIT, &fence->base.flags); else dma_fence_get(&fence->base); + fence->gt = gt; +} + +/** + * xe_gt_tlb_invalidation_fence_fini - Finalize TLB invalidation fence + * @fence: TLB invalidation fence to finalize + * + * Drop PM ref which fence took durinig init. 
+ */ +void xe_gt_tlb_invalidation_fence_fini(struct xe_gt_tlb_invalidation_fence *fence) +{ + xe_pm_runtime_put(gt_to_xe(fence->gt)); } diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h index f430d5797af7..a84065fa324c 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h @@ -28,6 +28,7 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len); void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, bool stack); +void xe_gt_tlb_invalidation_fence_fini(struct xe_gt_tlb_invalidation_fence *fence); static inline void xe_gt_tlb_invalidation_fence_wait(struct xe_gt_tlb_invalidation_fence *fence) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h index 934c828efe31..de6e825e0851 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h @@ -8,6 +8,8 @@ #include +struct xe_gt; + /** * struct xe_gt_tlb_invalidation_fence - XE GT TLB invalidation fence * @@ -17,6 +19,8 @@ struct xe_gt_tlb_invalidation_fence { /** @base: dma fence base */ struct dma_fence base; + /** @gt: GT which fence belong to */ + struct xe_gt *gt; /** @link: link into list of pending tlb fences */ struct list_head link; /** @seqno: seqno of TLB invalidation to signal fence one */ diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 931935ec33db..c7561a56abaf 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3379,8 +3379,10 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) */ ret = xe_gt_tlb_invalidation_vma(tile->primary_gt, &fence[id], vma); - if (ret < 0) + if (ret < 0) { + xe_gt_tlb_invalidation_fence_fini(&fence[id]); goto wait; + } tile_needs_invalidate |= BIT(id); } From af8e119f52e9c13e556be9e03f27957554a84656 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 15 Aug 2024 17:11:17 +0300 Subject: [PATCH 0920/1052] xhci: Fix Panther point NULL pointer deref at full-speed re-enumeration re-enumerating full-speed devices after a failed address device command can trigger a NULL pointer dereference. Full-speed devices may need to reconfigure the endpoint 0 Max Packet Size value during enumeration. Usb core calls usb_ep0_reinit() in this case, which ends up calling xhci_configure_endpoint(). On Panther point xHC the xhci_configure_endpoint() function will additionally check and reserve bandwidth in software. Other hosts do this in hardware If xHC address device command fails then a new xhci_virt_device structure is allocated as part of re-enabling the slot, but the bandwidth table pointers are not set up properly here. This triggers the NULL pointer dereference the next time usb_ep0_reinit() is called and xhci_configure_endpoint() tries to check and reserve bandwidth [46710.713538] usb 3-1: new full-speed USB device number 5 using xhci_hcd [46710.713699] usb 3-1: Device not responding to setup address. [46710.917684] usb 3-1: Device not responding to setup address. 
[46711.125536] usb 3-1: device not accepting address 5, error -71 [46711.125594] BUG: kernel NULL pointer dereference, address: 0000000000000008 [46711.125600] #PF: supervisor read access in kernel mode [46711.125603] #PF: error_code(0x0000) - not-present page [46711.125606] PGD 0 P4D 0 [46711.125610] Oops: Oops: 0000 [#1] PREEMPT SMP PTI [46711.125615] CPU: 1 PID: 25760 Comm: kworker/1:2 Not tainted 6.10.3_2 #1 [46711.125620] Hardware name: Gigabyte Technology Co., Ltd. [46711.125623] Workqueue: usb_hub_wq hub_event [usbcore] [46711.125668] RIP: 0010:xhci_reserve_bandwidth (drivers/usb/host/xhci.c Fix this by making sure bandwidth table pointers are set up correctly after a failed address device command, and additionally by avoiding checking for bandwidth in cases like this where no actual endpoints are added or removed, i.e. only context for default control endpoint 0 is evaluated. Reported-by: Karel Balej Closes: https://lore.kernel.org/linux-usb/D3CKQQAETH47.1MUO22RTCH2O3@matfyz.cz/ Cc: stable@vger.kernel.org Fixes: 651aaf36a7d7 ("usb: xhci: Handle USB transaction error on address command") Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240815141117.2702314-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 0a8cf6c17f82..efdf4c228b8c 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -2837,7 +2837,7 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci, xhci->num_active_eps); return -ENOMEM; } - if ((xhci->quirks & XHCI_SW_BW_CHECKING) && + if ((xhci->quirks & XHCI_SW_BW_CHECKING) && !ctx_change && xhci_reserve_bandwidth(xhci, virt_dev, command->in_ctx)) { if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK)) xhci_free_host_resources(xhci, ctrl_ctx); @@ -4200,8 +4200,10 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev, mutex_unlock(&xhci->mutex); ret = xhci_disable_slot(xhci, udev->slot_id); xhci_free_virt_device(xhci, udev->slot_id); - if (!ret) - xhci_alloc_dev(hcd, udev); + if (!ret) { + if (xhci_alloc_dev(hcd, udev) == 1) + xhci_setup_addressable_virt_dev(xhci, udev); + } kfree(command->completion); kfree(command); return -EPROTO; From 9bb5e74b2bf88fbb024bb15ded3b011e02c673be Mon Sep 17 00:00:00 2001 From: Griffin Kroah-Hartman Date: Thu, 15 Aug 2024 11:49:20 +0200 Subject: [PATCH 0921/1052] Revert "misc: fastrpc: Restrict untrusted app to attach to privileged PD" This reverts commit bab2f5e8fd5d2f759db26b78d9db57412888f187. Joel reported that this commit breaks userspace and stops sensors in SDM845 from working. Also breaks other qcom SoC devices running postmarketOS. 
Cc: stable Cc: Ekansh Gupta Cc: Dmitry Baryshkov Reported-by: Joel Selvaraj Link: https://lore.kernel.org/r/9a9f5646-a554-4b65-8122-d212bb665c81@umsystem.edu Signed-off-by: Griffin Kroah-Hartman Acked-by: Srinivas Kandagatla Fixes: bab2f5e8fd5d ("misc: fastrpc: Restrict untrusted app to attach to privileged PD") Link: https://lore.kernel.org/r/20240815094920.8242-1-griffin@kroah.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 22 +++------------------- include/uapi/misc/fastrpc.h | 3 --- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 5204fda51da3..339d126414d4 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -2085,16 +2085,6 @@ static int fastrpc_req_mem_map(struct fastrpc_user *fl, char __user *argp) return err; } -static int is_attach_rejected(struct fastrpc_user *fl) -{ - /* Check if the device node is non-secure */ - if (!fl->is_secure_dev) { - dev_dbg(&fl->cctx->rpdev->dev, "untrusted app trying to attach to privileged DSP PD\n"); - return -EACCES; - } - return 0; -} - static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2107,19 +2097,13 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, err = fastrpc_invoke(fl, argp); break; case FASTRPC_IOCTL_INIT_ATTACH: - err = is_attach_rejected(fl); - if (!err) - err = fastrpc_init_attach(fl, ROOT_PD); + err = fastrpc_init_attach(fl, ROOT_PD); break; case FASTRPC_IOCTL_INIT_ATTACH_SNS: - err = is_attach_rejected(fl); - if (!err) - err = fastrpc_init_attach(fl, SENSORS_PD); + err = fastrpc_init_attach(fl, SENSORS_PD); break; case FASTRPC_IOCTL_INIT_CREATE_STATIC: - err = is_attach_rejected(fl); - if (!err) - err = fastrpc_init_create_static_process(fl, argp); + err = fastrpc_init_create_static_process(fl, argp); break; case FASTRPC_IOCTL_INIT_CREATE: err = fastrpc_init_create_process(fl, argp); diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h index 91583690bddc..f33d914d8f46 100644 --- a/include/uapi/misc/fastrpc.h +++ b/include/uapi/misc/fastrpc.h @@ -8,14 +8,11 @@ #define FASTRPC_IOCTL_ALLOC_DMA_BUFF _IOWR('R', 1, struct fastrpc_alloc_dma_buf) #define FASTRPC_IOCTL_FREE_DMA_BUFF _IOWR('R', 2, __u32) #define FASTRPC_IOCTL_INVOKE _IOWR('R', 3, struct fastrpc_invoke) -/* This ioctl is only supported with secure device nodes */ #define FASTRPC_IOCTL_INIT_ATTACH _IO('R', 4) #define FASTRPC_IOCTL_INIT_CREATE _IOWR('R', 5, struct fastrpc_init_create) #define FASTRPC_IOCTL_MMAP _IOWR('R', 6, struct fastrpc_req_mmap) #define FASTRPC_IOCTL_MUNMAP _IOWR('R', 7, struct fastrpc_req_munmap) -/* This ioctl is only supported with secure device nodes */ #define FASTRPC_IOCTL_INIT_ATTACH_SNS _IO('R', 8) -/* This ioctl is only supported with secure device nodes */ #define FASTRPC_IOCTL_INIT_CREATE_STATIC _IOWR('R', 9, struct fastrpc_init_create_static) #define FASTRPC_IOCTL_MEM_MAP _IOWR('R', 10, struct fastrpc_mem_map) #define FASTRPC_IOCTL_MEM_UNMAP _IOWR('R', 11, struct fastrpc_mem_unmap) From 92e9bac18124682c4b99ede9ee3bcdd68f121e92 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 15 Aug 2024 01:04:31 +0100 Subject: [PATCH 0922/1052] kunit/overflow: Fix UB in overflow_allocation_test The 'device_name' array doesn't exist out of the 'overflow_allocation_test' function scope. However, it is being used as a driver name when calling 'kunit_driver_create' from 'kunit_device_register'. It produces the kernel panic with KASAN enabled. 
Since this variable is used in one place only, remove it and pass the device name into kunit_device_register directly as an ascii string. Signed-off-by: Ivan Orlov Reviewed-by: David Gow Link: https://lore.kernel.org/r/20240815000431.401869-1-ivan.orlov0322@gmail.com Signed-off-by: Kees Cook --- lib/overflow_kunit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index f314a0c15a6d..2abc78367dd1 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -668,7 +668,6 @@ DEFINE_TEST_ALLOC(devm_kzalloc, devm_kfree, 1, 1, 0); static void overflow_allocation_test(struct kunit *test) { - const char device_name[] = "overflow-test"; struct device *dev; int count = 0; @@ -678,7 +677,7 @@ static void overflow_allocation_test(struct kunit *test) } while (0) /* Create dummy device for devm_kmalloc()-family tests. */ - dev = kunit_device_register(test, device_name); + dev = kunit_device_register(test, "overflow-test"); KUNIT_ASSERT_FALSE_MSG(test, IS_ERR(dev), "Cannot register test device\n"); From 020925ce92990c3bf59ab2cde386ac6d9ec734ff Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 7 Aug 2024 15:05:12 -0700 Subject: [PATCH 0923/1052] kallsyms: Do not cleanup .llvm. suffix before sorting symbols Cleaning up the symbols causes various issues afterwards. Let's sort the list based on original name. Signed-off-by: Song Liu Fixes: 8cc32a9bbf29 ("kallsyms: strip LTO-only suffixes from promoted global functions") Reviewed-by: Masami Hiramatsu (Google) Tested-by: Masami Hiramatsu (Google) Acked-by: Petr Mladek Reviewed-by: Sami Tolvanen Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240807220513.3100483-2-song@kernel.org Signed-off-by: Kees Cook --- scripts/kallsyms.c | 31 ++----------------------------- scripts/link-vmlinux.sh | 4 ---- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 0ed873491bf5..123dab0572f8 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -5,8 +5,7 @@ * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. * - * Usage: kallsyms [--all-symbols] [--absolute-percpu] - * [--lto-clang] in.map > out.S + * Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S * * Table compression uses all the unused char codes on the symbols and * maps these to the most used substrings (tokens). For instance, it might @@ -62,7 +61,6 @@ static struct sym_entry **table; static unsigned int table_size, table_cnt; static int all_symbols; static int absolute_percpu; -static int lto_clang; static int token_profit[0x10000]; @@ -73,8 +71,7 @@ static unsigned char best_table_len[256]; static void usage(void) { - fprintf(stderr, "Usage: kallsyms [--all-symbols] [--absolute-percpu] " - "[--lto-clang] in.map > out.S\n"); + fprintf(stderr, "Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S\n"); exit(1); } @@ -344,25 +341,6 @@ static bool symbol_absolute(const struct sym_entry *s) return s->percpu_absolute; } -static void cleanup_symbol_name(char *s) -{ - char *p; - - /* - * ASCII[.] = 2e - * ASCII[0-9] = 30,39 - * ASCII[A-Z] = 41,5a - * ASCII[_] = 5f - * ASCII[a-z] = 61,7a - * - * As above, replacing the first '.' in ".llvm." with '\0' does not - * affect the main sorting, but it helps us with subsorting. 
- */ - p = strstr(s, ".llvm."); - if (p) - *p = '\0'; -} - static int compare_names(const void *a, const void *b) { int ret; @@ -526,10 +504,6 @@ static void write_src(void) output_address(relative_base); printf("\n"); - if (lto_clang) - for (i = 0; i < table_cnt; i++) - cleanup_symbol_name((char *)table[i]->sym); - sort_symbols_by_name(); output_label("kallsyms_seqs_of_names"); for (i = 0; i < table_cnt; i++) @@ -807,7 +781,6 @@ int main(int argc, char **argv) static const struct option long_options[] = { {"all-symbols", no_argument, &all_symbols, 1}, {"absolute-percpu", no_argument, &absolute_percpu, 1}, - {"lto-clang", no_argument, <o_clang, 1}, {}, }; diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index f7b2503cdba9..22d0bc843986 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -156,10 +156,6 @@ kallsyms() kallsymopt="${kallsymopt} --absolute-percpu" fi - if is_enabled CONFIG_LTO_CLANG; then - kallsymopt="${kallsymopt} --lto-clang" - fi - info KSYMS "${2}.S" scripts/kallsyms ${kallsymopt} "${1}" > "${2}.S" From fb6a421fb6153d97cf3058f9bd550b377b76a490 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 7 Aug 2024 15:05:13 -0700 Subject: [PATCH 0924/1052] kallsyms: Match symbols exactly with CONFIG_LTO_CLANG With CONFIG_LTO_CLANG=y, the compiler may add .llvm. suffix to function names to avoid duplication. APIs like kallsyms_lookup_name() and kallsyms_on_each_match_symbol() tries to match these symbol names without the .llvm. suffix, e.g., match "c_stop" with symbol c_stop.llvm.17132674095431275852. This turned out to be problematic for use cases that require exact match, for example, livepatch. Fix this by making the APIs to match symbols exactly. Also cleanup kallsyms_selftests accordingly. Signed-off-by: Song Liu Fixes: 8cc32a9bbf29 ("kallsyms: strip LTO-only suffixes from promoted global functions") Tested-by: Masami Hiramatsu (Google) Reviewed-by: Masami Hiramatsu (Google) Acked-by: Petr Mladek Reviewed-by: Sami Tolvanen Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240807220513.3100483-3-song@kernel.org Signed-off-by: Kees Cook --- kernel/kallsyms.c | 55 +++++--------------------------------- kernel/kallsyms_selftest.c | 22 +-------------- 2 files changed, 7 insertions(+), 70 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fb2c77368d18..a9a0ca605d4a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -160,38 +160,6 @@ unsigned long kallsyms_sym_address(int idx) return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } -static void cleanup_symbol_name(char *s) -{ - char *res; - - if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return; - - /* - * LLVM appends various suffixes for local functions and variables that - * must be promoted to global scope as part of LTO. This can break - * hooking of static functions with kprobes. '.' is not a valid - * character in an identifier in C. Suffixes only in LLVM LTO observed: - * - foo.llvm.[0-9a-f]+ - */ - res = strstr(s, ".llvm."); - if (res) - *res = '\0'; - - return; -} - -static int compare_symbol_name(const char *name, char *namebuf) -{ - /* The kallsyms_seqs_of_names is sorted based on names after - * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled. - * To ensure correct bisection in kallsyms_lookup_names(), do - * cleanup_symbol_name(namebuf) before comparing name and namebuf. 
- */ - cleanup_symbol_name(namebuf); - return strcmp(name, namebuf); -} - static unsigned int get_symbol_seq(int index) { unsigned int i, seq = 0; @@ -219,7 +187,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(mid); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = compare_symbol_name(name, namebuf); + ret = strcmp(name, namebuf); if (ret > 0) low = mid + 1; else if (ret < 0) @@ -236,7 +204,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(low - 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - if (compare_symbol_name(name, namebuf)) + if (strcmp(name, namebuf)) break; low--; } @@ -248,7 +216,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(high + 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - if (compare_symbol_name(name, namebuf)) + if (strcmp(name, namebuf)) break; high++; } @@ -407,8 +375,7 @@ static int kallsyms_lookup_buildid(unsigned long addr, if (modbuildid) *modbuildid = NULL; - ret = strlen(namebuf); - goto found; + return strlen(namebuf); } /* See if it's in a module or a BPF JITed image. */ @@ -422,8 +389,6 @@ static int kallsyms_lookup_buildid(unsigned long addr, ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); -found: - cleanup_symbol_name(namebuf); return ret; } @@ -450,8 +415,6 @@ const char *kallsyms_lookup(unsigned long addr, int lookup_symbol_name(unsigned long addr, char *symname) { - int res; - symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; @@ -462,16 +425,10 @@ int lookup_symbol_name(unsigned long addr, char *symname) /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), symname, KSYM_NAME_LEN); - goto found; + return 0; } /* See if it's in a module. */ - res = lookup_module_symbol_name(addr, symname); - if (res) - return res; - -found: - cleanup_symbol_name(symname); - return 0; + return lookup_module_symbol_name(addr, symname); } /* Look up a kernel symbol and return it in a text buffer. */ diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index 2f84896a7bcb..873f7c445488 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -187,31 +187,11 @@ static void test_perf_kallsyms_lookup_name(void) stat.min, stat.max, div_u64(stat.sum, stat.real_cnt)); } -static bool match_cleanup_name(const char *s, const char *name) -{ - char *p; - int len; - - if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return false; - - p = strstr(s, ".llvm."); - if (!p) - return false; - - len = strlen(name); - if (p - s != len) - return false; - - return !strncmp(s, name, len); -} - static int find_symbol(void *data, const char *name, unsigned long addr) { struct test_stat *stat = (struct test_stat *)data; - if (strcmp(name, stat->name) == 0 || - (!stat->perf && match_cleanup_name(name, stat->name))) { + if (!strcmp(name, stat->name)) { stat->real_cnt++; stat->addr = addr; From f52403b6bfea6994b017c58367aee9acdb9d9fd4 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:49 -0700 Subject: [PATCH 0925/1052] selftests/bpf: Add traffic monitor functions. Add functions that capture packets and print log in the background. They are supposed to be used for debugging flaky network test cases. A monitored test case should call traffic_monitor_start() to start a thread to capture packets in the background for a given namespace and call traffic_monitor_stop() to stop capturing. 
(Or, option '-m' implemented by the later patches.) lo In IPv4 127.0.0.1:40265 > 127.0.0.1:55907: TCP, length 68, SYN lo In IPv4 127.0.0.1:55907 > 127.0.0.1:40265: TCP, length 60, SYN, ACK lo In IPv4 127.0.0.1:40265 > 127.0.0.1:55907: TCP, length 60, ACK lo In IPv4 127.0.0.1:55907 > 127.0.0.1:40265: TCP, length 52, ACK lo In IPv4 127.0.0.1:40265 > 127.0.0.1:55907: TCP, length 52, FIN, ACK lo In IPv4 127.0.0.1:55907 > 127.0.0.1:40265: TCP, length 52, RST, ACK Packet file: packets-2173-86-select_reuseport:sockhash_IPv4_TCP_LOOPBACK_test_detach_bpf-test.log #280/87 select_reuseport/sockhash IPv4/TCP LOOPBACK test_detach_bpf:OK The above is the output of an example. It shows the packets of a connection and the name of the file that contains captured packets in the directory /tmp/tmon_pcap. The file can be loaded by tcpdump or wireshark. This feature only works if libpcap is available. (Could be found by pkg-config) Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-2-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 4 + tools/testing/selftests/bpf/network_helpers.c | 454 ++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 18 + 3 files changed, 476 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 163b11014158..4eceb491a8ae 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -41,6 +41,10 @@ CFLAGS += -g $(OPT_FLAGS) -rdynamic \ LDFLAGS += $(SAN_LDFLAGS) LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread +LDLIBS += $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) +CFLAGS += $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null) +CFLAGS += $(shell $(PKG_CONFIG) --exists libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") + # The following tests perform type punning and they may break strict # aliasing rules, which are exploited by both GCC and clang by default # while optimizing. This can lead to broken programs. diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index a3f0a49fb26f..e0b0a69f066e 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -11,17 +11,31 @@ #include #include #include +#include #include +#include #include #include #include #include +#include +#include +#include +#include + #include "bpf_util.h" #include "network_helpers.h" #include "test_progs.h" +#ifdef TRAFFIC_MONITOR +/* Prevent pcap.h from including pcap/bpf.h and causing conflicts */ +#define PCAP_DONT_INCLUDE_PCAP_BPF_H 1 +#include +#include +#endif + #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 #endif @@ -660,3 +674,443 @@ int send_recv_data(int lfd, int fd, uint32_t total_bytes) return err; } + +#ifdef TRAFFIC_MONITOR +struct tmonitor_ctx { + pcap_t *pcap; + pcap_dumper_t *dumper; + pthread_t thread; + int wake_fd; + + volatile bool done; + char pkt_fname[PATH_MAX]; + int pcap_fd; +}; + +/* Is this packet captured with a Ethernet protocol type? */ +static bool is_ethernet(const u_char *packet) +{ + u16 arphdr_type; + + memcpy(&arphdr_type, packet + 8, 2); + arphdr_type = ntohs(arphdr_type); + + /* Except the following cases, the protocol type contains the + * Ethernet protocol type for the packet. 
+ * + * https://www.tcpdump.org/linktypes/LINKTYPE_LINUX_SLL2.html + */ + switch (arphdr_type) { + case 770: /* ARPHRD_FRAD */ + case 778: /* ARPHDR_IPGRE */ + case 803: /* ARPHRD_IEEE80211_RADIOTAP */ + printf("Packet captured: arphdr_type=%d\n", arphdr_type); + return false; + } + return true; +} + +static const char * const pkt_types[] = { + "In", + "B", /* Broadcast */ + "M", /* Multicast */ + "C", /* Captured with the promiscuous mode */ + "Out", +}; + +static const char *pkt_type_str(u16 pkt_type) +{ + if (pkt_type < ARRAY_SIZE(pkt_types)) + return pkt_types[pkt_type]; + return "Unknown"; +} + +/* Show the information of the transport layer in the packet */ +static void show_transport(const u_char *packet, u16 len, u32 ifindex, + const char *src_addr, const char *dst_addr, + u16 proto, bool ipv6, u8 pkt_type) +{ + char *ifname, _ifname[IF_NAMESIZE]; + const char *transport_str; + u16 src_port, dst_port; + struct udphdr *udp; + struct tcphdr *tcp; + + ifname = if_indextoname(ifindex, _ifname); + if (!ifname) { + snprintf(_ifname, sizeof(_ifname), "unknown(%d)", ifindex); + ifname = _ifname; + } + + if (proto == IPPROTO_UDP) { + udp = (struct udphdr *)packet; + src_port = ntohs(udp->source); + dst_port = ntohs(udp->dest); + transport_str = "UDP"; + } else if (proto == IPPROTO_TCP) { + tcp = (struct tcphdr *)packet; + src_port = ntohs(tcp->source); + dst_port = ntohs(tcp->dest); + transport_str = "TCP"; + } else if (proto == IPPROTO_ICMP) { + printf("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); + return; + } else if (proto == IPPROTO_ICMPV6) { + printf("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); + return; + } else { + printf("%-7s %-3s %s %s > %s: protocol %d\n", + ifname, pkt_type_str(pkt_type), ipv6 ? 
"IPv6" : "IPv4", + src_addr, dst_addr, proto); + return; + } + + /* TCP or UDP*/ + + flockfile(stdout); + if (ipv6) + printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len); + else + printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len); + + if (proto == IPPROTO_TCP) { + if (tcp->fin) + printf(", FIN"); + if (tcp->syn) + printf(", SYN"); + if (tcp->rst) + printf(", RST"); + if (tcp->ack) + printf(", ACK"); + } + + printf("\n"); + funlockfile(stdout); +} + +static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +{ + char src_buf[INET6_ADDRSTRLEN], dst_buf[INET6_ADDRSTRLEN]; + struct ipv6hdr *pkt = (struct ipv6hdr *)packet; + const char *src, *dst; + u_char proto; + + src = inet_ntop(AF_INET6, &pkt->saddr, src_buf, sizeof(src_buf)); + if (!src) + src = ""; + dst = inet_ntop(AF_INET6, &pkt->daddr, dst_buf, sizeof(dst_buf)); + if (!dst) + dst = ""; + proto = pkt->nexthdr; + show_transport(packet + sizeof(struct ipv6hdr), + ntohs(pkt->payload_len), + ifindex, src, dst, proto, true, pkt_type); +} + +static void show_ipv4_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +{ + char src_buf[INET_ADDRSTRLEN], dst_buf[INET_ADDRSTRLEN]; + struct iphdr *pkt = (struct iphdr *)packet; + const char *src, *dst; + u_char proto; + + src = inet_ntop(AF_INET, &pkt->saddr, src_buf, sizeof(src_buf)); + if (!src) + src = ""; + dst = inet_ntop(AF_INET, &pkt->daddr, dst_buf, sizeof(dst_buf)); + if (!dst) + dst = ""; + proto = pkt->protocol; + show_transport(packet + sizeof(struct iphdr), + ntohs(pkt->tot_len), + ifindex, src, dst, proto, false, pkt_type); +} + +static void *traffic_monitor_thread(void *arg) +{ + char *ifname, _ifname[IF_NAMESIZE]; + const u_char *packet, *payload; + struct tmonitor_ctx *ctx = arg; + pcap_dumper_t *dumper = ctx->dumper; + int fd = ctx->pcap_fd, nfds, r; + int wake_fd = ctx->wake_fd; + struct pcap_pkthdr header; + pcap_t *pcap = ctx->pcap; + u32 ifindex; + fd_set fds; + u16 proto; + u8 ptype; + + nfds = (fd > wake_fd ? fd : wake_fd) + 1; + FD_ZERO(&fds); + + while (!ctx->done) { + FD_SET(fd, &fds); + FD_SET(wake_fd, &fds); + r = select(nfds, &fds, NULL, NULL, NULL); + if (!r) + continue; + if (r < 0) { + if (errno == EINTR) + continue; + log_err("Fail to select on pcap fd and wake fd"); + break; + } + + /* This instance of pcap is non-blocking */ + packet = pcap_next(pcap, &header); + if (!packet) + continue; + + /* According to the man page of pcap_dump(), first argument + * is the pcap_dumper_t pointer even it's argument type is + * u_char *. + */ + pcap_dump((u_char *)dumper, &header, packet); + + /* Not sure what other types of packets look like. Here, we + * parse only Ethernet and compatible packets. + */ + if (!is_ethernet(packet)) + continue; + + /* Skip SLL2 header + * https://www.tcpdump.org/linktypes/LINKTYPE_LINUX_SLL2.html + * + * Although the document doesn't mention that, the payload + * doesn't include the Ethernet header. The payload starts + * from the first byte of the network layer header. 
+ */ + payload = packet + 20; + + memcpy(&proto, packet, 2); + proto = ntohs(proto); + memcpy(&ifindex, packet + 4, 4); + ifindex = ntohl(ifindex); + ptype = packet[10]; + + if (proto == ETH_P_IPV6) { + show_ipv6_packet(payload, ifindex, ptype); + } else if (proto == ETH_P_IP) { + show_ipv4_packet(payload, ifindex, ptype); + } else { + ifname = if_indextoname(ifindex, _ifname); + if (!ifname) { + snprintf(_ifname, sizeof(_ifname), "unknown(%d)", ifindex); + ifname = _ifname; + } + + printf("%-7s %-3s Unknown network protocol type 0x%x\n", + ifname, pkt_type_str(ptype), proto); + } + } + + return NULL; +} + +/* Prepare the pcap handle to capture packets. + * + * This pcap is non-blocking and immediate mode is enabled to receive + * captured packets as soon as possible. The snaplen is set to 1024 bytes + * to limit the size of captured content. The format of the link-layer + * header is set to DLT_LINUX_SLL2 to enable handling various link-layer + * technologies. + */ +static pcap_t *traffic_monitor_prepare_pcap(void) +{ + char errbuf[PCAP_ERRBUF_SIZE]; + pcap_t *pcap; + int r; + + /* Listen on all NICs in the namespace */ + pcap = pcap_create("any", errbuf); + if (!pcap) { + log_err("Failed to open pcap: %s", errbuf); + return NULL; + } + /* Limit the size of the packet (first N bytes) */ + r = pcap_set_snaplen(pcap, 1024); + if (r) { + log_err("Failed to set snaplen: %s", pcap_geterr(pcap)); + goto error; + } + /* To receive packets as fast as possible */ + r = pcap_set_immediate_mode(pcap, 1); + if (r) { + log_err("Failed to set immediate mode: %s", pcap_geterr(pcap)); + goto error; + } + r = pcap_setnonblock(pcap, 1, errbuf); + if (r) { + log_err("Failed to set nonblock: %s", errbuf); + goto error; + } + r = pcap_activate(pcap); + if (r) { + log_err("Failed to activate pcap: %s", pcap_geterr(pcap)); + goto error; + } + /* Determine the format of the link-layer header */ + r = pcap_set_datalink(pcap, DLT_LINUX_SLL2); + if (r) { + log_err("Failed to set datalink: %s", pcap_geterr(pcap)); + goto error; + } + + return pcap; +error: + pcap_close(pcap); + return NULL; +} + +static void encode_test_name(char *buf, size_t len, const char *test_name, const char *subtest_name) +{ + char *p; + + if (subtest_name) + snprintf(buf, len, "%s__%s", test_name, subtest_name); + else + snprintf(buf, len, "%s", test_name); + while ((p = strchr(buf, '/'))) + *p = '_'; + while ((p = strchr(buf, ' '))) + *p = '_'; +} + +#define PCAP_DIR "/tmp/tmon_pcap" + +/* Start to monitor the network traffic in the given network namespace. + * + * netns: the name of the network namespace to monitor. If NULL, the + * current network namespace is monitored. + * test_name: the name of the running test. + * subtest_name: the name of the running subtest if there is. It should be + * NULL if it is not a subtest. + * + * This function will start a thread to capture packets going through NICs + * in the give network namespace. 
+ */ +struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name) +{ + struct nstoken *nstoken = NULL; + struct tmonitor_ctx *ctx; + char test_name_buf[64]; + static int tmon_seq; + int r; + + if (netns) { + nstoken = open_netns(netns); + if (!nstoken) + return NULL; + } + ctx = malloc(sizeof(*ctx)); + if (!ctx) { + log_err("Failed to malloc ctx"); + goto fail_ctx; + } + memset(ctx, 0, sizeof(*ctx)); + + encode_test_name(test_name_buf, sizeof(test_name_buf), test_name, subtest_name); + snprintf(ctx->pkt_fname, sizeof(ctx->pkt_fname), + PCAP_DIR "/packets-%d-%d-%s-%s.log", getpid(), tmon_seq++, + test_name_buf, netns ? netns : "unknown"); + + r = mkdir(PCAP_DIR, 0755); + if (r && errno != EEXIST) { + log_err("Failed to create " PCAP_DIR); + goto fail_pcap; + } + + ctx->pcap = traffic_monitor_prepare_pcap(); + if (!ctx->pcap) + goto fail_pcap; + ctx->pcap_fd = pcap_get_selectable_fd(ctx->pcap); + if (ctx->pcap_fd < 0) { + log_err("Failed to get pcap fd"); + goto fail_dumper; + } + + /* Create a packet file */ + ctx->dumper = pcap_dump_open(ctx->pcap, ctx->pkt_fname); + if (!ctx->dumper) { + log_err("Failed to open pcap dump: %s", ctx->pkt_fname); + goto fail_dumper; + } + + /* Create an eventfd to wake up the monitor thread */ + ctx->wake_fd = eventfd(0, 0); + if (ctx->wake_fd < 0) { + log_err("Failed to create eventfd"); + goto fail_eventfd; + } + + r = pthread_create(&ctx->thread, NULL, traffic_monitor_thread, ctx); + if (r) { + log_err("Failed to create thread"); + goto fail; + } + + close_netns(nstoken); + + return ctx; + +fail: + close(ctx->wake_fd); + +fail_eventfd: + pcap_dump_close(ctx->dumper); + unlink(ctx->pkt_fname); + +fail_dumper: + pcap_close(ctx->pcap); + +fail_pcap: + free(ctx); + +fail_ctx: + close_netns(nstoken); + + return NULL; +} + +static void traffic_monitor_release(struct tmonitor_ctx *ctx) +{ + pcap_close(ctx->pcap); + pcap_dump_close(ctx->dumper); + + close(ctx->wake_fd); + + free(ctx); +} + +/* Stop the network traffic monitor. + * + * ctx: the context returned by traffic_monitor_start() + */ +void traffic_monitor_stop(struct tmonitor_ctx *ctx) +{ + __u64 w = 1; + + if (!ctx) + return; + + /* Stop the monitor thread */ + ctx->done = true; + /* Wake up the background thread. */ + write(ctx->wake_fd, &w, sizeof(w)); + pthread_join(ctx->thread, NULL); + + printf("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); + + traffic_monitor_release(ctx); +} +#endif /* TRAFFIC_MONITOR */ diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index cce56955371f..0d032ae706c6 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -136,4 +136,22 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, return csum_fold((__u32)s); } +struct tmonitor_ctx; + +#ifdef TRAFFIC_MONITOR +struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name); +void traffic_monitor_stop(struct tmonitor_ctx *ctx); +#else +static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name) +{ + return NULL; +} + +static inline void traffic_monitor_stop(struct tmonitor_ctx *ctx) +{ +} +#endif + #endif From f5281aacec856e7e0beb643d74122595fc1fb4be Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:50 -0700 Subject: [PATCH 0926/1052] selftests/bpf: Add the traffic monitor option to test_progs. 
Add option '-m' to test_progs to accept names and patterns of test cases. This option will be used later to enable traffic monitor that capture network packets generated by test cases. Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-3-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_progs.c | 92 +++++++++++++++++------- tools/testing/selftests/bpf/test_progs.h | 2 + 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 60fafa2f1ed7..f8ed1a16a884 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -155,6 +155,7 @@ struct prog_test_def { void (*run_serial_test)(void); bool should_run; bool need_cgroup_cleanup; + bool should_tmon; }; /* Override C runtime library's usleep() implementation to ensure nanosleep() @@ -192,39 +193,39 @@ static bool should_run(struct test_selector *sel, int num, const char *name) return num < sel->num_set_len && sel->num_set[num]; } +static bool match_subtest(struct test_filter_set *filter, + const char *test_name, + const char *subtest_name) +{ + int i, j; + + for (i = 0; i < filter->cnt; i++) { + if (glob_match(test_name, filter->tests[i].name)) { + if (!filter->tests[i].subtest_cnt) + return true; + + for (j = 0; j < filter->tests[i].subtest_cnt; j++) { + if (glob_match(subtest_name, + filter->tests[i].subtests[j])) + return true; + } + } + } + + return false; +} + static bool should_run_subtest(struct test_selector *sel, struct test_selector *subtest_sel, int subtest_num, const char *test_name, const char *subtest_name) { - int i, j; + if (match_subtest(&sel->blacklist, test_name, subtest_name)) + return false; - for (i = 0; i < sel->blacklist.cnt; i++) { - if (glob_match(test_name, sel->blacklist.tests[i].name)) { - if (!sel->blacklist.tests[i].subtest_cnt) - return false; - - for (j = 0; j < sel->blacklist.tests[i].subtest_cnt; j++) { - if (glob_match(subtest_name, - sel->blacklist.tests[i].subtests[j])) - return false; - } - } - } - - for (i = 0; i < sel->whitelist.cnt; i++) { - if (glob_match(test_name, sel->whitelist.tests[i].name)) { - if (!sel->whitelist.tests[i].subtest_cnt) - return true; - - for (j = 0; j < sel->whitelist.tests[i].subtest_cnt; j++) { - if (glob_match(subtest_name, - sel->whitelist.tests[i].subtests[j])) - return true; - } - } - } + if (match_subtest(&sel->whitelist, test_name, subtest_name)) + return true; if (!sel->whitelist.cnt && !subtest_sel->num_set) return true; @@ -232,6 +233,19 @@ static bool should_run_subtest(struct test_selector *sel, return subtest_num < subtest_sel->num_set_len && subtest_sel->num_set[subtest_num]; } +static bool should_tmon(struct test_selector *sel, const char *name) +{ + int i; + + for (i = 0; i < sel->whitelist.cnt; i++) { + if (glob_match(name, sel->whitelist.tests[i].name) && + !sel->whitelist.tests[i].subtest_cnt) + return true; + } + + return false; +} + static char *test_result(bool failed, bool skipped) { return failed ? "FAIL" : (skipped ? 
"SKIP" : "OK"); @@ -488,6 +502,10 @@ bool test__start_subtest(const char *subtest_name) return false; } + subtest_state->should_tmon = match_subtest(&env.tmon_selector.whitelist, + test->test_name, + subtest_name); + env.subtest_state = subtest_state; stdio_hijack_init(&subtest_state->log_buf, &subtest_state->log_cnt); @@ -667,7 +685,8 @@ enum ARG_KEYS { ARG_TEST_NAME_GLOB_DENYLIST = 'd', ARG_NUM_WORKERS = 'j', ARG_DEBUG = -1, - ARG_JSON_SUMMARY = 'J' + ARG_JSON_SUMMARY = 'J', + ARG_TRAFFIC_MONITOR = 'm', }; static const struct argp_option opts[] = { @@ -694,6 +713,10 @@ static const struct argp_option opts[] = { { "debug", ARG_DEBUG, NULL, 0, "print extra debug information for test_progs." }, { "json-summary", ARG_JSON_SUMMARY, "FILE", 0, "Write report in json format to this file."}, +#ifdef TRAFFIC_MONITOR + { "traffic-monitor", ARG_TRAFFIC_MONITOR, "NAMES", 0, + "Monitor network traffic of tests with name matching the pattern (supports '*' wildcard)." }, +#endif {}, }; @@ -905,6 +928,18 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) break; case ARGP_KEY_END: break; +#ifdef TRAFFIC_MONITOR + case ARG_TRAFFIC_MONITOR: + if (arg[0] == '@') + err = parse_test_list_file(arg + 1, + &env->tmon_selector.whitelist, + true); + else + err = parse_test_list(arg, + &env->tmon_selector.whitelist, + true); + break; +#endif default: return ARGP_ERR_UNKNOWN; } @@ -1736,6 +1771,8 @@ int main(int argc, char **argv) test->test_num, test->test_name, test->test_name, test->test_name); exit(EXIT_ERR_SETUP_INFRA); } + if (test->should_run) + test->should_tmon = should_tmon(&env.tmon_selector, test->test_name); } /* ignore workers if we are just listing */ @@ -1820,6 +1857,7 @@ int main(int argc, char **argv) free_test_selector(&env.test_selector); free_test_selector(&env.subtest_selector); + free_test_selector(&env.tmon_selector); free_test_states(); if (env.succ_cnt + env.fail_cnt + env.skip_cnt == 0) diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index cb9d6d46826b..966011eb7ec8 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -74,6 +74,7 @@ struct subtest_state { int error_cnt; bool skipped; bool filtered; + bool should_tmon; FILE *stdout_saved; }; @@ -98,6 +99,7 @@ struct test_state { struct test_env { struct test_selector test_selector; struct test_selector subtest_selector; + struct test_selector tmon_selector; bool verifier_stats; bool debug; enum verbosity verbosity; From c0247d289e73e18f6ddb0895de30c09770fbed95 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 13 Aug 2024 12:53:15 +0200 Subject: [PATCH 0927/1052] btrfs: send: annotate struct name_cache_entry with __counted_by() Add the __counted_by compiler attribute to the flexible array member name to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Signed-off-by: Thorsten Blum Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7fc692fc76e1..619fa0b8b3f6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -347,7 +347,7 @@ struct name_cache_entry { int ret; int need_later_update; int name_len; - char name[]; + char name[] __counted_by(name_len); }; /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. 
 */

From 3bc2ac2f8f0b78a13140fc72022771efe0c9b778 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Mon, 12 Aug 2024 12:30:52 -0400
Subject: [PATCH 0928/1052] btrfs: update target inode's ctime on unlink

Unlink changes the link count on the target inode. POSIX mandates that
the ctime must also change when this occurs.

According to https://pubs.opengroup.org/onlinepubs/9699919799/functions/unlink.html:

"Upon successful completion, unlink() shall mark for update the last data
modification and last file status change timestamps of the parent
directory. Also, if the file's link count is not 0, the last file status
change timestamp of the file shall be marked for update."

Signed-off-by: Jeff Layton
Reviewed-by: David Sterba
[ add link to the opengroup docs ]
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 333b0e8587a2..b1b6564ab68f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4195,6 +4195,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
 	inode_inc_iversion(&inode->vfs_inode);
+	inode_set_ctime_current(&inode->vfs_inode);
 	inode_inc_iversion(&dir->vfs_inode);
 	inode_set_mtime_to_ts(&dir->vfs_inode,
 			      inode_set_ctime_current(&dir->vfs_inode));
 	ret = btrfs_update_inode(trans, dir);

From 008e2512dc5696ab2dc5bf264e98a9fe9ceb830e Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Sun, 11 Aug 2024 15:00:22 +0930
Subject: [PATCH 0929/1052] btrfs: tree-checker: add dev extent item checks

[REPORT]
There is a corruption report that btrfs refused to mount a fs that has
overlapping dev extents:

  BTRFS error (device sdc): dev extent devid 4 physical offset 14263979671552 overlap with previous dev extent end 14263980982272
  BTRFS error (device sdc): failed to verify dev extents against chunks: -117
  BTRFS error (device sdc): open_ctree failed

[CAUSE]
The direct cause is obvious: there is a bad dev extent item with an
incorrect length.

With btrfs check reporting two overlapping extents, the second one shows
some clue on the cause:

  ERROR: dev extent devid 4 offset 14263979671552 len 6488064 overlap with previous dev extent end 14263980982272
  ERROR: dev extent devid 13 offset 2257707008000 len 6488064 overlap with previous dev extent end 2257707270144
  ERROR: errors found in extent allocation tree or chunk allocation

The second one looks like a bitflip happened during new chunk allocation:

  hex(2257707008000) = 0x20da9d30000
  hex(2257707270144) = 0x20da9d70000
  diff               = 0x00000040000

So it looks like a bitflip happened during new dev extent allocation,
resulting in the second overlap.

Currently we only do the dev-extent verification at mount time, but if
the corruption is caused by a memory bitflip, we really want to catch it
before the corruption is written to storage.

Furthermore the dev extent item has the following key definition:

  (<device id> DEV_EXTENT <physical offset>)

Thus we can not just rely on the generic key order check to make sure
there is no overlapping.
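As a quick sanity check of the arithmetic above, the two offsets from the
btrfs check output differ in exactly one bit; the following tiny standalone
program (illustration only, not part of the patch) confirms it:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Offsets taken from the btrfs check report quoted above. */
	uint64_t good = 2257707270144ULL;	/* 0x20da9d70000 */
	uint64_t bad  = 2257707008000ULL;	/* 0x20da9d30000 */
	uint64_t diff = good ^ bad;

	/* Prints diff=0x40000 popcount=1, i.e. a single flipped bit. */
	printf("diff=0x%llx popcount=%d\n",
	       (unsigned long long)diff, __builtin_popcountll(diff));
	return 0;
}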
[ENHANCEMENT] Introduce dedicated dev extent checks, including: - Fixed member checks * chunk_tree should always be BTRFS_CHUNK_TREE_OBJECTID (3) * chunk_objectid should always be BTRFS_FIRST_CHUNK_CHUNK_TREE_OBJECTID (256) - Alignment checks * chunk_offset should be aligned to sectorsize * length should be aligned to sectorsize * key.offset should be aligned to sectorsize - Overlap checks If the previous key is also a dev-extent item, with the same device id, make sure we do not overlap with the previous dev extent. Reported: Stefan N Link: https://lore.kernel.org/linux-btrfs/CA+W5K0rSO3koYTo=nzxxTm1-Pdu1HYgVxEpgJ=aGc7d=E8mGEg@mail.gmail.com/ CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 69 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 6f1e2f2215d9..634d69964fe4 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1764,6 +1764,72 @@ static int check_raid_stripe_extent(const struct extent_buffer *leaf, return 0; } +static int check_dev_extent_item(const struct extent_buffer *leaf, + const struct btrfs_key *key, + int slot, + struct btrfs_key *prev_key) +{ + struct btrfs_dev_extent *de; + const u32 sectorsize = leaf->fs_info->sectorsize; + + de = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + /* Basic fixed member checks. */ + if (unlikely(btrfs_dev_extent_chunk_tree(leaf, de) != + BTRFS_CHUNK_TREE_OBJECTID)) { + generic_err(leaf, slot, + "invalid dev extent chunk tree id, has %llu expect %llu", + btrfs_dev_extent_chunk_tree(leaf, de), + BTRFS_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + if (unlikely(btrfs_dev_extent_chunk_objectid(leaf, de) != + BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { + generic_err(leaf, slot, + "invalid dev extent chunk objectid, has %llu expect %llu", + btrfs_dev_extent_chunk_objectid(leaf, de), + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + /* Alignment check. */ + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent key.offset, has %llu not aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(btrfs_dev_extent_chunk_offset(leaf, de), + sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent chunk offset, has %llu not aligned to %u", + btrfs_dev_extent_chunk_objectid(leaf, de), + sectorsize); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(btrfs_dev_extent_length(leaf, de), + sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent length, has %llu not aligned to %u", + btrfs_dev_extent_length(leaf, de), sectorsize); + return -EUCLEAN; + } + /* Overlap check with previous dev extent. */ + if (slot && prev_key->objectid == key->objectid && + prev_key->type == key->type) { + struct btrfs_dev_extent *prev_de; + u64 prev_len; + + prev_de = btrfs_item_ptr(leaf, slot - 1, struct btrfs_dev_extent); + prev_len = btrfs_dev_extent_length(leaf, prev_de); + if (unlikely(prev_key->offset + prev_len > key->offset)) { + generic_err(leaf, slot, + "dev extent overlap, prev offset %llu len %llu current offset %llu", + prev_key->objectid, prev_len, key->offset); + return -EUCLEAN; + } + } + return 0; +} + /* * Common point to switch the item-specific validation. 
*/ @@ -1800,6 +1866,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(leaf, key, slot); break; + case BTRFS_DEV_EXTENT_KEY: + ret = check_dev_extent_item(leaf, key, slot, prev_key); + break; case BTRFS_INODE_ITEM_KEY: ret = check_inode_item(leaf, key, slot); break; From e30729d4bd4001881be4d1ad4332a5d4985398f8 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 1 Aug 2024 16:47:52 +0900 Subject: [PATCH 0930/1052] btrfs: zoned: properly take lock to read/update block group's zoned variables __btrfs_add_free_space_zoned() references and modifies bg's alloc_offset, ro, and zone_unusable, but without taking the lock. It is mostly safe because they monotonically increase (at least for now) and this function is mostly called by a transaction commit, which is serialized by itself. Still, taking the lock is a safer and correct option and I'm going to add a change to reset zone_unusable while a block group is still alive. So, add locking around the operations. Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f5996a43db24..eaa1dbd31352 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2697,15 +2697,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; int bg_reclaim_threshold = 0; - bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); + bool initial; u64 reclaimable_unusable; - WARN_ON(!initial && offset + size > block_group->zone_capacity); + spin_lock(&block_group->lock); + initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); + WARN_ON(!initial && offset + size > block_group->zone_capacity); if (!initial) bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); - spin_lock(&ctl->tree_lock); if (!used) to_free = size; else if (initial) @@ -2718,7 +2719,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, to_free = offset + size - block_group->alloc_offset; to_unusable = size - to_free; + spin_lock(&ctl->tree_lock); ctl->free_space += to_free; + spin_unlock(&ctl->tree_lock); /* * If the block group is read-only, we should account freed space into * bytes_readonly. 
@@ -2727,11 +2730,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 		block_group->zone_unusable += to_unusable;
 		WARN_ON(block_group->zone_unusable > block_group->length);
 	}
-	spin_unlock(&ctl->tree_lock);
 	if (!used) {
-		spin_lock(&block_group->lock);
 		block_group->alloc_offset -= size;
-		spin_unlock(&block_group->lock);
 	}
 
 	reclaimable_unusable = block_group->zone_unusable -
@@ -2745,6 +2745,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 		btrfs_mark_bg_to_reclaim(block_group);
 	}
 
+	spin_unlock(&block_group->lock);
+
 	return 0;
 }

From 6486cad00a8b7f8585983408c152bbe33dda529b Mon Sep 17 00:00:00 2001
From: David Gstir
Date: Wed, 17 Jul 2024 13:28:44 +0200
Subject: [PATCH 0931/1052] KEYS: trusted: fix DCP blob payload length assignment

The DCP trusted key type uses the wrong helper function to store the
blob's payload length, which can lead to the wrong byte order being used
if this code ever runs on a big endian architecture.

Fix this by using the correct helper function.

Cc: stable@vger.kernel.org # v6.10+
Fixes: 2e8a0f40a39c ("KEYS: trusted: Introduce NXP DCP-backed trusted keys")
Suggested-by: Richard Weinberger
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202405240610.fj53EK0q-lkp@intel.com/
Signed-off-by: David Gstir
Reviewed-by: Jarkko Sakkinen
Signed-off-by: Jarkko Sakkinen
---
 security/keys/trusted-keys/trusted_dcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/keys/trusted-keys/trusted_dcp.c b/security/keys/trusted-keys/trusted_dcp.c
index b5f81a05be36..b0947f072a98 100644
--- a/security/keys/trusted-keys/trusted_dcp.c
+++ b/security/keys/trusted-keys/trusted_dcp.c
@@ -222,7 +222,7 @@ static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob)
 		return ret;
 	}
 
-	b->payload_len = get_unaligned_le32(&p->key_len);
+	put_unaligned_le32(p->key_len, &b->payload_len);
 	p->blob_len = blen;
 	return 0;
 }

From 0e28bf61a5f9ab30be3f3b4eafb8d097e39446bb Mon Sep 17 00:00:00 2001
From: David Gstir
Date: Wed, 17 Jul 2024 13:28:45 +0200
Subject: [PATCH 0932/1052] KEYS: trusted: dcp: fix leak of blob encryption key

Trusted keys unseal the key blob on load, but keep the sealed payload in
the blob field so that every subsequent read (export) will simply
convert this field to hex and send it to userspace.

With DCP-based trusted keys, we decrypt the blob encryption key (BEK) in
the kernel due to hardware limitations and then decrypt the blob
payload. BEK decryption is done in-place, which means that the trusted
key blob field is modified and consequently holds the BEK in plain text.
Every subsequent read of that key thus sends the plain text BEK instead
of the encrypted BEK to userspace.

This issue only occurs when importing a trusted DCP-based key and then
exporting it again. This should rarely happen as the common use cases
are to either create a new trusted key and export it, or import a key
blob and then just use it without exporting it again.

Fix this by performing BEK decryption and encryption in a dedicated
buffer. Further, always wipe the plain text BEK buffer to prevent
leaking the key via uninitialized memory.
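The wipe step is load-bearing: a plain memset() on a local buffer that is
about to go out of scope can be elided by the compiler as a dead store,
which is why memzero_explicit() is used. The following is only a minimal
sketch of the pattern, with a hypothetical example_unseal() reusing helper
names that appear in the diff below; it is not the actual trusted_dcp code:

/* Sketch only: decrypt into a dedicated stack buffer instead of in-place,
 * and wipe that buffer on every exit path so the plain-text key cannot
 * leak through the exported blob or stale stack memory.
 */
static int example_unseal(struct dcp_blob_fmt *b, u8 *out, size_t out_len)
{
	u8 plain_bek[AES_KEYSIZE_128];	/* never aliases b->blob_key */
	int ret;

	ret = decrypt_blob_key(b->blob_key, plain_bek);
	if (ret)
		goto out;

	ret = do_aead_crypto(b->payload, out, out_len, plain_bek, b->nonce, false);
out:
	memzero_explicit(plain_bek, sizeof(plain_bek));	/* not optimized away */
	return ret;
}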
Cc: stable@vger.kernel.org # v6.10+ Fixes: 2e8a0f40a39c ("KEYS: trusted: Introduce NXP DCP-backed trusted keys") Signed-off-by: David Gstir Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- security/keys/trusted-keys/trusted_dcp.c | 33 +++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/security/keys/trusted-keys/trusted_dcp.c b/security/keys/trusted-keys/trusted_dcp.c index b0947f072a98..4edc5bbbcda3 100644 --- a/security/keys/trusted-keys/trusted_dcp.c +++ b/security/keys/trusted-keys/trusted_dcp.c @@ -186,20 +186,21 @@ static int do_aead_crypto(u8 *in, u8 *out, size_t len, u8 *key, u8 *nonce, return ret; } -static int decrypt_blob_key(u8 *key) +static int decrypt_blob_key(u8 *encrypted_key, u8 *plain_key) { - return do_dcp_crypto(key, key, false); + return do_dcp_crypto(encrypted_key, plain_key, false); } -static int encrypt_blob_key(u8 *key) +static int encrypt_blob_key(u8 *plain_key, u8 *encrypted_key) { - return do_dcp_crypto(key, key, true); + return do_dcp_crypto(plain_key, encrypted_key, true); } static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob) { struct dcp_blob_fmt *b = (struct dcp_blob_fmt *)p->blob; int blen, ret; + u8 plain_blob_key[AES_KEYSIZE_128]; blen = calc_blob_len(p->key_len); if (blen > MAX_BLOB_SIZE) @@ -207,30 +208,36 @@ static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob) b->fmt_version = DCP_BLOB_VERSION; get_random_bytes(b->nonce, AES_KEYSIZE_128); - get_random_bytes(b->blob_key, AES_KEYSIZE_128); + get_random_bytes(plain_blob_key, AES_KEYSIZE_128); - ret = do_aead_crypto(p->key, b->payload, p->key_len, b->blob_key, + ret = do_aead_crypto(p->key, b->payload, p->key_len, plain_blob_key, b->nonce, true); if (ret) { pr_err("Unable to encrypt blob payload: %i\n", ret); - return ret; + goto out; } - ret = encrypt_blob_key(b->blob_key); + ret = encrypt_blob_key(plain_blob_key, b->blob_key); if (ret) { pr_err("Unable to encrypt blob key: %i\n", ret); - return ret; + goto out; } put_unaligned_le32(p->key_len, &b->payload_len); p->blob_len = blen; - return 0; + ret = 0; + +out: + memzero_explicit(plain_blob_key, sizeof(plain_blob_key)); + + return ret; } static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob) { struct dcp_blob_fmt *b = (struct dcp_blob_fmt *)p->blob; int blen, ret; + u8 plain_blob_key[AES_KEYSIZE_128]; if (b->fmt_version != DCP_BLOB_VERSION) { pr_err("DCP blob has bad version: %i, expected %i\n", @@ -248,14 +255,14 @@ static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob) goto out; } - ret = decrypt_blob_key(b->blob_key); + ret = decrypt_blob_key(b->blob_key, plain_blob_key); if (ret) { pr_err("Unable to decrypt blob key: %i\n", ret); goto out; } ret = do_aead_crypto(b->payload, p->key, p->key_len + DCP_BLOB_AUTHLEN, - b->blob_key, b->nonce, false); + plain_blob_key, b->nonce, false); if (ret) { pr_err("Unwrap of DCP payload failed: %i\n", ret); goto out; @@ -263,6 +270,8 @@ static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob) ret = 0; out: + memzero_explicit(plain_blob_key, sizeof(plain_blob_key)); + return ret; } From 1e115a58be0ffca63727dc0495dae924a19f8cd4 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:51 -0700 Subject: [PATCH 0933/1052] selftests/bpf: netns_new() and netns_free() helpers. netns_new()/netns_free() create/delete network namespaces. 
They support the option '-m' of test_progs to start/stop traffic monitor for the network namespace being created for matched tests. Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-4-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 46 ++++++++++ tools/testing/selftests/bpf/network_helpers.h | 2 + tools/testing/selftests/bpf/test_progs.c | 88 +++++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 4 + 4 files changed, 140 insertions(+) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index e0b0a69f066e..27784946b01b 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -446,6 +446,52 @@ char *ping_command(int family) return "ping"; } +int remove_netns(const char *name) +{ + char *cmd; + int r; + + r = asprintf(&cmd, "ip netns del %s >/dev/null 2>&1", name); + if (r < 0) { + log_err("Failed to malloc cmd"); + return -1; + } + + r = system(cmd); + free(cmd); + return r; +} + +int make_netns(const char *name) +{ + char *cmd; + int r; + + r = asprintf(&cmd, "ip netns add %s", name); + if (r < 0) { + log_err("Failed to malloc cmd"); + return -1; + } + + r = system(cmd); + free(cmd); + + if (r) + return r; + + r = asprintf(&cmd, "ip -n %s link set lo up", name); + if (r < 0) { + log_err("Failed to malloc cmd for setting up lo"); + remove_netns(name); + return -1; + } + + r = system(cmd); + free(cmd); + + return r; +} + struct nstoken { int orig_netns_fd; }; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 0d032ae706c6..c72c16e1aff8 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -93,6 +93,8 @@ struct nstoken; struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); int send_recv_data(int lfd, int fd, uint32_t total_bytes); +int make_netns(const char *name); +int remove_netns(const char *name); static __u16 csum_fold(__u32 csum) { diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index f8ed1a16a884..f45b06791444 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -18,6 +18,8 @@ #include #include "json_writer.h" +#include "network_helpers.h" + #ifdef __GLIBC__ #include /* backtrace */ #endif @@ -642,6 +644,92 @@ int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len) return err; } +struct netns_obj { + char *nsname; + struct tmonitor_ctx *tmon; + struct nstoken *nstoken; +}; + +/* Create a new network namespace with the given name. + * + * Create a new network namespace and set the network namespace of the + * current process to the new network namespace if the argument "open" is + * true. This function should be paired with netns_free() to release the + * resource and delete the network namespace. + * + * It also implements the functionality of the option "-m" by starting + * traffic monitor on the background to capture the packets in this network + * namespace if the current test or subtest matching the pattern. + * + * nsname: the name of the network namespace to create. + * open: open the network namespace if true. + * + * Return: the network namespace object on success, NULL on failure. 
+ */ +struct netns_obj *netns_new(const char *nsname, bool open) +{ + struct netns_obj *netns_obj = malloc(sizeof(*netns_obj)); + const char *test_name, *subtest_name; + int r; + + if (!netns_obj) + return NULL; + memset(netns_obj, 0, sizeof(*netns_obj)); + + netns_obj->nsname = strdup(nsname); + if (!netns_obj->nsname) + goto fail; + + /* Create the network namespace */ + r = make_netns(nsname); + if (r) + goto fail; + + /* Start traffic monitor */ + if (env.test->should_tmon || + (env.subtest_state && env.subtest_state->should_tmon)) { + test_name = env.test->test_name; + subtest_name = env.subtest_state ? env.subtest_state->name : NULL; + netns_obj->tmon = traffic_monitor_start(nsname, test_name, subtest_name); + if (!netns_obj->tmon) { + fprintf(stderr, "Failed to start traffic monitor for %s\n", nsname); + goto fail; + } + } else { + netns_obj->tmon = NULL; + } + + if (open) { + netns_obj->nstoken = open_netns(nsname); + if (!netns_obj->nstoken) + goto fail; + } + + return netns_obj; +fail: + traffic_monitor_stop(netns_obj->tmon); + remove_netns(nsname); + free(netns_obj->nsname); + free(netns_obj); + return NULL; +} + +/* Delete the network namespace. + * + * This function should be paired with netns_new() to delete the namespace + * created by netns_new(). + */ +void netns_free(struct netns_obj *netns_obj) +{ + if (!netns_obj) + return; + traffic_monitor_stop(netns_obj->tmon); + close_netns(netns_obj->nstoken); + remove_netns(netns_obj->nsname); + free(netns_obj->nsname); + free(netns_obj); +} + /* extern declarations for test funcs */ #define DEFINE_TEST(name) \ extern void test_##name(void) __weak; \ diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 966011eb7ec8..3ad131de14c6 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -430,6 +430,10 @@ int write_sysctl(const char *sysctl, const char *value); int get_bpf_max_tramp_links_from(struct btf *btf); int get_bpf_max_tramp_links(void); +struct netns_obj; +struct netns_obj *netns_new(const char *name, bool open); +void netns_free(struct netns_obj *netns); + #ifdef __x86_64__ #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" #elif defined(__s390x__) From 52a5b8a30fa882c4d363f7679b7db4a093550ac1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:52 -0700 Subject: [PATCH 0934/1052] selftests/bpf: Monitor traffic for tc_redirect. Enable traffic monitoring for the test case tc_redirect. 
Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-5-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/tc_redirect.c | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 53b8ffc943dc..974f9d6269c9 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -68,6 +68,7 @@ __FILE__, __LINE__, strerror(errno), ##__VA_ARGS__) static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL}; +static struct netns_obj *netns_objs[3]; static int write_file(const char *path, const char *newval) { @@ -87,27 +88,41 @@ static int write_file(const char *path, const char *newval) static int netns_setup_namespaces(const char *verb) { + struct netns_obj **ns_obj = netns_objs; const char * const *ns = namespaces; - char cmd[128]; while (*ns) { - snprintf(cmd, sizeof(cmd), "ip netns %s %s", verb, *ns); - if (!ASSERT_OK(system(cmd), cmd)) - return -1; + if (strcmp(verb, "add") == 0) { + *ns_obj = netns_new(*ns, false); + if (!ASSERT_OK_PTR(*ns_obj, "netns_new")) + return -1; + } else { + if (!ASSERT_OK_PTR(*ns_obj, "netns_obj is NULL")) + return -1; + netns_free(*ns_obj); + *ns_obj = NULL; + } ns++; + ns_obj++; } return 0; } static void netns_setup_namespaces_nofail(const char *verb) { + struct netns_obj **ns_obj = netns_objs; const char * const *ns = namespaces; - char cmd[128]; while (*ns) { - snprintf(cmd, sizeof(cmd), "ip netns %s %s > /dev/null 2>&1", verb, *ns); - system(cmd); + if (strcmp(verb, "add") == 0) { + *ns_obj = netns_new(*ns, false); + } else { + if (*ns_obj) + netns_free(*ns_obj); + *ns_obj = NULL; + } ns++; + ns_obj++; } } From b407b52b18505a7c09a77cbab022ec6cf82dc5f0 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:53 -0700 Subject: [PATCH 0935/1052] selftests/bpf: Monitor traffic for sockmap_listen. Enable traffic monitor for each subtest of sockmap_listen. 
Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-6-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 9ce0e0e0b7da..ebb5f6336052 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1925,6 +1925,7 @@ static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map int family) { const char *family_name, *map_name; + struct netns_obj *netns; char s[MAX_TEST_NAME]; family_name = family_str(family); @@ -1932,8 +1933,15 @@ static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); if (!test__start_subtest(s)) return; + + netns = netns_new("sockmap_listen", true); + if (!ASSERT_OK_PTR(netns, "netns_new")) + return; + inet_unix_skb_redir_to_connected(skel, map, family); unix_inet_skb_redir_to_connected(skel, map, family); + + netns_free(netns); } static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, From 69354085975ac61632ebceedebbeeeb0272620b8 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:54 -0700 Subject: [PATCH 0936/1052] selftests/bpf: Monitor traffic for select_reuseport. Enable traffic monitoring for the subtests of select_reuseport. Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-7-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/select_reuseport.c | 37 +++++++------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 64c5f5eb2994..036d4760d2c1 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -37,9 +37,7 @@ static int sk_fds[REUSEPORT_ARRAY_SIZE]; static int reuseport_array = -1, outer_map = -1; static enum bpf_map_type inner_map_type; static int select_by_skb_data_prog; -static int saved_tcp_syncookie = -1; static struct bpf_object *obj; -static int saved_tcp_fo = -1; static __u32 index_zero; static int epfd; @@ -193,14 +191,6 @@ static int write_int_sysctl(const char *sysctl, int v) return 0; } -static void restore_sysctls(void) -{ - if (saved_tcp_fo != -1) - write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); - if (saved_tcp_syncookie != -1) - write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); -} - static int enable_fastopen(void) { int fo; @@ -793,6 +783,7 @@ static void test_config(int sotype, sa_family_t family, bool inany) TEST_INIT(test_pass_on_err), TEST_INIT(test_detach_bpf), }; + struct netns_obj *netns; char s[MAX_TEST_NAME]; const struct test *t; @@ -808,9 +799,21 @@ static void test_config(int sotype, sa_family_t family, bool inany) if (!test__start_subtest(s)) continue; + netns = netns_new("select_reuseport", true); + if (!ASSERT_OK_PTR(netns, "netns_new")) + continue; + + if (CHECK_FAIL(enable_fastopen())) + goto out; + if (CHECK_FAIL(disable_syncookie())) + goto out; + setup_per_test(sotype, family, inany, t->no_inner_map); t->fn(sotype, family); cleanup_per_test(t->no_inner_map); + +out: + netns_free(netns); } } @@ -850,21 
+853,7 @@ void test_map_type(enum bpf_map_type mt) void serial_test_select_reuseport(void) { - saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); - if (saved_tcp_fo < 0) - goto out; - saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); - if (saved_tcp_syncookie < 0) - goto out; - - if (enable_fastopen()) - goto out; - if (disable_syncookie()) - goto out; - test_map_type(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); test_map_type(BPF_MAP_TYPE_SOCKMAP); test_map_type(BPF_MAP_TYPE_SOCKHASH); -out: - restore_sysctls(); } From e01d48c699bbe015d887cb598e4047f08f3998a8 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 14 Aug 2024 21:26:19 +0200 Subject: [PATCH 0937/1052] riscv: Fix out-of-bounds when accessing Andes per hart vendor extension array The out-of-bounds access is reported by UBSAN: [ 0.000000] UBSAN: array-index-out-of-bounds in ../arch/riscv/kernel/vendor_extensions.c:41:66 [ 0.000000] index -1 is out of range for type 'riscv_isavendorinfo [32]' [ 0.000000] CPU: 0 UID: 0 PID: 0 Comm: swapper Not tainted 6.11.0-rc2ubuntu-defconfig #2 [ 0.000000] Hardware name: riscv-virtio,qemu (DT) [ 0.000000] Call Trace: [ 0.000000] [] dump_backtrace+0x32/0x40 [ 0.000000] [] show_stack+0x38/0x44 [ 0.000000] [] dump_stack_lvl+0x70/0x9c [ 0.000000] [] dump_stack+0x18/0x20 [ 0.000000] [] ubsan_epilogue+0x10/0x46 [ 0.000000] [] __ubsan_handle_out_of_bounds+0x94/0x9c [ 0.000000] [] __riscv_isa_vendor_extension_available+0x90/0x92 [ 0.000000] [] riscv_cpufeature_patch_func+0xc4/0x148 [ 0.000000] [] _apply_alternatives+0x42/0x50 [ 0.000000] [] apply_boot_alternatives+0x3c/0x100 [ 0.000000] [] setup_arch+0x85a/0x8bc [ 0.000000] [] start_kernel+0xa4/0xfb6 The dereferencing using cpu should actually not happen, so remove it. Fixes: 23c996fc2bc1 ("riscv: Extend cpufeature.c to detect vendor extensions") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240814192619.276794-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vendor_extensions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c index b6c1e7b5d34b..a8126d118341 100644 --- a/arch/riscv/kernel/vendor_extensions.c +++ b/arch/riscv/kernel/vendor_extensions.c @@ -38,7 +38,7 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsig #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES case ANDES_VENDOR_ID: bmap = &riscv_isa_vendor_ext_list_andes.all_harts_isa_bitmap; - cpu_bmap = &riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap[cpu]; + cpu_bmap = riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap; break; #endif default: From 74c2ab6d653b4c2354df65a7f7f2df1925a40a51 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 8 Aug 2024 20:23:32 +0800 Subject: [PATCH 0938/1052] smb/client: avoid possible NULL dereference in cifs_free_subrequest() Clang static checker (scan-build) warning: cifsglob.h:line 890, column 3 Access to field 'ops' results in a dereference of a null pointer. Commit 519be989717c ("cifs: Add a tracepoint to track credits involved in R/W requests") adds a check for 'rdata->server', and let clang throw this warning about NULL dereference. When 'rdata->credits.value != 0 && rdata->server == NULL' happens, add_credits_and_wake_if() will call rdata->server->ops->add_credits(). This will cause NULL dereference problem. Add a check for 'rdata->server' to avoid NULL dereference. 
Cc: stable@vger.kernel.org Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Reviewed-by: David Howells Signed-off-by: Su Hui Signed-off-by: Steve French --- fs/smb/client/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index b2405dd4d4d4..45459af5044d 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -315,7 +315,7 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq) #endif } - if (rdata->credits.value != 0) + if (rdata->credits.value != 0) { trace_smb3_rw_credits(rdata->rreq->debug_id, rdata->subreq.debug_index, rdata->credits.value, @@ -323,8 +323,12 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq) rdata->server ? rdata->server->in_flight : 0, -rdata->credits.value, cifs_trace_rw_credits_free_subreq); + if (rdata->server) + add_credits_and_wake_if(rdata->server, &rdata->credits, 0); + else + rdata->credits.value = 0; + } - add_credits_and_wake_if(rdata->server, &rdata->credits, 0); if (rdata->have_xid) free_xid(rdata->xid); } From c916ca35308d3187c9928664f9be249b22a3a701 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 3 Aug 2024 17:11:37 +0800 Subject: [PATCH 0939/1052] md/raid1: Fix data corruption for degraded array with slow disk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit read_balance() will avoid reading from slow disks as much as possible, however, if valid data only lands in slow disks, and a new normal disk is still in recovery, unrecovered data can be read: raid1_read_request read_balance raid1_should_read_first -> return false choose_best_rdev -> normal disk is not recovered, return -1 choose_bb_rdev -> missing the checking of recovery, return the normal disk -> read unrecovered data Root cause is that the checking of recovery is missing in choose_bb_rdev(). Hence add such checking to fix the problem. Also fix similar problem in choose_slow_rdev(). 
Cc: stable@vger.kernel.org Fixes: 9f3ced792203 ("md/raid1: factor out choose_bb_rdev() from read_balance()") Fixes: dfa8ecd167c1 ("md/raid1: factor out choose_slow_rdev() from read_balance()") Reported-and-tested-by: Mateusz Jończyk Closes: https://lore.kernel.org/all/9952f532-2554-44bf-b906-4880b2e88e3a@o2.pl/ Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240803091137.3197008-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7acfe7c9dc8d..761989d67906 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -617,6 +617,12 @@ static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio, return -1; } +static bool rdev_in_recovery(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + return !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < r1_bio->sector + r1_bio->sectors; +} + static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) { @@ -635,6 +641,7 @@ static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, rdev = conf->mirrors[disk].rdev; if (!rdev || test_bit(Faulty, &rdev->flags) || + rdev_in_recovery(rdev, r1_bio) || test_bit(WriteMostly, &rdev->flags)) continue; @@ -673,7 +680,8 @@ static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio, rdev = conf->mirrors[disk].rdev; if (!rdev || test_bit(Faulty, &rdev->flags) || - !test_bit(WriteMostly, &rdev->flags)) + !test_bit(WriteMostly, &rdev->flags) || + rdev_in_recovery(rdev, r1_bio)) continue; /* there are no bad blocks, we can use this disk */ @@ -733,9 +741,7 @@ static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio) if (!rdev || test_bit(Faulty, &rdev->flags)) return false; - /* still in recovery */ - if (!test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < r1_bio->sector + r1_bio->sectors) + if (rdev_in_recovery(rdev, r1_bio)) return false; /* don't read from slow disk unless have to */ From 836bb3268db405cf9021496ac4dbc26d3e4758fe Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 Aug 2024 14:03:43 -0500 Subject: [PATCH 0940/1052] smb3: fix lock breakage for cached writes Mandatory locking is enforced for cached writes, which violates default posix semantics, and also it is enforced inconsistently. This apparently breaks recent versions of libreoffice, but can also be demonstrated by opening a file twice from the same client, locking it from handle one and writing to it from handle two (which fails, returning EACCES). Since there was already a mount option "forcemandatorylock" (which defaults to off), with this change only when the user intentionally specifies "forcemandatorylock" on mount will we break posix semantics on write to a locked range (ie we will only fail the write in this case, if the user mounts with "forcemandatorylock"). 
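For reference, the two-handle demonstration described above can be driven
from a short userspace program. This is an illustrative sketch only; the
mount point path is a placeholder, and it should be run against a cifs
mount without "forcemandatorylock":

/* Open the same file twice, take a byte-range lock on fd1, then write
 * through fd2. With POSIX semantics the write succeeds; before this fix
 * it failed with EACCES on cifs even without "forcemandatorylock".
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/mnt/cifs/testfile";
	int fd1 = open(path, O_RDWR | O_CREAT, 0644);
	int fd2 = open(path, O_RDWR);
	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 4096,
	};

	if (fd1 < 0 || fd2 < 0 || fcntl(fd1, F_SETLK, &fl) < 0) {
		perror("setup");
		return 1;
	}
	if (write(fd2, "x", 1) < 0)
		perror("write via second handle");	/* EACCES before the fix */
	else
		printf("write via second handle succeeded\n");
	return 0;
}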
Fixes: 85160e03a79e ("CIFS: Implement caching mechanism for mandatory brlocks") Cc: stable@vger.kernel.org Cc: Pavel Shilovsky Reported-by: abartlet@samba.org Reported-by: Kevin Ottens Reviewed-by: David Howells Signed-off-by: Steve French --- fs/smb/client/file.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 45459af5044d..06a0667f8ff2 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2753,6 +2753,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); ssize_t rc; rc = netfs_start_io_write(inode); @@ -2769,12 +2770,16 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) if (rc <= 0) goto out; - if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) && + (cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), server->vals->exclusive_lock_type, 0, - NULL, CIFS_WRITE_OP)) - rc = netfs_buffered_write_iter_locked(iocb, from, NULL); - else + NULL, CIFS_WRITE_OP))) { rc = -EACCES; + goto out; + } + + rc = netfs_buffered_write_iter_locked(iocb, from, NULL); + out: up_read(&cinode->lock_sem); netfs_end_io_write(inode); From 5b4f3af39b6588e8de4444d8e1ccf759b40f9414 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 8 Aug 2024 16:04:04 -0600 Subject: [PATCH 0941/1052] smb: smb2pdu.h: Use static_assert() to check struct sizes Commit 9f9bef9bc5c6 ("smb: smb2pdu.h: Avoid -Wflex-array-member-not-at-end warnings") introduced tagged `struct create_context_hdr`. We want to ensure that when new members need to be added to the flexible structure, they are always included within this tagged struct. So, we use `static_assert()` to ensure that the memory layout for both the flexible structure and the tagged struct is the same after any changes. Acked-by: Namjae Jeon Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steve French --- fs/smb/common/smb2pdu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index c3ee42188d25..c769f9dbc0b4 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1216,6 +1216,8 @@ struct create_context { ); __u8 Buffer[]; } __packed; +static_assert(offsetof(struct create_context, Buffer) == sizeof(struct create_context_hdr), + "struct member likely outside of __struct_group()"); struct smb2_create_req { struct smb2_hdr hdr; From febb6f3e3ac196602cca5738d8a189d57392324a Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Wed, 7 Aug 2024 23:31:10 +0900 Subject: [PATCH 0942/1052] bpf: Remove __btf_name_valid() and change to btf_name_valid_identifier() __btf_name_valid() can be completely replaced with btf_name_valid_identifier, and since most of the time you already call btf_name_valid_identifier instead of __btf_name_valid , it would be appropriate to rename the __btf_name_valid function to btf_name_valid_identifier and remove __btf_name_valid. 
Signed-off-by: Jeongjun Park Signed-off-by: Andrii Nakryiko Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/20240807143110.181497-1-aha310510@gmail.com --- kernel/bpf/btf.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 95426d5b634e..bfb0d89ccc8b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -790,7 +790,7 @@ const char *btf_str_by_offset(const struct btf *btf, u32 offset) return NULL; } -static bool __btf_name_valid(const struct btf *btf, u32 offset) +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); @@ -811,11 +811,6 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset) return !*src; } -static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) -{ - return __btf_name_valid(btf, offset); -} - /* Allow any printable character in DATASEC names */ static bool btf_name_valid_section(const struct btf *btf, u32 offset) { @@ -4629,7 +4624,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env, } if (!t->name_off || - !__btf_name_valid(env->btf, t->name_off)) { + !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } From fab45b962749184e1a1a57c7c583782b78fad539 Mon Sep 17 00:00:00 2001 From: Sam James Date: Tue, 13 Aug 2024 20:49:06 +0100 Subject: [PATCH 0943/1052] libbpf: Workaround -Wmaybe-uninitialized false positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `elf_close`, we get this with GCC 15 -O3 (at least): ``` In function ‘elf_close’, inlined from ‘elf_close’ at elf.c:53:6, inlined from ‘elf_find_func_offset_from_file’ at elf.c:384:2: elf.c:57:9: warning: ‘elf_fd.elf’ may be used uninitialized [-Wmaybe-uninitialized] 57 | elf_end(elf_fd->elf); | ^~~~~~~~~~~~~~~~~~~~ elf.c: In function ‘elf_find_func_offset_from_file’: elf.c:377:23: note: ‘elf_fd.elf’ was declared here 377 | struct elf_fd elf_fd; | ^~~~~~ In function ‘elf_close’, inlined from ‘elf_close’ at elf.c:53:6, inlined from ‘elf_find_func_offset_from_file’ at elf.c:384:2: elf.c:58:9: warning: ‘elf_fd.fd’ may be used uninitialized [-Wmaybe-uninitialized] 58 | close(elf_fd->fd); | ^~~~~~~~~~~~~~~~~ elf.c: In function ‘elf_find_func_offset_from_file’: elf.c:377:23: note: ‘elf_fd.fd’ was declared here 377 | struct elf_fd elf_fd; | ^~~~~~ ``` In reality, our use is fine, it's just that GCC doesn't model errno here (see linked GCC bug). Suppress -Wmaybe-uninitialized accordingly by initializing elf_fd.fd to -1 and elf_fd.elf to NULL. I've done this in two other functions as well given it could easily occur there too (same access/use pattern). 
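The warning comes from the common "out parameter filled only on success"
shape. Below is a reduced standalone sketch of that shape, not libbpf
code; whether a given GCC version actually warns on it depends on
inlining and optimization level, which is exactly why this class of
report is a false positive here:

#include <fcntl.h>
#include <unistd.h>

struct handle { int fd; };

static int handle_open(const char *path, struct handle *h)
{
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;	/* h is left untouched on failure */
	h->fd = fd;
	return 0;
}

int use_handle(const char *path)
{
	struct handle h;	/* = { .fd = -1 } silences the warning */
	int err = handle_open(path, &h);

	if (!err)
		close(h.fd);	/* only reached when h.fd was written */
	return err;
}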
Signed-off-by: Sam James Signed-off-by: Andrii Nakryiko Link: https://gcc.gnu.org/PR114952 Link: https://lore.kernel.org/bpf/14ec488a1cac02794c2fa2b83ae0cef1bce2cb36.1723578546.git.sam@gentoo.org --- tools/lib/bpf/elf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c index c92e02394159..b5ab1cb13e5e 100644 --- a/tools/lib/bpf/elf.c +++ b/tools/lib/bpf/elf.c @@ -28,6 +28,9 @@ int elf_open(const char *binary_path, struct elf_fd *elf_fd) int fd, ret; Elf *elf; + elf_fd->elf = NULL; + elf_fd->fd = -1; + if (elf_version(EV_CURRENT) == EV_NONE) { pr_warn("elf: failed to init libelf for %s\n", binary_path); return -LIBBPF_ERRNO__LIBELF; From fdf1c728fac541891ef1aa773bfd42728626769c Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Thu, 15 Aug 2024 21:55:24 +0800 Subject: [PATCH 0944/1052] samples/bpf: Fix compilation errors with cf-protection option Currently, compiling the bpf programs will result the compilation errors with the cf-protection option as follows in arm64 and loongarch64 machine when using gcc 12.3.1 and clang 17.0.6. This commit fixes the compilation errors by limited the cf-protection option only used in x86 platform. [root@localhost linux]# make M=samples/bpf ...... CLANG-bpf samples/bpf/xdp2skb_meta_kern.o error: option 'cf-protection=return' cannot be specified on this target error: option 'cf-protection=branch' cannot be specified on this target 2 errors generated. CLANG-bpf samples/bpf/syscall_tp_kern.o error: option 'cf-protection=return' cannot be specified on this target error: option 'cf-protection=branch' cannot be specified on this target 2 errors generated. ...... Fixes: 34f6e38f58db ("samples/bpf: fix warning with ignored-attributes") Reported-by: Jiangshan Yi Signed-off-by: Jiangshan Yi Signed-off-by: Andrii Nakryiko Tested-by: Qiang Wang Link: https://lore.kernel.org/bpf/20240815135524.140675-1-13667453960@163.com --- samples/bpf/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 3e003dd6bea0..dca56aa360ff 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -169,6 +169,10 @@ BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic endif endif +ifeq ($(ARCH), x86) +BPF_EXTRA_CFLAGS += -fcf-protection +endif + TPROGS_CFLAGS += -Wall -O2 TPROGS_CFLAGS += -Wmissing-prototypes TPROGS_CFLAGS += -Wstrict-prototypes @@ -405,7 +409,7 @@ $(obj)/%.o: $(src)/%.c -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ - -fno-asynchronous-unwind-tables -fcf-protection \ + -fno-asynchronous-unwind-tables \ -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ From b313a8c835516bdda85025500be866ac8a74e022 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 15 Aug 2024 10:47:36 +0800 Subject: [PATCH 0945/1052] block: Fix lockdep warning in blk_mq_mark_tag_wait Lockdep reported a warning in Linux version 6.6: [ 414.344659] ================================ [ 414.345155] WARNING: inconsistent lock state [ 414.345658] 6.6.0-07439-gba2303cacfda #6 Not tainted [ 414.346221] -------------------------------- [ 414.346712] inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. 
[ 414.347545] kworker/u10:3/1152 [HC0[0]:SC0[0]:HE0:SE1] takes: [ 414.349245] ffff88810edd1098 (&sbq->ws[i].wait){+.?.}-{2:2}, at: blk_mq_dispatch_rq_list+0x131c/0x1ee0 [ 414.351204] {IN-SOFTIRQ-W} state was registered at: [ 414.351751] lock_acquire+0x18d/0x460 [ 414.352218] _raw_spin_lock_irqsave+0x39/0x60 [ 414.352769] __wake_up_common_lock+0x22/0x60 [ 414.353289] sbitmap_queue_wake_up+0x375/0x4f0 [ 414.353829] sbitmap_queue_clear+0xdd/0x270 [ 414.354338] blk_mq_put_tag+0xdf/0x170 [ 414.354807] __blk_mq_free_request+0x381/0x4d0 [ 414.355335] blk_mq_free_request+0x28b/0x3e0 [ 414.355847] __blk_mq_end_request+0x242/0xc30 [ 414.356367] scsi_end_request+0x2c1/0x830 [ 414.345155] WARNING: inconsistent lock state [ 414.345658] 6.6.0-07439-gba2303cacfda #6 Not tainted [ 414.346221] -------------------------------- [ 414.346712] inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. [ 414.347545] kworker/u10:3/1152 [HC0[0]:SC0[0]:HE0:SE1] takes: [ 414.349245] ffff88810edd1098 (&sbq->ws[i].wait){+.?.}-{2:2}, at: blk_mq_dispatch_rq_list+0x131c/0x1ee0 [ 414.351204] {IN-SOFTIRQ-W} state was registered at: [ 414.351751] lock_acquire+0x18d/0x460 [ 414.352218] _raw_spin_lock_irqsave+0x39/0x60 [ 414.352769] __wake_up_common_lock+0x22/0x60 [ 414.353289] sbitmap_queue_wake_up+0x375/0x4f0 [ 414.353829] sbitmap_queue_clear+0xdd/0x270 [ 414.354338] blk_mq_put_tag+0xdf/0x170 [ 414.354807] __blk_mq_free_request+0x381/0x4d0 [ 414.355335] blk_mq_free_request+0x28b/0x3e0 [ 414.355847] __blk_mq_end_request+0x242/0xc30 [ 414.356367] scsi_end_request+0x2c1/0x830 [ 414.356863] scsi_io_completion+0x177/0x1610 [ 414.357379] scsi_complete+0x12f/0x260 [ 414.357856] blk_complete_reqs+0xba/0xf0 [ 414.358338] __do_softirq+0x1b0/0x7a2 [ 414.358796] irq_exit_rcu+0x14b/0x1a0 [ 414.359262] sysvec_call_function_single+0xaf/0xc0 [ 414.359828] asm_sysvec_call_function_single+0x1a/0x20 [ 414.360426] default_idle+0x1e/0x30 [ 414.360873] default_idle_call+0x9b/0x1f0 [ 414.361390] do_idle+0x2d2/0x3e0 [ 414.361819] cpu_startup_entry+0x55/0x60 [ 414.362314] start_secondary+0x235/0x2b0 [ 414.362809] secondary_startup_64_no_verify+0x18f/0x19b [ 414.363413] irq event stamp: 428794 [ 414.363825] hardirqs last enabled at (428793): [] ktime_get+0x1dc/0x200 [ 414.364694] hardirqs last disabled at (428794): [] _raw_spin_lock_irq+0x47/0x50 [ 414.365629] softirqs last enabled at (428444): [] __do_softirq+0x540/0x7a2 [ 414.366522] softirqs last disabled at (428419): [] irq_exit_rcu+0x14b/0x1a0 [ 414.367425] other info that might help us debug this: [ 414.368194] Possible unsafe locking scenario: [ 414.368900] CPU0 [ 414.369225] ---- [ 414.369548] lock(&sbq->ws[i].wait); [ 414.370000] [ 414.370342] lock(&sbq->ws[i].wait); [ 414.370802] *** DEADLOCK *** [ 414.371569] 5 locks held by kworker/u10:3/1152: [ 414.372088] #0: ffff88810130e938 ((wq_completion)writeback){+.+.}-{0:0}, at: process_scheduled_works+0x357/0x13f0 [ 414.373180] #1: ffff88810201fdb8 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}, at: process_scheduled_works+0x3a3/0x13f0 [ 414.374384] #2: ffffffff86ffbdc0 (rcu_read_lock){....}-{1:2}, at: blk_mq_run_hw_queue+0x637/0xa00 [ 414.375342] #3: ffff88810edd1098 (&sbq->ws[i].wait){+.?.}-{2:2}, at: blk_mq_dispatch_rq_list+0x131c/0x1ee0 [ 414.376377] #4: ffff888106205a08 (&hctx->dispatch_wait_lock){+.-.}-{2:2}, at: blk_mq_dispatch_rq_list+0x1337/0x1ee0 [ 414.378607] stack backtrace: [ 414.379177] CPU: 0 PID: 1152 Comm: kworker/u10:3 Not tainted 6.6.0-07439-gba2303cacfda #6 [ 414.380032] Hardware name: QEMU Standard PC (i440FX + PIIX, 
1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 414.381177] Workqueue: writeback wb_workfn (flush-253:0) [ 414.381805] Call Trace: [ 414.382136] [ 414.382429] dump_stack_lvl+0x91/0xf0 [ 414.382884] mark_lock_irq+0xb3b/0x1260 [ 414.383367] ? __pfx_mark_lock_irq+0x10/0x10 [ 414.383889] ? stack_trace_save+0x8e/0xc0 [ 414.384373] ? __pfx_stack_trace_save+0x10/0x10 [ 414.384903] ? graph_lock+0xcf/0x410 [ 414.385350] ? save_trace+0x3d/0xc70 [ 414.385808] mark_lock.part.20+0x56d/0xa90 [ 414.386317] mark_held_locks+0xb0/0x110 [ 414.386791] ? __pfx_do_raw_spin_lock+0x10/0x10 [ 414.387320] lockdep_hardirqs_on_prepare+0x297/0x3f0 [ 414.387901] ? _raw_spin_unlock_irq+0x28/0x50 [ 414.388422] trace_hardirqs_on+0x58/0x100 [ 414.388917] _raw_spin_unlock_irq+0x28/0x50 [ 414.389422] __blk_mq_tag_busy+0x1d6/0x2a0 [ 414.389920] __blk_mq_get_driver_tag+0x761/0x9f0 [ 414.390899] blk_mq_dispatch_rq_list+0x1780/0x1ee0 [ 414.391473] ? __pfx_blk_mq_dispatch_rq_list+0x10/0x10 [ 414.392070] ? sbitmap_get+0x2b8/0x450 [ 414.392533] ? __blk_mq_get_driver_tag+0x210/0x9f0 [ 414.393095] __blk_mq_sched_dispatch_requests+0xd99/0x1690 [ 414.393730] ? elv_attempt_insert_merge+0x1b1/0x420 [ 414.394302] ? __pfx___blk_mq_sched_dispatch_requests+0x10/0x10 [ 414.394970] ? lock_acquire+0x18d/0x460 [ 414.395456] ? blk_mq_run_hw_queue+0x637/0xa00 [ 414.395986] ? __pfx_lock_acquire+0x10/0x10 [ 414.396499] blk_mq_sched_dispatch_requests+0x109/0x190 [ 414.397100] blk_mq_run_hw_queue+0x66e/0xa00 [ 414.397616] blk_mq_flush_plug_list.part.17+0x614/0x2030 [ 414.398244] ? __pfx_blk_mq_flush_plug_list.part.17+0x10/0x10 [ 414.398897] ? writeback_sb_inodes+0x241/0xcc0 [ 414.399429] blk_mq_flush_plug_list+0x65/0x80 [ 414.399957] __blk_flush_plug+0x2f1/0x530 [ 414.400458] ? __pfx___blk_flush_plug+0x10/0x10 [ 414.400999] blk_finish_plug+0x59/0xa0 [ 414.401467] wb_writeback+0x7cc/0x920 [ 414.401935] ? __pfx_wb_writeback+0x10/0x10 [ 414.402442] ? mark_held_locks+0xb0/0x110 [ 414.402931] ? __pfx_do_raw_spin_lock+0x10/0x10 [ 414.403462] ? lockdep_hardirqs_on_prepare+0x297/0x3f0 [ 414.404062] wb_workfn+0x2b3/0xcf0 [ 414.404500] ? __pfx_wb_workfn+0x10/0x10 [ 414.404989] process_scheduled_works+0x432/0x13f0 [ 414.405546] ? __pfx_process_scheduled_works+0x10/0x10 [ 414.406139] ? do_raw_spin_lock+0x101/0x2a0 [ 414.406641] ? assign_work+0x19b/0x240 [ 414.407106] ? lock_is_held_type+0x9d/0x110 [ 414.407604] worker_thread+0x6f2/0x1160 [ 414.408075] ? __kthread_parkme+0x62/0x210 [ 414.408572] ? lockdep_hardirqs_on_prepare+0x297/0x3f0 [ 414.409168] ? __kthread_parkme+0x13c/0x210 [ 414.409678] ? __pfx_worker_thread+0x10/0x10 [ 414.410191] kthread+0x33c/0x440 [ 414.410602] ? __pfx_kthread+0x10/0x10 [ 414.411068] ret_from_fork+0x4d/0x80 [ 414.411526] ? __pfx_kthread+0x10/0x10 [ 414.411993] ret_from_fork_asm+0x1b/0x30 [ 414.412489] When interrupt is turned on while a lock holding by spin_lock_irq it throws a warning because of potential deadlock. blk_mq_prep_dispatch_rq blk_mq_get_driver_tag __blk_mq_get_driver_tag __blk_mq_alloc_driver_tag blk_mq_tag_busy -> tag is already busy // failed to get driver tag blk_mq_mark_tag_wait spin_lock_irq(&wq->lock) -> lock A (&sbq->ws[i].wait) __add_wait_queue(wq, wait) -> wait queue active blk_mq_get_driver_tag __blk_mq_tag_busy -> 1) tag must be idle, which means there can't be inflight IO spin_lock_irq(&tags->lock) -> lock B (hctx->tags) spin_unlock_irq(&tags->lock) -> unlock B, turn on interrupt accidentally -> 2) context must be preempt by IO interrupt to trigger deadlock. 
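The underlying hazard is that spin_unlock_irq() unconditionally re-enables
interrupts, even when the caller was already holding another lock with
interrupts disabled. A reduced sketch of the broken and fixed nesting
(illustration only, not the blk-mq code itself):

/* Taking a nested lock with spin_lock_irq()/spin_unlock_irq() turns
 * interrupts back on at the inner unlock even though the outer context
 * still expects them off, which is what lockdep flags above. The
 * irqsave/irqrestore pair preserves the caller's interrupt state.
 */
static void nested_bad(spinlock_t *outer, spinlock_t *inner)
{
	spin_lock_irq(outer);			/* IRQs off */
	spin_lock_irq(inner);
	spin_unlock_irq(inner);			/* IRQs back ON, too early */
	/* ... outer still held, but now interruptible ... */
	spin_unlock_irq(outer);
}

static void nested_good(spinlock_t *outer, spinlock_t *inner)
{
	unsigned long flags;

	spin_lock_irq(outer);			/* IRQs off */
	spin_lock_irqsave(inner, flags);
	spin_unlock_irqrestore(inner, flags);	/* IRQ state preserved */
	spin_unlock_irq(outer);
}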
As shown above, the deadlock is not possible in theory, but the warning still need to be fixed. Fix it by using spin_lock_irqsave to get lockB instead of spin_lock_irq. Fixes: 4f1731df60f9 ("blk-mq: fix potential io hang by wrong 'wake_batch'") Signed-off-by: Li Lingfeng Reviewed-by: Ming Lei Reviewed-by: Yu Kuai Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240815024736.2040971-1-lilingfeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index cc57e2dd9a0b..2cafcf11ee8b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -38,6 +38,7 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; + unsigned long flags; struct blk_mq_tags *tags = hctx->tags; /* @@ -56,11 +57,11 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) return; } - spin_lock_irq(&tags->lock); + spin_lock_irqsave(&tags->lock, flags); users = tags->active_queues + 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); - spin_unlock_irq(&tags->lock); + spin_unlock_irqrestore(&tags->lock, flags); } /* From e46bc2e7eb90a370bc27fa2fd98cb8251e7da1ec Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 7 Aug 2024 18:33:35 +0100 Subject: [PATCH 0946/1052] mseal: fix is_madv_discard() is_madv_discard did its check wrong. MADV_ flags are not bitwise, they're normal sequential numbers. So, for instance: behavior & (/* ... */ | MADV_REMOVE) tagged both MADV_REMOVE and MADV_RANDOM (bit 0 set) as discard operations. As a result the kernel could erroneously block certain madvises (e.g MADV_RANDOM or MADV_HUGEPAGE) on sealed VMAs due to them sharing bits with blocked MADV operations (e.g REMOVE or WIPEONFORK). This is obviously incorrect, so use a switch statement instead. Link: https://lkml.kernel.org/r/20240807173336.2523757-1-pedro.falcato@gmail.com Link: https://lkml.kernel.org/r/20240807173336.2523757-2-pedro.falcato@gmail.com Fixes: 8be7258aad44 ("mseal: add mseal syscall") Signed-off-by: Pedro Falcato Tested-by: Jeff Xu Reviewed-by: Jeff Xu Cc: Kees Cook Cc: Liam R. Howlett Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- mm/mseal.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mm/mseal.c b/mm/mseal.c index bf783bba8ed0..15bba28acc00 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -40,9 +40,17 @@ static bool can_modify_vma(struct vm_area_struct *vma) static bool is_madv_discard(int behavior) { - return behavior & - (MADV_FREE | MADV_DONTNEED | MADV_DONTNEED_LOCKED | - MADV_REMOVE | MADV_DONTFORK | MADV_WIPEONFORK); + switch (behavior) { + case MADV_FREE: + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_REMOVE: + case MADV_DONTFORK: + case MADV_WIPEONFORK: + return true; + } + + return false; } static bool is_ro_anon(struct vm_area_struct *vma) From 5f75cfbd6bb02295ddaed48adf667b6c828ce07b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 1 Aug 2024 22:47:48 +0200 Subject: [PATCH 0947/1052] mm/hugetlb: fix hugetlb vs. core-mm PT locking We recently made GUP's common page table walking code to also walk hugetlb VMAs without most hugetlb special-casing, preparing for the future of having less hugetlb-specific page table walking code in the codebase. Turns out that we missed one page table locking detail: page table locking for hugetlb folios that are not mapped using a single PMD/PUD. 
Assume we have hugetlb folio that spans multiple PTEs (e.g., 64 KiB hugetlb folios on arm64 with 4 KiB base page size). GUP, as it walks the page tables, will perform a pte_offset_map_lock() to grab the PTE table lock. However, hugetlb that concurrently modifies these page tables would actually grab the mm->page_table_lock: with USE_SPLIT_PTE_PTLOCKS, the locks would differ. Something similar can happen right now with hugetlb folios that span multiple PMDs when USE_SPLIT_PMD_PTLOCKS. This issue can be reproduced [1], for example triggering: [ 3105.936100] ------------[ cut here ]------------ [ 3105.939323] WARNING: CPU: 31 PID: 2732 at mm/gup.c:142 try_grab_folio+0x11c/0x188 [ 3105.944634] Modules linked in: [...] [ 3105.974841] CPU: 31 PID: 2732 Comm: reproducer Not tainted 6.10.0-64.eln141.aarch64 #1 [ 3105.980406] Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-4.fc40 05/24/2024 [ 3105.986185] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 3105.991108] pc : try_grab_folio+0x11c/0x188 [ 3105.994013] lr : follow_page_pte+0xd8/0x430 [ 3105.996986] sp : ffff80008eafb8f0 [ 3105.999346] x29: ffff80008eafb900 x28: ffffffe8d481f380 x27: 00f80001207cff43 [ 3106.004414] x26: 0000000000000001 x25: 0000000000000000 x24: ffff80008eafba48 [ 3106.009520] x23: 0000ffff9372f000 x22: ffff7a54459e2000 x21: ffff7a546c1aa978 [ 3106.014529] x20: ffffffe8d481f3c0 x19: 0000000000610041 x18: 0000000000000001 [ 3106.019506] x17: 0000000000000001 x16: ffffffffffffffff x15: 0000000000000000 [ 3106.024494] x14: ffffb85477fdfe08 x13: 0000ffff9372ffff x12: 0000000000000000 [ 3106.029469] x11: 1fffef4a88a96be1 x10: ffff7a54454b5f0c x9 : ffffb854771b12f0 [ 3106.034324] x8 : 0008000000000000 x7 : ffff7a546c1aa980 x6 : 0008000000000080 [ 3106.038902] x5 : 00000000001207cf x4 : 0000ffff9372f000 x3 : ffffffe8d481f000 [ 3106.043420] x2 : 0000000000610041 x1 : 0000000000000001 x0 : 0000000000000000 [ 3106.047957] Call trace: [ 3106.049522] try_grab_folio+0x11c/0x188 [ 3106.051996] follow_pmd_mask.constprop.0.isra.0+0x150/0x2e0 [ 3106.055527] follow_page_mask+0x1a0/0x2b8 [ 3106.058118] __get_user_pages+0xf0/0x348 [ 3106.060647] faultin_page_range+0xb0/0x360 [ 3106.063651] do_madvise+0x340/0x598 Let's make huge_pte_lockptr() effectively use the same PT locks as any core-mm page table walker would. Add ptep_lockptr() to obtain the PTE page table lock using a pte pointer -- unfortunately we cannot convert pte_lockptr() because virt_to_page() doesn't work with kmap'ed page tables we can have with CONFIG_HIGHPTE. Handle CONFIG_PGTABLE_LEVELS correctly by checking in reverse order, such that when e.g., CONFIG_PGTABLE_LEVELS==2 with PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE will work as expected. Document why that works. There is one ugly case: powerpc 8xx, whereby we have an 8 MiB hugetlb folio being mapped using two PTE page tables. While hugetlb wants to take the PMD table lock, core-mm would grab the PTE table lock of one of both PTE page tables. In such corner cases, we have to make sure that both locks match, which is (fortunately!) currently guaranteed for 8xx as it does not support SMP and consequently doesn't use split PT locks. 
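The mismatch can be condensed into a short sketch (assumed helper name, illustration only; the real walkers live in mm/gup.c and mm/hugetlb.c): before this change, a core-mm walker and hugetlb could end up holding two different spinlocks for the very same PTE table.

	static void show_pre_fix_mismatch(struct hstate *h, struct mm_struct *mm,
					  pmd_t *pmd, pte_t *ptep, unsigned long addr)
	{
		spinlock_t *gup_ptl, *hugetlb_ptl;
		pte_t *pte;

		/* core-mm walker (e.g. GUP): split PTE-table lock */
		pte = pte_offset_map_lock(mm, pmd, addr, &gup_ptl);

		/* hugetlb, before the fix, for a folio smaller than a PMD */
		hugetlb_ptl = huge_pte_lockptr(h, mm, ptep);	/* was &mm->page_table_lock */

		/* with USE_SPLIT_PTE_PTLOCKS, gup_ptl != hugetlb_ptl: no serialization */
		if (pte)
			pte_unmap_unlock(pte, gup_ptl);
	}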
[1] https://lore.kernel.org/all/1bbfcc7f-f222-45a5-ac44-c5a1381c596d@redhat.com/ Link: https://lkml.kernel.org/r/20240801204748.99107-1-david@redhat.com Fixes: 9cb28da54643 ("mm/gup: handle hugetlb in the generic follow_page_mask code") Signed-off-by: David Hildenbrand Acked-by: Peter Xu Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Peter Xu Cc: Oscar Salvador Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 33 ++++++++++++++++++++++++++++++--- include/linux/mm.h | 11 +++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c9bf68c239a0..45bf05ad5c53 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -944,10 +944,37 @@ static inline bool htlb_allow_alloc_fallback(int reason) static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { - if (huge_page_size(h) == PMD_SIZE) + const unsigned long size = huge_page_size(h); + + VM_WARN_ON(size == PAGE_SIZE); + + /* + * hugetlb must use the exact same PT locks as core-mm page table + * walkers would. When modifying a PTE table, hugetlb must take the + * PTE PT lock, when modifying a PMD table, hugetlb must take the PMD + * PT lock etc. + * + * The expectation is that any hugetlb folio smaller than a PMD is + * always mapped into a single PTE table and that any hugetlb folio + * smaller than a PUD (but at least as big as a PMD) is always mapped + * into a single PMD table. + * + * If that does not hold for an architecture, then that architecture + * must disable split PT locks such that all *_lockptr() functions + * will give us the same result: the per-MM PT lock. + * + * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where + * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use pud_lockptr() + * and core-mm would use pmd_lockptr(). However, in such configurations + * split PMD locks are disabled -- they don't make sense on a single + * PGDIR page table -- and the end result is the same. 
+ */ + if (size >= PUD_SIZE) + return pud_lockptr(mm, (pud_t *) pte); + else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE)) return pmd_lockptr(mm, (pmd_t *) pte); - VM_BUG_ON(huge_page_size(h) == PAGE_SIZE); - return &mm->page_table_lock; + /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */ + return ptep_lockptr(mm, pte); } #ifndef hugepages_supported diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76..6549d0979b28 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2920,6 +2920,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); + BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); + return ptlock_ptr(virt_to_ptdesc(pte)); +} + static inline bool ptlock_init(struct ptdesc *ptdesc) { /* @@ -2944,6 +2951,10 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + return &mm->page_table_lock; +} static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} From ace0741a55e453c265cbf3d965eea7f687cd6d45 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 8 Aug 2024 21:34:34 +0000 Subject: [PATCH 0948/1052] mm: don't account memmap on failure Patch series "Fixes for memmap accounting", v4. Memmap accounting provides us with observability of how much memory is used for per-page metadata: i.e. "struct page"'s and "struct page_ext". It also provides with information of how much was allocated using boot allocator (i.e. not part of MemTotal), and how much was allocated using buddy allocated (i.e. part of MemTotal). This small series fixes a few problems that were discovered with the original patch. This patch (of 3): When we fail to allocate the mmemmap in alloc_vmemmap_page_list(), do not account any already-allocated pages: we're going to free all them before we return from the function. 
Link: https://lkml.kernel.org/r/20240809191020.1142142-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-2-pasha.tatashin@soleen.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Pasha Tatashin Reviewed-by: Fan Ni Reviewed-by: Yosry Ahmed Acked-by: David Hildenbrand Tested-by: Alison Schofield Reviewed-by: Muchun Song Acked-by: David Rientjes Cc: Dan Williams Cc: Domenico Cerasuolo Cc: Joel Granados Cc: Johannes Weiner Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport Cc: Nhat Pham Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yi Zhang Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 829112b0a914..4f51e0596197 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -392,13 +392,10 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(nid, gfp_mask, 0); - if (!page) { - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, i); + if (!page) goto out; - } list_add(&page->lru, list); } - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages); return 0; From f4cb78af91e3b2b7aa76dbf8213b898fa8811b12 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 8 Aug 2024 21:34:35 +0000 Subject: [PATCH 0949/1052] mm: add system wide stats items category /proc/vmstat contains events and stats, events can only grow, but stats can grow and shrink. vmstat has the following: ------------------------- NR_VM_ZONE_STAT_ITEMS: per-zone stats NR_VM_NUMA_EVENT_ITEMS: per-numa events NR_VM_NODE_STAT_ITEMS: per-numa stats NR_VM_WRITEBACK_STAT_ITEMS: system-wide background-writeback and dirty-throttling tresholds. NR_VM_EVENT_ITEMS: system-wide events ------------------------- Rename NR_VM_WRITEBACK_STAT_ITEMS to NR_VM_STAT_ITEMS, to track the system-wide stats, we are going to add per-page metadata stats to this category in the next patch. Also delete unused writeback_stat_name(). 
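The ordering matters because /proc/vmstat is generated from the single flat vmstat_text[] array, with each category's names laid out back to back. A helper in the style of the existing vm_event_name() would index the renamed category like this (vm_stat_name() is only an illustrative name, not something the patch adds):

	static inline const char *vm_stat_name(enum vm_stat_item item)
	{
		return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
				   NR_VM_NUMA_EVENT_ITEMS +
				   NR_VM_NODE_STAT_ITEMS +
				   item];
	}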
Link: https://lkml.kernel.org/r/20240809191020.1142142-2-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-3-pasha.tatashin@soleen.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Pasha Tatashin Suggested-by: Yosry Ahmed Tested-by: Alison Schofield Acked-by: David Hildenbrand Acked-by: David Rientjes Cc: Dan Williams Cc: Domenico Cerasuolo Cc: Joel Granados Cc: Johannes Weiner Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yi Zhang Cc: Fan Ni Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 15 ++++----------- mm/vmstat.c | 6 +++--- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 23cd17942036..9ab4fa5e09b5 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -34,10 +34,11 @@ struct reclaim_stat { unsigned nr_lazyfree_fail; }; -enum writeback_stat_item { +/* Stat data for system wide items */ +enum vm_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, - NR_VM_WRITEBACK_STAT_ITEMS, + NR_VM_STAT_ITEMS, }; #ifdef CONFIG_VM_EVENT_COUNTERS @@ -514,21 +515,13 @@ static inline const char *lru_list_name(enum lru_list lru) return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } -static inline const char *writeback_stat_name(enum writeback_stat_item item) -{ - return vmstat_text[NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_EVENT_ITEMS + - NR_VM_NODE_STAT_ITEMS + - item]; -} - #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) static inline const char *vm_event_name(enum vm_event_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + - NR_VM_WRITEBACK_STAT_ITEMS + + NR_VM_STAT_ITEMS + item]; } #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 04a1cb6cc636..6f8aa4766f16 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1257,7 +1257,7 @@ const char * const vmstat_text[] = { "pgdemote_khugepaged", "nr_memmap", "nr_memmap_boot", - /* enum writeback_stat_item counters */ + /* system-wide enum vm_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", @@ -1790,7 +1790,7 @@ static const struct seq_operations zoneinfo_op = { #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \ NR_VM_NUMA_EVENT_ITEMS + \ NR_VM_NODE_STAT_ITEMS + \ - NR_VM_WRITEBACK_STAT_ITEMS + \ + NR_VM_STAT_ITEMS + \ (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \ NR_VM_EVENT_ITEMS : 0)) @@ -1827,7 +1827,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, v + NR_DIRTY_THRESHOLD); - v += NR_VM_WRITEBACK_STAT_ITEMS; + v += NR_VM_STAT_ITEMS; #ifdef CONFIG_VM_EVENT_COUNTERS all_vm_events(v); From 9d85731110241fb8ca9445ea4177d816041a8825 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 8 Aug 2024 21:34:36 +0000 Subject: [PATCH 0950/1052] mm: don't account memmap per-node Fix invalid access to pgdat during hot-remove operation: ndctl users reported a GPF when trying to destroy a namespace: $ ndctl destroy-namespace all -r all -f Segmentation fault dmesg: Oops: general protection fault, probably for non-canonical address 0xdffffc0000005650: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: probably user-memory-access in range [0x000000000002b280-0x000000000002b287] CPU: 26 UID: 0 PID: 1868 Comm: ndctl Not tainted 6.11.0-rc1 #1 Hardware name: Dell Inc. 
PowerEdge R640/08HT8T, BIOS 2.20.1 09/13/2023 RIP: 0010:mod_node_page_state+0x2a/0x110 cxl-test users report a GPF when trying to unload the test module: $ modrpobe -r cxl-test dmesg BUG: unable to handle page fault for address: 0000000000004200 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 UID: 0 PID: 1076 Comm: modprobe Tainted: G O N 6.11.0-rc1 #197 Tainted: [O]=OOT_MODULE, [N]=TEST Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/15 RIP: 0010:mod_node_page_state+0x6/0x90 Currently, when memory is hot-plugged or hot-removed the accounting is done based on the assumption that memmap is allocated from the same node as the hot-plugged/hot-removed memory, which is not always the case. In addition, there are challenges with keeping the node id of the memory that is being remove to the time when memmap accounting is actually performed: since this is done after remove_pfn_range_from_zone(), and also after remove_memory_block_devices(). Meaning that we cannot use pgdat nor walking though memblocks to get the nid. Given all of that, account the memmap overhead system wide instead. For this we are going to be using global atomic counters, but given that memmap size is rarely modified, and normally is only modified either during early boot when there is only one CPU, or under a hotplug global mutex lock, therefore there is no need for per-cpu optimizations. Also, while we are here rename nr_memmap to nr_memmap_pages, and nr_memmap_boot to nr_memmap_boot_pages to be self explanatory that the units are in page count. [pasha.tatashin@soleen.com: address a few nits from David Hildenbrand] Link: https://lkml.kernel.org/r/20240809191020.1142142-4-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240809191020.1142142-4-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-4-pasha.tatashin@soleen.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Pasha Tatashin Reported-by: Yi Zhang Closes: https://lore.kernel.org/linux-cxl/CAHj4cs9Ax1=CoJkgBGP_+sNu6-6=6v=_L-ZBZY0bVLD3wUWZQg@mail.gmail.com Reported-by: Alison Schofield Closes: https://lore.kernel.org/linux-mm/Zq0tPd2h6alFz8XF@aschofie-mobl2/#t Tested-by: Dan Williams Tested-by: Alison Schofield Acked-by: David Hildenbrand Acked-by: David Rientjes Tested-by: Yi Zhang Cc: Domenico Cerasuolo Cc: Fan Ni Cc: Joel Granados Cc: Johannes Weiner Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- include/linux/vmstat.h | 7 ++++--- mm/hugetlb_vmemmap.c | 8 ++++---- mm/mm_init.c | 3 +-- mm/page_alloc.c | 1 - mm/page_ext.c | 18 ++++------------- mm/sparse-vmemmap.c | 11 ++++------ mm/sparse.c | 5 ++--- mm/vmstat.c | 46 ++++++++++++++++++++---------------------- 9 files changed, 41 insertions(+), 60 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 41458892bc8a..1dc6248feb83 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,8 +220,6 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, - NR_MEMMAP, /* page metadata allocated through buddy allocator */ - NR_MEMMAP_BOOT, /* page metadata allocated through boot allocator */ NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9ab4fa5e09b5..9eb77c9007e6 100644 --- 
a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -38,6 +38,8 @@ struct reclaim_stat { enum vm_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, + NR_MEMMAP_PAGES, /* page metadata allocated through buddy allocator */ + NR_MEMMAP_BOOT_PAGES, /* page metadata allocated through boot allocator */ NR_VM_STAT_ITEMS, }; @@ -618,7 +620,6 @@ static inline void lruvec_stat_sub_folio(struct folio *folio, lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } -void __meminit mod_node_early_perpage_metadata(int nid, long delta); -void __meminit store_early_perpage_metadata(void); - +void memmap_boot_pages_add(long delta); +void memmap_pages_add(long delta); #endif /* _LINUX_VMSTAT_H */ diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4f51e0596197..0c3f56b3578e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -185,11 +185,11 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, static inline void free_vmemmap_page(struct page *page) { if (PageReserved(page)) { + memmap_boot_pages_add(-1); free_bootmem_page(page); - mod_node_page_state(page_pgdat(page), NR_MEMMAP_BOOT, -1); } else { + memmap_pages_add(-1); __free_page(page); - mod_node_page_state(page_pgdat(page), NR_MEMMAP, -1); } } @@ -341,7 +341,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, copy_page(page_to_virt(walk.reuse_page), (void *)walk.reuse_addr); list_add(&walk.reuse_page->lru, vmemmap_pages); - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, 1); + memmap_pages_add(1); } /* @@ -396,7 +396,7 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, goto out; list_add(&page->lru, list); } - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages); + memmap_pages_add(nr_pages); return 0; out: diff --git a/mm/mm_init.c b/mm/mm_init.c index 75c3bd42799b..f9a60ffc5532 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1623,8 +1623,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); pgdat->node_mem_map = map + offset; - mod_node_early_perpage_metadata(pgdat->node_id, - DIV_ROUND_UP(size, PAGE_SIZE)); + memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", __func__, pgdat->node_id, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 28f80daf5c04..875d76e8684a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5755,7 +5755,6 @@ void __init setup_per_cpu_pageset(void) for_each_online_pgdat(pgdat) pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); - store_early_perpage_metadata(); } __meminit void zone_pcp_init(struct zone *zone) diff --git a/mm/page_ext.c b/mm/page_ext.c index c191e490c401..641d93f6af4c 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -214,8 +214,7 @@ static int __init alloc_node_page_ext(int nid) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; total_usage += table_size; - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP_BOOT, - DIV_ROUND_UP(table_size, PAGE_SIZE)); + memmap_boot_pages_add(DIV_ROUND_UP(table_size, PAGE_SIZE)); return 0; } @@ -275,10 +274,8 @@ static void *__meminit alloc_page_ext(size_t size, int nid) else addr = vzalloc_node(size, nid); - if (addr) { - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, - DIV_ROUND_UP(size, PAGE_SIZE)); - } + if (addr) + memmap_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); return addr; } @@ -323,25 +320,18 @@ static void free_page_ext(void 
*addr) { size_t table_size; struct page *page; - struct pglist_data *pgdat; table_size = page_ext_size * PAGES_PER_SECTION; + memmap_pages_add(-1L * (DIV_ROUND_UP(table_size, PAGE_SIZE))); if (is_vmalloc_addr(addr)) { - page = vmalloc_to_page(addr); - pgdat = page_pgdat(page); vfree(addr); } else { page = virt_to_page(addr); - pgdat = page_pgdat(page); BUG_ON(PageReserved(page)); kmemleak_free(addr); free_pages_exact(addr, table_size); } - - mod_node_page_state(pgdat, NR_MEMMAP, - -1L * (DIV_ROUND_UP(table_size, PAGE_SIZE))); - } static void __free_page_ext(unsigned long pfn) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 1dda6c53370b..edcc7a6b0f6f 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -469,13 +469,10 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn, if (r < 0) return NULL; - if (system_state == SYSTEM_BOOTING) { - mod_node_early_perpage_metadata(nid, DIV_ROUND_UP(end - start, - PAGE_SIZE)); - } else { - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, - DIV_ROUND_UP(end - start, PAGE_SIZE)); - } + if (system_state == SYSTEM_BOOTING) + memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); + else + memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); return pfn_to_page(pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index e4b830091d13..0f018c6f9ec5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -463,7 +463,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; #ifndef CONFIG_SPARSEMEM_VMEMMAP - mod_node_early_perpage_metadata(nid, DIV_ROUND_UP(size, PAGE_SIZE)); + memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); #endif } @@ -643,8 +643,7 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); - mod_node_page_state(page_pgdat(pfn_to_page(pfn)), NR_MEMMAP, - -1L * (DIV_ROUND_UP(end - start, PAGE_SIZE))); + memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE))); vmemmap_free(start, end, altmap); } static void free_map_bootmem(struct page *memmap) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6f8aa4766f16..e875f2a4915f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1033,6 +1033,24 @@ unsigned long node_page_state(struct pglist_data *pgdat, } #endif +/* + * Count number of pages "struct page" and "struct page_ext" consume. 
+ * nr_memmap_boot_pages: # of pages allocated by boot allocator + * nr_memmap_pages: # of pages that were allocated by buddy allocator + */ +static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(0); +static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(0); + +void memmap_boot_pages_add(long delta) +{ + atomic_long_add(delta, &nr_memmap_boot_pages); +} + +void memmap_pages_add(long delta) +{ + atomic_long_add(delta, &nr_memmap_pages); +} + #ifdef CONFIG_COMPACTION struct contig_page_info { @@ -1255,11 +1273,11 @@ const char * const vmstat_text[] = { "pgdemote_kswapd", "pgdemote_direct", "pgdemote_khugepaged", - "nr_memmap", - "nr_memmap_boot", /* system-wide enum vm_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", + "nr_memmap_pages", + "nr_memmap_boot_pages", #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) /* enum vm_event_item counters */ @@ -1827,6 +1845,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, v + NR_DIRTY_THRESHOLD); + v[NR_MEMMAP_PAGES] = atomic_long_read(&nr_memmap_pages); + v[NR_MEMMAP_BOOT_PAGES] = atomic_long_read(&nr_memmap_boot_pages); v += NR_VM_STAT_ITEMS; #ifdef CONFIG_VM_EVENT_COUNTERS @@ -2285,25 +2305,3 @@ static int __init extfrag_debug_init(void) module_init(extfrag_debug_init); #endif - -/* - * Page metadata size (struct page and page_ext) in pages - */ -static unsigned long early_perpage_metadata[MAX_NUMNODES] __meminitdata; - -void __meminit mod_node_early_perpage_metadata(int nid, long delta) -{ - early_perpage_metadata[nid] += delta; -} - -void __meminit store_early_perpage_metadata(void) -{ - int nid; - struct pglist_data *pgdat; - - for_each_online_pgdat(pgdat) { - nid = pgdat->node_id; - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP_BOOT, - early_perpage_metadata[nid]); - } -} From d75abd0d0bc29e6ebfebbf76d11b4067b35844af Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 6 Aug 2024 12:41:07 -0400 Subject: [PATCH 0951/1052] mm/memory-failure: use raw_spinlock_t in struct memory_failure_cpu The memory_failure_cpu structure is a per-cpu structure. Access to its content requires the use of get_cpu_var() to lock in the current CPU and disable preemption. The use of a regular spinlock_t for locking purpose is fine for a non-RT kernel. Since the integration of RT spinlock support into the v5.15 kernel, a spinlock_t in a RT kernel becomes a sleeping lock and taking a sleeping lock in a preemption disabled context is illegal resulting in the following kind of warning. [12135.732244] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 [12135.732248] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 270076, name: kworker/0:0 [12135.732252] preempt_count: 1, expected: 0 [12135.732255] RCU nest depth: 2, expected: 2 : [12135.732420] Hardware name: Dell Inc. 
PowerEdge R640/0HG0J8, BIOS 2.10.2 02/24/2021 [12135.732423] Workqueue: kacpi_notify acpi_os_execute_deferred [12135.732433] Call Trace: [12135.732436] [12135.732450] dump_stack_lvl+0x57/0x81 [12135.732461] __might_resched.cold+0xf4/0x12f [12135.732479] rt_spin_lock+0x4c/0x100 [12135.732491] memory_failure_queue+0x40/0xe0 [12135.732503] ghes_do_memory_failure+0x53/0x390 [12135.732516] ghes_do_proc.constprop.0+0x229/0x3e0 [12135.732575] ghes_proc+0xf9/0x1a0 [12135.732591] ghes_notify_hed+0x6a/0x150 [12135.732602] notifier_call_chain+0x43/0xb0 [12135.732626] blocking_notifier_call_chain+0x43/0x60 [12135.732637] acpi_ev_notify_dispatch+0x47/0x70 [12135.732648] acpi_os_execute_deferred+0x13/0x20 [12135.732654] process_one_work+0x41f/0x500 [12135.732695] worker_thread+0x192/0x360 [12135.732715] kthread+0x111/0x140 [12135.732733] ret_from_fork+0x29/0x50 [12135.732779] Fix it by using a raw_spinlock_t for locking instead. Also move the pr_err() out of the lock critical section and after put_cpu_ptr() to avoid indeterminate latency and the possibility of sleep with this call. [longman@redhat.com: don't hold percpu ref across pr_err(), per Miaohe] Link: https://lkml.kernel.org/r/20240807181130.1122660-1-longman@redhat.com Link: https://lkml.kernel.org/r/20240806164107.1044956-1-longman@redhat.com Fixes: 0f383b6dc96e ("locking/spinlock: Provide RT variant") Signed-off-by: Waiman Long Acked-by: Miaohe Lin Cc: "Huang, Ying" Cc: Juri Lelli Cc: Len Brown Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 581d3e5c9117..7066fc84f351 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2417,7 +2417,7 @@ struct memory_failure_entry { struct memory_failure_cpu { DECLARE_KFIFO(fifo, struct memory_failure_entry, MEMORY_FAILURE_FIFO_SIZE); - spinlock_t lock; + raw_spinlock_t lock; struct work_struct work; }; @@ -2443,20 +2443,22 @@ void memory_failure_queue(unsigned long pfn, int flags) { struct memory_failure_cpu *mf_cpu; unsigned long proc_flags; + bool buffer_overflow; struct memory_failure_entry entry = { .pfn = pfn, .flags = flags, }; mf_cpu = &get_cpu_var(memory_failure_cpu); - spin_lock_irqsave(&mf_cpu->lock, proc_flags); - if (kfifo_put(&mf_cpu->fifo, entry)) + raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags); + buffer_overflow = !kfifo_put(&mf_cpu->fifo, entry); + if (!buffer_overflow) schedule_work_on(smp_processor_id(), &mf_cpu->work); - else + raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + put_cpu_var(memory_failure_cpu); + if (buffer_overflow) pr_err("buffer overflow when queuing memory failure at %#lx\n", pfn); - spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); - put_cpu_var(memory_failure_cpu); } EXPORT_SYMBOL_GPL(memory_failure_queue); @@ -2469,9 +2471,9 @@ static void memory_failure_work_func(struct work_struct *work) mf_cpu = container_of(work, struct memory_failure_cpu, work); for (;;) { - spin_lock_irqsave(&mf_cpu->lock, proc_flags); + raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); - spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); if (!gotten) break; if (entry.flags & MF_SOFT_OFFLINE) @@ -2501,7 +2503,7 @@ static int __init memory_failure_init(void) for_each_possible_cpu(cpu) { mf_cpu = &per_cpu(memory_failure_cpu, cpu); - spin_lock_init(&mf_cpu->lock); + raw_spin_lock_init(&mf_cpu->lock); 
INIT_KFIFO(mf_cpu->fifo); INIT_WORK(&mf_cpu->work, memory_failure_work_func); } From 61ebe5a747da649057c37be1c37eb934b4af79ca Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Thu, 8 Aug 2024 20:19:56 +0800 Subject: [PATCH 0952/1052] mm/vmalloc: fix page mapping if vm_area_alloc_pages() with high order fallback to order 0 The __vmap_pages_range_noflush() assumes its argument pages** contains pages with the same page shift. However, since commit e9c3cda4d86e ("mm, vmalloc: fix high order __GFP_NOFAIL allocations"), if gfp_flags includes __GFP_NOFAIL with high order in vm_area_alloc_pages() and page allocation failed for high order, the pages** may contain two different page shifts (high order and order-0). This could lead __vmap_pages_range_noflush() to perform incorrect mappings, potentially resulting in memory corruption. Users might encounter this as follows (vmap_allow_huge = true, 2M is for PMD_SIZE): kvmalloc(2M, __GFP_NOFAIL|GFP_X) __vmalloc_node_range_noprof(vm_flags=VM_ALLOW_HUGE_VMAP) vm_area_alloc_pages(order=9) ---> order-9 allocation failed and fallback to order-0 vmap_pages_range() vmap_pages_range_noflush() __vmap_pages_range_noflush(page_shift = 21) ----> wrong mapping happens We can remove the fallback code because if a high-order allocation fails, __vmalloc_node_range_noprof() will retry with order-0. Therefore, it is unnecessary to fallback to order-0 here. Therefore, fix this by removing the fallback code. Link: https://lkml.kernel.org/r/20240808122019.3361-1-hailong.liu@oppo.com Fixes: e9c3cda4d86e ("mm, vmalloc: fix high order __GFP_NOFAIL allocations") Signed-off-by: Hailong Liu Reported-by: Tangquan Zheng Reviewed-by: Baoquan He Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Barry Song Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6b783baf12a1..af2de36549d6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3584,15 +3584,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, page = alloc_pages_noprof(alloc_gfp, order); else page = alloc_pages_node_noprof(nid, alloc_gfp, order); - if (unlikely(!page)) { - if (!nofail) - break; - - /* fall back to the zero order allocations */ - alloc_gfp |= __GFP_NOFAIL; - order = 0; - continue; - } + if (unlikely(!page)) + break; /* * Higher order allocations must be able to be treated as From 40b760cfd44566bca791c80e0720d70d75382b84 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 9 Aug 2024 10:59:04 -0400 Subject: [PATCH 0953/1052] mm/numa: no task_numa_fault() call if PTE is changed When handling a numa page fault, task_numa_fault() should be called by a process that restores the page table of the faulted folio to avoid duplicated stats counting. Commit b99a342d4f11 ("NUMA balancing: reduce TLB flush via delaying mapping on hint page fault") restructured do_numa_page() and did not avoid task_numa_fault() call in the second page table check after a numa migration failure. Fix it by making all !pte_same() return immediately. This issue can cause task_numa_fault() being called more than necessary and lead to unexpected numa balancing results (It is hard to tell whether the issue will cause positive or negative performance impact due to duplicated numa fault counting). 
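Condensed, the rule the fix enforces in do_numa_page() looks like this (a sketch distilled from the diff below, not a complete function): only the path that still owns an up-to-date page table reports the fault, and every bail-out returns without touching the NUMA stats.

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (unlikely(!vmf->pte))
		return 0;			/* raced with someone else: no accounting */
	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;			/* PTE already changed: no accounting */
	}
	/* ... restore the PTE ... */
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	task_numa_fault(last_cpupid, nid, nr_pages, flags);	/* counted exactly once */
	return 0;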
Link: https://lkml.kernel.org/r/20240809145906.1513458-2-ziy@nvidia.com Fixes: b99a342d4f11 ("NUMA balancing: reduce TLB flush via delaying mapping on hint page fault") Signed-off-by: Zi Yan Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.intel.com/ Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Kefeng Wang Cc: Mel Gorman Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 34f8402d2046..3c01d68065be 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5295,7 +5295,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; + return 0; } pte = pte_modify(old_pte, vma->vm_page_prot); @@ -5358,23 +5358,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!migrate_misplaced_folio(folio, vma, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; - } else { - flags |= TNF_MIGRATE_FAIL; - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - if (unlikely(!vmf->pte)) - goto out; - if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; - } - goto out_map; + task_numa_fault(last_cpupid, nid, nr_pages, flags); + return 0; } -out: - if (nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, nid, nr_pages, flags); - return 0; + flags |= TNF_MIGRATE_FAIL; + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) + return 0; + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; + } out_map: /* * Make it present again, depending on how arch implements @@ -5387,7 +5383,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, writable); pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; + + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, nr_pages, flags); + return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) From fd8c35a92910f4829b7c99841f39b1b952c259d5 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 9 Aug 2024 10:59:05 -0400 Subject: [PATCH 0954/1052] mm/numa: no task_numa_fault() call if PMD is changed When handling a numa page fault, task_numa_fault() should be called by a process that restores the page table of the faulted folio to avoid duplicated stats counting. Commit c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling") restructured do_huge_pmd_numa_page() and did not avoid task_numa_fault() call in the second page table check after a numa migration failure. Fix it by making all !pmd_same() return immediately. This issue can cause task_numa_fault() being called more than necessary and lead to unexpected numa balancing results (It is hard to tell whether the issue will cause positive or negative performance impact due to duplicated numa fault counting). 
Link: https://lkml.kernel.org/r/20240809145906.1513458-3-ziy@nvidia.com Fixes: c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling") Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.intel.com/ Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4be468e06a4..67c86a5d64a6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1685,7 +1685,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { spin_unlock(vmf->ptl); - goto out; + return 0; } pmd = pmd_modify(oldpmd, vma->vm_page_prot); @@ -1728,22 +1728,16 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) if (!migrate_misplaced_folio(folio, vma, target_nid)) { flags |= TNF_MIGRATED; nid = target_nid; - } else { - flags |= TNF_MIGRATE_FAIL; - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { - spin_unlock(vmf->ptl); - goto out; - } - goto out_map; + task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); + return 0; } -out: - if (nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); - - return 0; - + flags |= TNF_MIGRATE_FAIL; + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + spin_unlock(vmf->ptl); + return 0; + } out_map: /* Restore the PMD */ pmd = pmd_modify(oldpmd, vma->vm_page_prot); @@ -1753,7 +1747,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); - goto out; + + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); + return 0; } /* From af3b7d09a9934220a8136065a0e6985fe0b67a1b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 9 Aug 2024 15:32:30 +0300 Subject: [PATCH 0955/1052] selftests/mm: compaction_test: fix off by one in check_compaction() The "initial_nr_hugepages" variable is unsigned long so it takes up to 20 characters to print, plus 1 more character for the NUL terminator. Unfortunately, this buffer is not quite large enough for the terminator to fit. Also use snprintf() for a belt and suspenders approach. 
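For reference, the sizing arithmetic as a standalone userspace snippet (not part of the selftest itself): the largest unsigned long on a 64-bit system has 20 decimal digits, so with the NUL terminator the buffer must hold at least 21 bytes, one more than the original char[20].

	#include <limits.h>
	#include <stdio.h>

	int main(void)
	{
		char buf[24];
		int n = snprintf(buf, sizeof(buf), "%lu", ULONG_MAX);

		/* prints 20: "18446744073709551615" is 20 digits, plus '\0' */
		printf("%d\n", n);
		return 0;
	}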
Link: https://lkml.kernel.org/r/87470c06-b45a-4e83-92ff-aac2e7b9c6ba@stanley.mountain Fixes: fb9293b6b015 ("selftests/mm: compaction_test: fix bogus test success and reduce probability of OOM-killer invocation") Signed-off-by: Dan Carpenter Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/compaction_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index e140558e6f53..2c3a0eb6b22d 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -89,9 +89,10 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size, int fd, ret = -1; int compaction_index = 0; char nr_hugepages[20] = {0}; - char init_nr_hugepages[20] = {0}; + char init_nr_hugepages[24] = {0}; - sprintf(init_nr_hugepages, "%lu", initial_nr_hugepages); + snprintf(init_nr_hugepages, sizeof(init_nr_hugepages), + "%lu", initial_nr_hugepages); /* We want to test with 80% of available memory. Else, OOM killer comes in to play */ From 807174a93d24c456503692dc3f5af322ee0b640a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 9 Aug 2024 14:48:47 +0300 Subject: [PATCH 0956/1052] mm: fix endless reclaim on machines with unaccepted memory Unaccepted memory is considered unusable free memory, which is not counted as free on the zone watermark check. This causes get_page_from_freelist() to accept more memory to hit the high watermark, but it creates problems in the reclaim path. The reclaim path encounters a failed zone watermark check and attempts to reclaim memory. This is usually successful, but if there is little or no reclaimable memory, it can result in endless reclaim with little to no progress. This can occur early in the boot process, just after start of the init process when the only reclaimable memory is the page cache of the init executable and its libraries. Make unaccepted memory free from watermark check point of view. This way unaccepted memory will never be the trigger of memory reclaim. Accept more memory in the get_page_from_freelist() if needed. Link: https://lkml.kernel.org/r/20240809114854.3745464-2-kirill.shutemov@linux.intel.com Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory") Signed-off-by: Kirill A. Shutemov Reported-by: Jianxiong Gao Acked-by: David Hildenbrand Tested-by: Jianxiong Gao Cc: Borislav Petkov Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport (Microsoft) Cc: Tom Lendacky Cc: Vlastimil Babka Cc: [6.5+] Signed-off-by: Andrew Morton --- mm/page_alloc.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 875d76e8684a..8747087acee3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -287,7 +287,7 @@ EXPORT_SYMBOL(nr_online_nodes); static bool page_contains_unaccepted(struct page *page, unsigned int order); static void accept_page(struct page *page, unsigned int order); -static bool try_to_accept_memory(struct zone *zone, unsigned int order); +static bool cond_accept_memory(struct zone *zone, unsigned int order); static inline bool has_unaccepted_memory(void); static bool __free_unaccepted(struct page *page); @@ -3072,9 +3072,6 @@ static inline long __zone_watermark_unusable_free(struct zone *z, if (!(alloc_flags & ALLOC_CMA)) unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); #endif -#ifdef CONFIG_UNACCEPTED_MEMORY - unusable_free += zone_page_state(z, NR_UNACCEPTED); -#endif return unusable_free; } @@ -3368,6 +3365,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } } + cond_accept_memory(zone, order); + /* * Detect whether the number of free pages is below high * watermark. If so, we will decrease pcp->high and free @@ -3393,10 +3392,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, gfp_mask)) { int ret; - if (has_unaccepted_memory()) { - if (try_to_accept_memory(zone, order)) - goto try_this_zone; - } + if (cond_accept_memory(zone, order)) + goto try_this_zone; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* @@ -3450,10 +3447,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, return page; } else { - if (has_unaccepted_memory()) { - if (try_to_accept_memory(zone, order)) - goto try_this_zone; - } + if (cond_accept_memory(zone, order)) + goto try_this_zone; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ @@ -6950,9 +6945,6 @@ static bool try_to_accept_memory_one(struct zone *zone) struct page *page; bool last; - if (list_empty(&zone->unaccepted_pages)) - return false; - spin_lock_irqsave(&zone->lock, flags); page = list_first_entry_or_null(&zone->unaccepted_pages, struct page, lru); @@ -6978,23 +6970,29 @@ static bool try_to_accept_memory_one(struct zone *zone) return true; } -static bool try_to_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order) { long to_accept; - int ret = false; + bool ret = false; + + if (!has_unaccepted_memory()) + return false; + + if (list_empty(&zone->unaccepted_pages)) + return false; /* How much to accept to get to high watermark? 
*/ to_accept = high_wmark_pages(zone) - (zone_page_state(zone, NR_FREE_PAGES) - - __zone_watermark_unusable_free(zone, order, 0)); + __zone_watermark_unusable_free(zone, order, 0) - + zone_page_state(zone, NR_UNACCEPTED)); - /* Accept at least one page */ - do { + while (to_accept > 0) { if (!try_to_accept_memory_one(zone)) break; ret = true; to_accept -= MAX_ORDER_NR_PAGES; - } while (to_accept > 0); + } return ret; } @@ -7037,7 +7035,7 @@ static void accept_page(struct page *page, unsigned int order) { } -static bool try_to_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order) { return false; } From 7c5e8d212d7d81991a580e7de3904ea213d9a852 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 9 Aug 2024 12:56:42 +0500 Subject: [PATCH 0957/1052] selftests: memfd_secret: don't build memfd_secret test on unsupported arches [1] mentions that memfd_secret is only supported on arm64, riscv, x86 and x86_64 for now. It doesn't support other architectures. I found the build error on arm and decided to send the fix as it was creating noise on KernelCI: memfd_secret.c: In function 'memfd_secret': memfd_secret.c:42:24: error: '__NR_memfd_secret' undeclared (first use in this function); did you mean 'memfd_secret'? 42 | return syscall(__NR_memfd_secret, flags); | ^~~~~~~~~~~~~~~~~ | memfd_secret Hence I'm adding condition that memfd_secret should only be compiled on supported architectures. Also check in run_vmtests script if memfd_secret binary is present before executing it. Link: https://lkml.kernel.org/r/20240812061522.1933054-1-usama.anjum@collabora.com Link: https://lore.kernel.org/all/20210518072034.31572-7-rppt@kernel.org/ [1] Link: https://lkml.kernel.org/r/20240809075642.403247-1-usama.anjum@collabora.com Fixes: 76fe17ef588a ("secretmem: test: add basic selftest for memfd_secret(2)") Signed-off-by: Muhammad Usama Anjum Reviewed-by: Shuah Khan Acked-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: James Bottomley Cc: Mike Rapoport (Microsoft) Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 ++ tools/testing/selftests/mm/run_vmtests.sh | 3 +++ 2 files changed, 5 insertions(+) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 7b8a5def54a1..cfad627e8d94 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -53,7 +53,9 @@ TEST_GEN_FILES += madv_populate TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate +ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64)) TEST_GEN_FILES += memfd_secret +endif TEST_GEN_FILES += migration TEST_GEN_FILES += mkdirty TEST_GEN_FILES += mlock-random-test diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 03ac4f2e1cce..36045edb10de 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -374,8 +374,11 @@ CATEGORY="hmm" run_test bash ./test_hmm.sh smoke # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate +if [ -x ./memfd_secret ] +then (echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix CATEGORY="memfd_secret" run_test ./memfd_secret +fi # KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100 CATEGORY="ksm" run_test ./ksm_tests -H -s 100 From edb907a6133323e19311901a39dee68b1c6a2ef8 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan 
Date: Mon, 12 Aug 2024 14:20:17 +0800 Subject: [PATCH 0958/1052] crash: fix riscv64 crash memory reserve dead loop On RISCV64 Qemu machine with 512MB memory, cmdline "crashkernel=500M,high" will cause system stall as below: Zone ranges: DMA32 [mem 0x0000000080000000-0x000000009fffffff] Normal empty Movable zone start for each node Early memory node ranges node 0: [mem 0x0000000080000000-0x000000008005ffff] node 0: [mem 0x0000000080060000-0x000000009fffffff] Initmem setup node 0 [mem 0x0000000080000000-0x000000009fffffff] (stall here) commit 5d99cadf1568 ("crash: fix x86_32 crash memory reserve dead loop bug") fix this on 32-bit architecture. However, the problem is not completely solved. If `CRASH_ADDR_LOW_MAX = CRASH_ADDR_HIGH_MAX` on 64-bit architecture, for example, when system memory is equal to CRASH_ADDR_LOW_MAX on RISCV64, the following infinite loop will also occur: -> reserve_crashkernel_generic() and high is true -> alloc at [CRASH_ADDR_LOW_MAX, CRASH_ADDR_HIGH_MAX] fail -> alloc at [0, CRASH_ADDR_LOW_MAX] fail and repeatedly (because CRASH_ADDR_LOW_MAX = CRASH_ADDR_HIGH_MAX). As Catalin suggested, do not remove the ",high" reservation fallback to ",low" logic which will change arm64's kdump behavior, but fix it by skipping the above situation similar to commit d2f32f23190b ("crash: fix x86_32 crash memory reserve dead loop"). After this patch, it print: cannot allocate crashkernel (size:0x1f400000) Link: https://lkml.kernel.org/r/20240812062017.2674441-1-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Acked-by: Baoquan He Cc: Albert Ou Cc: Dave Young Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_reserve.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index d3b4cd12bdd1..64d44a52c011 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -423,7 +423,8 @@ void __init reserve_crashkernel_generic(char *cmdline, if (high && search_end == CRASH_ADDR_HIGH_MAX) { search_end = CRASH_ADDR_LOW_MAX; search_base = 0; - goto retry; + if (search_end != CRASH_ADDR_HIGH_MAX) + goto retry; } pr_warn("cannot allocate crashkernel (size:0x%llx)\n", crash_size); From a8fc28dad6d574582cdf2f7e78c73c59c623df30 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 13 Aug 2024 08:07:56 -0700 Subject: [PATCH 0959/1052] alloc_tag: introduce clear_page_tag_ref() helper function In several cases we are freeing pages which were not allocated using common page allocators. For such cases, in order to keep allocation accounting correct, we should clear the page tag to indicate that the page being freed is expected to not have a valid allocation tag. Introduce clear_page_tag_ref() helper function to be used for this. 
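For reference, the helper simply wraps the open-coded sequence that each of these free paths used to carry (both forms are visible in the diff below):

	/* before: open-coded at every call site */
	if (mem_alloc_profiling_enabled()) {
		union codetag_ref *ref = get_page_tag_ref(page);

		if (ref) {
			set_codetag_empty(ref);
			put_page_tag_ref(ref);
		}
	}

	/* after: */
	clear_page_tag_ref(page);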
Link: https://lkml.kernel.org/r/20240813150758.855881-1-surenb@google.com Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty") Signed-off-by: Suren Baghdasaryan Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Pasha Tatashin Cc: Kees Cook Cc: Kent Overstreet Cc: Sourav Panda Cc: Vlastimil Babka Cc: [6.10] Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 13 +++++++++++++ mm/mm_init.c | 10 +--------- mm/page_alloc.c | 9 +-------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 18cd0c0c73d9..207f0c83c8e9 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -43,6 +43,18 @@ static inline void put_page_tag_ref(union codetag_ref *ref) page_ext_put(page_ext_from_codetag_ref(ref)); } +static inline void clear_page_tag_ref(struct page *page) +{ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } +} + static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) { @@ -126,6 +138,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } static inline void put_page_tag_ref(union codetag_ref *ref) {} +static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} diff --git a/mm/mm_init.c b/mm/mm_init.c index f9a60ffc5532..adc3127573cd 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2459,15 +2459,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, } /* pages were reserved and not allocated */ - if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); - } - } - + clear_page_tag_ref(page); __free_pages_core(page, order, MEMINIT_EARLY); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8747087acee3..c565de8f48e9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5815,14 +5815,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char void free_reserved_page(struct page *page) { - if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); - } - } + clear_page_tag_ref(page); ClearPageReserved(page); init_page_count(page); __free_page(page); From 766c163c2068b45330664fb67df67268e588a22d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 13 Aug 2024 08:07:57 -0700 Subject: [PATCH 0960/1052] alloc_tag: mark pages reserved during CMA activation as not tagged During CMA activation, pages in CMA area are prepared and then freed without being allocated. This triggers warnings when memory allocation debug config (CONFIG_MEM_ALLOC_PROFILING_DEBUG) is enabled. Fix this by marking these pages not tagged before freeing them. 
Link: https://lkml.kernel.org/r/20240813150758.855881-2-surenb@google.com Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty") Signed-off-by: Suren Baghdasaryan Acked-by: David Hildenbrand Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Cc: [6.10] Signed-off-by: Andrew Morton --- mm/mm_init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/mm_init.c b/mm/mm_init.c index adc3127573cd..51960079875b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2244,6 +2244,8 @@ void __init init_cma_reserved_pageblock(struct page *page) set_pageblock_migratetype(page, MIGRATE_CMA); set_page_refcounted(page); + /* pages were reserved and not allocated */ + clear_page_tag_ref(page); __free_pages(page, pageblock_order); adjust_managed_page_count(page, pageblock_nr_pages); From 2e6506e1c4eed2676a8412231046f31e10e240da Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 29 Jul 2024 10:13:06 +0800 Subject: [PATCH 0961/1052] mm/migrate: fix deadlock in migrate_pages_batch() on large folios Currently, migrate_pages_batch() can lock multiple locked folios with an arbitrary order. Although folio_trylock() is used to avoid deadlock as commit 2ef7dbb26990 ("migrate_pages: try migrate in batch asynchronously firstly") mentioned, it seems try_split_folio() is still missing. It was found by compaction stress test when I explicitly enable EROFS compressed files to use large folios, which case I cannot reproduce with the same workload if large folio support is off (current mainline). Typically, filesystem reads (with locked file-backed folios) could use another bdev/meta inode to load some other I/Os (e.g. inode extent metadata or caching compressed data), so the locking order will be: file-backed folios (A) bdev/meta folios (B) The following calltrace shows the deadlock: Thread 1 takes (B) lock and tries to take folio (A) lock Thread 2 takes (A) lock and tries to take folio (B) lock [Thread 1] INFO: task stress:1824 blocked for more than 30 seconds. Tainted: G OE 6.10.0-rc7+ #6 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:stress state:D stack:0 pid:1824 tgid:1824 ppid:1822 flags:0x0000000c Call trace: __switch_to+0xec/0x138 __schedule+0x43c/0xcb0 schedule+0x54/0x198 io_schedule+0x44/0x70 folio_wait_bit_common+0x184/0x3f8 <-- folio mapping ffff00036d69cb18 index 996 (**) __folio_lock+0x24/0x38 migrate_pages_batch+0x77c/0xea0 // try_split_folio (mm/migrate.c:1486:2) // migrate_pages_batch (mm/migrate.c:1734:16) <--- LIST_HEAD(unmap_folios) has .. folio mapping 0xffff0000d184f1d8 index 1711; (*) folio mapping 0xffff0000d184f1d8 index 1712; .. migrate_pages+0xb28/0xe90 compact_zone+0xa08/0x10f0 compact_node+0x9c/0x180 sysctl_compaction_handler+0x8c/0x118 proc_sys_call_handler+0x1a8/0x280 proc_sys_write+0x1c/0x30 vfs_write+0x240/0x380 ksys_write+0x78/0x118 __arm64_sys_write+0x24/0x38 invoke_syscall+0x78/0x108 el0_svc_common.constprop.0+0x48/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x3c/0x148 el0t_64_sync_handler+0x100/0x130 el0t_64_sync+0x190/0x198 [Thread 2] INFO: task stress:1825 blocked for more than 30 seconds. Tainted: G OE 6.10.0-rc7+ #6 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
task:stress state:D stack:0 pid:1825 tgid:1825 ppid:1822 flags:0x0000000c Call trace: __switch_to+0xec/0x138 __schedule+0x43c/0xcb0 schedule+0x54/0x198 io_schedule+0x44/0x70 folio_wait_bit_common+0x184/0x3f8 <-- folio = 0xfffffdffc6b503c0 (mapping == 0xffff0000d184f1d8 index == 1711) (*) __folio_lock+0x24/0x38 z_erofs_runqueue+0x384/0x9c0 [erofs] z_erofs_readahead+0x21c/0x350 [erofs] <-- folio mapping 0xffff00036d69cb18 range from [992, 1024] (**) read_pages+0x74/0x328 page_cache_ra_order+0x26c/0x348 ondemand_readahead+0x1c0/0x3a0 page_cache_sync_ra+0x9c/0xc0 filemap_get_pages+0xc4/0x708 filemap_read+0x104/0x3a8 generic_file_read_iter+0x4c/0x150 vfs_read+0x27c/0x330 ksys_pread64+0x84/0xd0 __arm64_sys_pread64+0x28/0x40 invoke_syscall+0x78/0x108 el0_svc_common.constprop.0+0x48/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x3c/0x148 el0t_64_sync_handler+0x100/0x130 el0t_64_sync+0x190/0x198 Link: https://lkml.kernel.org/r/20240729021306.398286-1-hsiangkao@linux.alibaba.com Fixes: 5dfab109d519 ("migrate_pages: batch _unmap and _move") Signed-off-by: Gao Xiang Reviewed-by: "Huang, Ying" Acked-by: David Hildenbrand Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index e7296c0fb5d5..923ea80ba744 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1479,11 +1479,17 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, return rc; } -static inline int try_split_folio(struct folio *folio, struct list_head *split_folios) +static inline int try_split_folio(struct folio *folio, struct list_head *split_folios, + enum migrate_mode mode) { int rc; - folio_lock(folio); + if (mode == MIGRATE_ASYNC) { + if (!folio_trylock(folio)) + return -EAGAIN; + } else { + folio_lock(folio); + } rc = split_folio_to_list(folio, split_folios); folio_unlock(folio); if (!rc) @@ -1677,7 +1683,7 @@ static int migrate_pages_batch(struct list_head *from, */ if (nr_pages > 2 && !list_empty(&folio->_deferred_list)) { - if (try_split_folio(folio, split_folios) == 0) { + if (!try_split_folio(folio, split_folios, mode)) { nr_failed++; stats->nr_thp_failed += is_thp; stats->nr_thp_split += is_thp; @@ -1699,7 +1705,7 @@ static int migrate_pages_batch(struct list_head *from, if (!thp_migration_supported() && is_thp) { nr_failed++; stats->nr_thp_failed++; - if (!try_split_folio(folio, split_folios)) { + if (!try_split_folio(folio, split_folios, mode)) { stats->nr_thp_split++; stats->nr_split++; continue; @@ -1731,7 +1737,7 @@ static int migrate_pages_batch(struct list_head *from, stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (is_large && !nosplit) { - int ret = try_split_folio(folio, split_folios); + int ret = try_split_folio(folio, split_folios, mode); if (!ret) { stats->nr_thp_split += is_thp; From ad899c301c880766cc709aad277991b3ab671b66 Mon Sep 17 00:00:00 2001 From: Eli Billauer Date: Fri, 16 Aug 2024 10:01:59 +0300 Subject: [PATCH 0962/1052] char: xillybus: Refine workqueue handling As the wakeup work item now runs on a separate workqueue, it needs to be flushed separately along with flushing the device's workqueue. Also, move the destroy_workqueue() call to the end of the exit method, so that deinitialization is done in the opposite order of initialization. 
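As a hedged illustration of the ordering rule being applied here (all names below are placeholders, not taken from the xillyusb driver): teardown should mirror setup in reverse, so the workqueue is destroyed only after the USB driver, and with it every potential queuer of work, has been deregistered.

#include <linux/module.h>
#include <linux/usb.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;     /* stands in for wakeup_wq */
static struct usb_driver example_driver;        /* placeholder driver */

static int __init example_init(void)
{
        int rc;

        example_wq = alloc_workqueue("example_wq", 0, 0);  /* 1. resource  */
        if (!example_wq)
                return -ENOMEM;

        rc = usb_register(&example_driver);                /* 2. expose it */
        if (rc)
                destroy_workqueue(example_wq);
        return rc;
}

static void __exit example_exit(void)
{
        usb_deregister(&example_driver);   /* 2'. users (and queuers) gone  */
        destroy_workqueue(example_wq);     /* 1'. only then drop the queue  */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");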
Fixes: ccbde4b128ef ("char: xillybus: Don't destroy workqueue from work item running on it") Cc: stable Signed-off-by: Eli Billauer Link: https://lore.kernel.org/r/20240816070200.50695-1-eli.billauer@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/xillybus/xillyusb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/char/xillybus/xillyusb.c b/drivers/char/xillybus/xillyusb.c index 33ca0f4af390..e12d359194f8 100644 --- a/drivers/char/xillybus/xillyusb.c +++ b/drivers/char/xillybus/xillyusb.c @@ -2093,9 +2093,11 @@ static int xillyusb_discovery(struct usb_interface *interface) * just after responding with the IDT, there is no reason for any * work item to be running now. To be sure that xdev->channels * is updated on anything that might run in parallel, flush the - * workqueue, which rarely does anything. + * device's workqueue and the wakeup work item. This rarely + * does anything. */ flush_workqueue(xdev->workq); + flush_work(&xdev->wakeup_workitem); xdev->num_channels = num_channels; @@ -2274,9 +2276,9 @@ static int __init xillyusb_init(void) static void __exit xillyusb_exit(void) { - destroy_workqueue(wakeup_wq); - usb_deregister(&xillyusb_driver); + + destroy_workqueue(wakeup_wq); } module_init(xillyusb_init); From 2374bf7558de915edc6ec8cb10ec3291dfab9594 Mon Sep 17 00:00:00 2001 From: Eli Billauer Date: Fri, 16 Aug 2024 10:02:00 +0300 Subject: [PATCH 0963/1052] char: xillybus: Check USB endpoints when probing device Ensure, as the driver probes the device, that all endpoints that the driver may attempt to access exist and are of the correct type. All XillyUSB devices must have a Bulk IN and Bulk OUT endpoint at address 1. This is verified in xillyusb_setup_base_eps(). On top of that, a XillyUSB device may have additional Bulk OUT endpoints. The information about these endpoints' addresses is deduced from a data structure (the IDT) that the driver fetches from the device while probing it. These endpoints are checked in setup_channels(). A XillyUSB device never has more than one IN endpoint, as all data towards the host is multiplexed in this single Bulk IN endpoint. This is why setup_channels() only checks OUT endpoints. Reported-by: syzbot+eac39cba052f2e750dbe@syzkaller.appspotmail.com Cc: stable Closes: https://lore.kernel.org/all/0000000000001d44a6061f7a54ee@google.com/T/ Fixes: a53d1202aef1 ("char: xillybus: Add driver for XillyUSB (Xillybus variant for USB)"). 
Signed-off-by: Eli Billauer Link: https://lore.kernel.org/r/20240816070200.50695-2-eli.billauer@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/xillybus/xillyusb.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/char/xillybus/xillyusb.c b/drivers/char/xillybus/xillyusb.c index e12d359194f8..45771b1a3716 100644 --- a/drivers/char/xillybus/xillyusb.c +++ b/drivers/char/xillybus/xillyusb.c @@ -1903,6 +1903,13 @@ static const struct file_operations xillyusb_fops = { static int xillyusb_setup_base_eps(struct xillyusb_dev *xdev) { + struct usb_device *udev = xdev->udev; + + /* Verify that device has the two fundamental bulk in/out endpoints */ + if (usb_pipe_type_check(udev, usb_sndbulkpipe(udev, MSG_EP_NUM)) || + usb_pipe_type_check(udev, usb_rcvbulkpipe(udev, IN_EP_NUM))) + return -ENODEV; + xdev->msg_ep = endpoint_alloc(xdev, MSG_EP_NUM | USB_DIR_OUT, bulk_out_work, 1, 2); if (!xdev->msg_ep) @@ -1932,14 +1939,15 @@ static int setup_channels(struct xillyusb_dev *xdev, __le16 *chandesc, int num_channels) { - struct xillyusb_channel *chan; + struct usb_device *udev = xdev->udev; + struct xillyusb_channel *chan, *new_channels; int i; chan = kcalloc(num_channels, sizeof(*chan), GFP_KERNEL); if (!chan) return -ENOMEM; - xdev->channels = chan; + new_channels = chan; for (i = 0; i < num_channels; i++, chan++) { unsigned int in_desc = le16_to_cpu(*chandesc++); @@ -1968,6 +1976,15 @@ static int setup_channels(struct xillyusb_dev *xdev, */ if ((out_desc & 0x80) && i < 14) { /* Entry is valid */ + if (usb_pipe_type_check(udev, + usb_sndbulkpipe(udev, i + 2))) { + dev_err(xdev->dev, + "Missing BULK OUT endpoint %d\n", + i + 2); + kfree(new_channels); + return -ENODEV; + } + chan->writable = 1; chan->out_synchronous = !!(out_desc & 0x40); chan->out_seekable = !!(out_desc & 0x20); @@ -1977,6 +1994,7 @@ static int setup_channels(struct xillyusb_dev *xdev, } } + xdev->channels = new_channels; return 0; } From b9b6ee6fe258ce4d89592593efcd3d798c418859 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Aug 2024 16:25:19 +0200 Subject: [PATCH 0964/1052] thermal: gov_bang_bang: Call __thermal_cdev_update() directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of clearing the "updated" flag for each cooling device affected by the trip point crossing in bang_bang_control() and walking all thermal instances to run thermal_cdev_update() for all of the affected cooling devices, call __thermal_cdev_update() directly for each of them. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Acked-by: Peter Kästle Reviewed-by: Zhang Rui Cc: 6.10+ # 6.10+ Link: https://patch.msgid.link/13583081.uLZWGnKmhe@rjwysocki.net --- drivers/thermal/gov_bang_bang.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c index 4a2e869b9538..b9474c6af72b 100644 --- a/drivers/thermal/gov_bang_bang.c +++ b/drivers/thermal/gov_bang_bang.c @@ -71,12 +71,9 @@ static void bang_bang_control(struct thermal_zone_device *tz, dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target); mutex_lock(&instance->cdev->lock); - instance->cdev->updated = false; /* cdev needs update */ + __thermal_cdev_update(instance->cdev); mutex_unlock(&instance->cdev->lock); } - - list_for_each_entry(instance, &tz->thermal_instances, tz_node) - thermal_cdev_update(instance->cdev); } static struct thermal_governor thermal_gov_bang_bang = { From 84248e35d9b60e03df7276627e4e91fbaf80f73d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Aug 2024 16:26:42 +0200 Subject: [PATCH 0965/1052] thermal: gov_bang_bang: Split bang_bang_control() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the setting of the thermal instance target state from bang_bang_control() into a separate function that will be also called in a different place going forward. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Kästle Reviewed-by: Zhang Rui Cc: 6.10+ # 6.10+ Link: https://patch.msgid.link/3313587.aeNJFYEL58@rjwysocki.net --- drivers/thermal/gov_bang_bang.c | 42 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c index b9474c6af72b..87cff3ea77a9 100644 --- a/drivers/thermal/gov_bang_bang.c +++ b/drivers/thermal/gov_bang_bang.c @@ -13,6 +13,27 @@ #include "thermal_core.h" +static void bang_bang_set_instance_target(struct thermal_instance *instance, + unsigned int target) +{ + if (instance->target != 0 && instance->target != 1 && + instance->target != THERMAL_NO_TARGET) + pr_debug("Unexpected state %ld of thermal instance %s in bang-bang\n", + instance->target, instance->name); + + /* + * Enable the fan when the trip is crossed on the way up and disable it + * when the trip is crossed on the way down. + */ + instance->target = target; + + dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target); + + mutex_lock(&instance->cdev->lock); + __thermal_cdev_update(instance->cdev); + mutex_unlock(&instance->cdev->lock); +} + /** * bang_bang_control - controls devices associated with the given zone * @tz: thermal_zone_device @@ -54,25 +75,8 @@ static void bang_bang_control(struct thermal_zone_device *tz, tz->temperature, trip->hysteresis); list_for_each_entry(instance, &tz->thermal_instances, tz_node) { - if (instance->trip != trip) - continue; - - if (instance->target != 0 && instance->target != 1 && - instance->target != THERMAL_NO_TARGET) - pr_debug("Unexpected state %ld of thermal instance %s in bang-bang\n", - instance->target, instance->name); - - /* - * Enable the fan when the trip is crossed on the way up and - * disable it when the trip is crossed on the way down. 
- */ - instance->target = crossed_up; - - dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target); - - mutex_lock(&instance->cdev->lock); - __thermal_cdev_update(instance->cdev); - mutex_unlock(&instance->cdev->lock); + if (instance->trip == trip) + bang_bang_set_instance_target(instance, crossed_up); } } From 5f64b4a1ab1b0412446d42e1fc2964c2cdb60b27 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Aug 2024 16:27:33 +0200 Subject: [PATCH 0966/1052] thermal: gov_bang_bang: Add .manage() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After recent changes, the Bang-bang governor may not adjust the initial configuration of cooling devices to the actual situation. Namely, if a cooling device bound to a certain trip point starts in the "on" state and the thermal zone temperature is below the threshold of that trip point, the trip point may never be crossed on the way up in which case the state of the cooling device will never be adjusted because the thermal core will never invoke the governor's .trip_crossed() callback. [Note that there is no issue if the zone temperature is at the trip threshold or above it to start with because .trip_crossed() will be invoked then to indicate the start of thermal mitigation for the given trip.] To address this, add a .manage() callback to the Bang-bang governor and use it to ensure that all of the thermal instances managed by the governor have been initialized properly and the states of all of the cooling devices involved have been adjusted to the current zone temperature as appropriate. Fixes: 530c932bdf75 ("thermal: gov_bang_bang: Use .trip_crossed() instead of .throttle()") Link: https://lore.kernel.org/linux-pm/1bfbbae5-42b0-4c7d-9544-e98855715294@piie.net/ Cc: 6.10+ # 6.10+ Signed-off-by: Rafael J. Wysocki Acked-by: Peter Kästle Reviewed-by: Zhang Rui Link: https://patch.msgid.link/8419356.T7Z3S40VBb@rjwysocki.net --- drivers/thermal/gov_bang_bang.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c index 87cff3ea77a9..bc55e0698bfa 100644 --- a/drivers/thermal/gov_bang_bang.c +++ b/drivers/thermal/gov_bang_bang.c @@ -26,6 +26,7 @@ static void bang_bang_set_instance_target(struct thermal_instance *instance, * when the trip is crossed on the way down. */ instance->target = target; + instance->initialized = true; dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target); @@ -80,8 +81,37 @@ static void bang_bang_control(struct thermal_zone_device *tz, } } +static void bang_bang_manage(struct thermal_zone_device *tz) +{ + const struct thermal_trip_desc *td; + struct thermal_instance *instance; + + for_each_trip_desc(tz, td) { + const struct thermal_trip *trip = &td->trip; + + if (tz->temperature >= td->threshold || + trip->temperature == THERMAL_TEMP_INVALID || + trip->type == THERMAL_TRIP_CRITICAL || + trip->type == THERMAL_TRIP_HOT) + continue; + + /* + * If the initial cooling device state is "on", but the zone + * temperature is not above the trip point, the core will not + * call bang_bang_control() until the zone temperature reaches + * the trip point temperature which may be never. In those + * cases, set the initial state of the cooling device to 0. 
+ */ + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + if (!instance->initialized && instance->trip == trip) + bang_bang_set_instance_target(instance, 0); + } + } +} + static struct thermal_governor thermal_gov_bang_bang = { .name = "bang_bang", .trip_crossed = bang_bang_control, + .manage = bang_bang_manage, }; THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang); From 6e6f58a170ea98e44075b761f2da42a5aec47dfb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Aug 2024 16:29:11 +0200 Subject: [PATCH 0967/1052] thermal: gov_bang_bang: Use governor_data to reduce overhead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After running once, the for_each_trip_desc() loop in bang_bang_manage() is pure needless overhead because it is not going to make any changes unless a new cooling device has been bound to one of the trips in the thermal zone or the system is resuming from sleep. For this reason, make bang_bang_manage() set governor_data for the thermal zone and check it upfront to decide whether or not it needs to do anything. However, governor_data needs to be reset in some cases to let bang_bang_manage() know that it should walk the trips again, so add an .update_tz() callback to the governor and make the core additionally invoke it during system resume. To avoid affecting the other users of that callback unnecessarily, add a special notification reason for system resume, THERMAL_TZ_RESUME, and also pass it to __thermal_zone_device_update() called during system resume for consistency. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Kästle Reviewed-by: Zhang Rui Cc: 6.10+ # 6.10+ Link: https://patch.msgid.link/2285575.iZASKD2KPV@rjwysocki.net --- drivers/thermal/gov_bang_bang.c | 18 ++++++++++++++++++ drivers/thermal/thermal_core.c | 3 ++- include/linux/thermal.h | 1 + 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c index bc55e0698bfa..daed67d19efb 100644 --- a/drivers/thermal/gov_bang_bang.c +++ b/drivers/thermal/gov_bang_bang.c @@ -86,6 +86,10 @@ static void bang_bang_manage(struct thermal_zone_device *tz) const struct thermal_trip_desc *td; struct thermal_instance *instance; + /* If the code below has run already, nothing needs to be done. */ + if (tz->governor_data) + return; + for_each_trip_desc(tz, td) { const struct thermal_trip *trip = &td->trip; @@ -107,11 +111,25 @@ static void bang_bang_manage(struct thermal_zone_device *tz) bang_bang_set_instance_target(instance, 0); } } + + tz->governor_data = (void *)true; +} + +static void bang_bang_update_tz(struct thermal_zone_device *tz, + enum thermal_notify_event reason) +{ + /* + * Let bang_bang_manage() know that it needs to walk trips after binding + * a new cdev and after system resume. 
+ */ + if (reason == THERMAL_TZ_BIND_CDEV || reason == THERMAL_TZ_RESUME) + tz->governor_data = NULL; } static struct thermal_governor thermal_gov_bang_bang = { .name = "bang_bang", .trip_crossed = bang_bang_control, .manage = bang_bang_manage, + .update_tz = bang_bang_update_tz, }; THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang); diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 95c399f94744..e6669aeda1ff 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1728,7 +1728,8 @@ static void thermal_zone_device_resume(struct work_struct *work) thermal_debug_tz_resume(tz); thermal_zone_device_init(tz); - __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); + thermal_governor_update_tz(tz, THERMAL_TZ_RESUME); + __thermal_zone_device_update(tz, THERMAL_TZ_RESUME); complete(&tz->resume); tz->resuming = false; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 25fbf960b474..b86ddca46b9e 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -55,6 +55,7 @@ enum thermal_notify_event { THERMAL_TZ_BIND_CDEV, /* Cooling dev is bind to the thermal zone */ THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */ THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */ + THERMAL_TZ_RESUME, /* Thermal zone is resuming after system sleep */ }; /** From c2f6e16a6771eaefba6bb35f6803fe7217822d41 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Aug 2024 13:02:55 -0400 Subject: [PATCH 0968/1052] bcachefs: Increase size of cuckoo hash table on too many rehashes Also, improve the calculation of the new table size, so that it can shrink when needed. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets_waiting_for_journal.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index ec1b636ef78d..f70eb2127d32 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -93,7 +93,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, .dev_bucket = (u64) dev << 56 | bucket, .journal_seq = journal_seq, }; - size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; int ret = 0; mutex_lock(&b->lock); @@ -106,7 +106,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, for (i = 0; i < size; i++) nr_elements += t->d[i].journal_seq > flushed_seq; - new_bits = t->bits + (nr_elements * 3 > size); + new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { @@ -115,7 +115,14 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, } retry_rehash: + if (nr_rehashes_this_size == 3) { + new_bits++; + nr_rehashes_this_size = 0; + } + nr_rehashes++; + nr_rehashes_this_size++; + bucket_table_init(n, new_bits); tmp = new; From 075cabf324c3fd790d6ba39ff9db33a30b954fe2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:31:29 -0400 Subject: [PATCH 0969/1052] bcachefs: Fix forgetting to pass trans to fsck_err() Reported-by: syzbot+e3938cd6d761b78750e6@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 
b69ef4b3de6e..ad517ef744e5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -925,7 +925,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", From 9482f3b05332a624508a91c2ab2cf3527328a6a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:41:46 -0400 Subject: [PATCH 0970/1052] bcachefs: avoid overflowing LRU_TIME_BITS for cached data lru Reported-by: syzbot+510b0b28f8e6de64d307@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 260e7fa83d05..fd790b03fbe1 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -150,7 +150,9 @@ static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_typ static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { - return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; + return a.data_type == BCH_DATA_cached + ? a.io_time[READ] & LRU_TIME_MAX + : 0; } #define DATA_TYPES_MOVABLE \ From 99c87fe0f584f8d778a323141504d1ba5c89a4a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:44:49 -0400 Subject: [PATCH 0971/1052] bcachefs: fix incorrect i_state usage Reported-by: syzbot+95e40eae71609e40d851@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 15fc41e63b6c..94c392abef65 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -193,7 +193,7 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino * only insert fully created inodes in the inode hash table. But * discard_new_inode() expects it to be set... */ - inode->v.i_flags |= I_NEW; + inode->v.i_state |= I_NEW; /* * We don't want bch2_evict_inode() to delete the inode on disk, * we just raced and had another inode in cache. Normally new From 1fc2ac428ef7d2ab9e8e19efe7ec3e58aea51bf3 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 16 Aug 2024 12:15:23 -0600 Subject: [PATCH 0972/1052] io_uring: fix user_data field name in comment io_uring_cqe's user_data field refers to `sqe->data`, but io_uring_sqe does not have a data field. Fix the comment to say `sqe->user_data`. 
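Since the corrected comment documents the user-visible contract, a short userspace illustration may help; this is a hedged example against liburing (not part of the patch) showing that the value stored in sqe->user_data at submission is what comes back in cqe->user_data on completion:

#include <liburing.h>
#include <assert.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(8, &ring, 0) < 0)
                return 1;

        sqe = io_uring_get_sqe(&ring);
        if (!sqe)
                return 1;
        io_uring_prep_nop(sqe);
        sqe->user_data = 0x1234;                /* set at submission time...  */

        io_uring_submit(&ring);
        io_uring_wait_cqe(&ring, &cqe);
        assert(cqe->user_data == 0x1234);       /* ...passed back verbatim    */

        io_uring_cqe_seen(&ring, cqe);
        io_uring_queue_exit(&ring);
        return 0;
}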
Signed-off-by: Caleb Sander Mateos Link: https://github.com/axboe/liburing/pull/1206 Link: https://lore.kernel.org/r/20240816181526.3642732-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2aaf7ee256ac..adc2524fd8e3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -421,7 +421,7 @@ enum io_uring_msg_ring_flags { * IO completion data structure (Completion Queue Entry) */ struct io_uring_cqe { - __u64 user_data; /* sqe->data submission passed back */ + __u64 user_data; /* sqe->user_data value passed back */ __s32 res; /* result code for this event */ __u32 flags; From 534f7eff9239c1b0af852fc33f5af2b62c00eddf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Aug 2024 10:40:38 +0930 Subject: [PATCH 0973/1052] btrfs: only enable extent map shrinker for DEBUG builds Although there are several patches improving the extent map shrinker, there are still reports of too frequent shrinker behavior, taking too much CPU for the kswapd process. So let's only enable extent shrinker for now, until we got more comprehensive understanding and a better solution. Link: https://lore.kernel.org/linux-btrfs/3df4acd616a07ef4d2dc6bad668701504b412ffc.camel@intelfx.name/ Link: https://lore.kernel.org/linux-btrfs/c30fd6b3-ca7a-4759-8a53-d42878bf84f7@gmail.com/ Fixes: 956a17d9d050 ("btrfs: add a shrinker for extent maps") CC: stable@vger.kernel.org # 6.10+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 11044e9e2cb1..98fa0f382480 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2402,7 +2402,13 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); - return nr; + /* + * Only report the real number for DEBUG builds, as there are reports of + * serious performance degradation caused by too frequent shrinks. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) + return nr; + return 0; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) From 0e49d3ff12501adaafaf6fdb19699f021d1eda1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 May 2024 23:48:58 -0400 Subject: [PATCH 0974/1052] bcachefs: Fix locking in __bch2_trans_mark_dev_sb() We run this in full RW mode now, so we have to guard against the superblock buffer being reallocated. 
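In outline, and as a hedged reading of the diff that follows rather than additional bcachefs API (the helper name mark_range() is invented), the guard works by snapshotting the small superblock layout under sb_lock and then walking only the private copy, so the walk never touches the buffer behind ca->disk_sb.sb after the lock is dropped:

/* sketch; mark_range() stands in for bch2_trans_mark_metadata_sectors() */
static int example_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
        mutex_lock(&c->sb_lock);
        struct bch_sb_layout layout = ca->disk_sb.sb->layout;   /* snapshot */
        mutex_unlock(&c->sb_lock);

        for (unsigned i = 0; i < layout.nr_superblocks; i++) {
                int ret = mark_range(c, ca, le64_to_cpu(layout.sb_offset[i]));
                if (ret)
                        return ret;
        }
        return 0;
}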
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 5 +---- fs/bcachefs/buckets.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6cbf2aa6a947..eb3002c4eae7 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -741,12 +741,9 @@ static int bch2_gc_btrees(struct bch_fs *c) static int bch2_mark_superblocks(struct bch_fs *c) { - mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_sb)); - int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); - mutex_unlock(&c->sb_lock); - return ret; + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); } static void bch2_gc_free(struct bch_fs *c) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ad517ef744e5..be2bbd248631 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -915,7 +915,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, enum bch_data_type type, unsigned sectors) { - struct bch_fs *c = trans->c; struct btree_iter iter; int ret = 0; @@ -1046,13 +1045,18 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, enum btree_iter_update_trigger_flags flags) { - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + struct bch_fs *c = trans->c; + + mutex_lock(&c->sb_lock); + struct bch_sb_layout layout = ca->disk_sb.sb->layout; + mutex_unlock(&c->sb_lock); + u64 bucket = 0; unsigned i, bucket_sectors = 0; int ret; - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); + for (i = 0; i < layout.nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout.sb_offset[i]); if (offset == BCH_SB_SECTOR) { ret = bch2_trans_mark_metadata_sectors(trans, ca, @@ -1063,7 +1067,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *c } ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, - offset + (1 << layout->sb_max_size_bits), + offset + (1 << layout.sb_max_size_bits), BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; From 3c0da3d163eb32f1f91891efaade027fa9b245b9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 6 Aug 2024 21:51:42 +0200 Subject: [PATCH 0975/1052] fuse: Initialize beyond-EOF page contents before setting uptodate fuse_notify_store(), unlike fuse_do_readpage(), does not enable page zeroing (because it can be used to change partial page contents). So fuse_notify_store() must be more careful to fully initialize page contents (including parts of the page that are beyond end-of-file) before marking the page uptodate. The current code can leave beyond-EOF page contents uninitialized, which makes these uninitialized page contents visible to userspace via mmap(). This is an information leak, but only affects systems which do not enable init-on-alloc (via CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y or the corresponding kernel command line parameter). 
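The invariant being enforced can be stated as a hedged fragment (variable and function names here are illustrative, not fuse code): a page may be marked uptodate only once every byte in it is defined, so a partial copy must zero the tail first.

static void finish_partial_page(struct page *page, unsigned int copied)
{
        /* only the first `copied` bytes came from the request; the rest,
         * including anything beyond EOF, must not remain uninitialized or
         * it becomes readable from userspace via mmap()
         */
        if (copied < PAGE_SIZE)
                zero_user_segment(page, copied, PAGE_SIZE);
        SetPageUptodate(page);
}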
Link: https://bugs.chromium.org/p/project-zero/issues/detail?id=2574 Cc: stable@kernel.org Fixes: a1d75f258230 ("fuse: add store request") Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9eb191b5c4de..7146038b2fe7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1618,9 +1618,11 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, this_num = min_t(unsigned, num, PAGE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && - (this_num == PAGE_SIZE || file_size == end)) + if (!PageUptodate(page) && !err && offset == 0 && + (this_num == PAGE_SIZE || file_size == end)) { + zero_user_segment(page, this_num, PAGE_SIZE); SetPageUptodate(page); + } unlock_page(page); put_page(page); From 47ac09b91befbb6a235ab620c32af719f8208399 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 18 Aug 2024 13:17:27 -0700 Subject: [PATCH 0976/1052] Linux 6.11-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5f417808213f..68ebd6d6b444 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* From dfd046d0ced19b6ff5f11ec4ceab0a83de924771 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 15 Aug 2024 08:56:35 +0900 Subject: [PATCH 0977/1052] ksmbd: Use unsafe_memcpy() for ntlm_negotiate rsp buffer is allocated larger than spnego_blob from smb2_allocate_rsp_buf(). Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 2df1354288e6..3f4c56a10a86 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1370,7 +1370,8 @@ static int ntlm_negotiate(struct ksmbd_work *work, } sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); out: @@ -1453,7 +1454,9 @@ static int ntlm_authenticate(struct ksmbd_work *work, return -ENOMEM; sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, + spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); kfree(spnego_blob); } From 76e98a158b207771a6c9a0de0a60522a446a3447 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 17 Aug 2024 14:03:49 +0900 Subject: [PATCH 0978/1052] ksmbd: fix race condition between destroy_previous_session() and smb2 operations() If there is a ->PreviousSessionId field in the session setup request, the session of the previous connection should be destroyed. During this, if smb2 operation requests in the previous session are still being processed, a race with ksmbd_destroy_file_table() could occur. This patch sets conn->status to KSMBD_SESS_NEED_RECONNECT to block incoming operations and waits until on-going operations are complete (i.e. idle) before destroying the previous session.
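Condensed from the diff that follows (a hedged summary with error handling and locking context trimmed), the destroy path now quiesces the previous session before touching its file table:

        ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT); /* block new ops */
        if (ksmbd_conn_wait_idle_sess_id(conn, id)) {
                /* in-flight requests did not drain in time: restore and bail out */
                ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);
                goto out;
        }
        ksmbd_destroy_file_table(&prev_sess->file_table);          /* now race-free */
        prev_sess->state = SMB2_SESSION_EXPIRED;
        ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);  /* unblock       */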
Fixes: c8efcc786146 ("ksmbd: add support for durable handles v1/v2") Cc: stable@vger.kernel.org # v6.6+ Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-25040 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 34 ++++++++++++++++++++++++++++++- fs/smb/server/connection.h | 3 ++- fs/smb/server/mgmt/user_session.c | 9 ++++++++ fs/smb/server/smb2pdu.c | 2 +- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 09e1e7771592..7889df8112b4 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -165,11 +165,43 @@ void ksmbd_all_conn_set_status(u64 sess_id, u32 status) up_read(&conn_list_lock); } -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id) +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn) { wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2); } +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id) +{ + struct ksmbd_conn *conn; + int rc, retry_count = 0, max_timeout = 120; + int rcount = 1; + +retry_idle: + if (retry_count >= max_timeout) + return -EIO; + + down_read(&conn_list_lock); + list_for_each_entry(conn, &conn_list, conns_list) { + if (conn->binding || xa_load(&conn->sessions, sess_id)) { + if (conn == curr_conn) + rcount = 2; + if (atomic_read(&conn->req_running) >= rcount) { + rc = wait_event_timeout(conn->req_running_q, + atomic_read(&conn->req_running) < rcount, + HZ); + if (!rc) { + up_read(&conn_list_lock); + retry_count++; + goto retry_idle; + } + } + } + } + up_read(&conn_list_lock); + + return 0; +} + int ksmbd_conn_write(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 5c2845e47cf2..5b947175c048 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -145,7 +145,8 @@ extern struct list_head conn_list; extern struct rw_semaphore conn_list_lock; bool ksmbd_conn_alive(struct ksmbd_conn *conn); -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id); +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id); struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 162a12685d2c..99416ce9f501 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -311,6 +311,7 @@ void destroy_previous_session(struct ksmbd_conn *conn, { struct ksmbd_session *prev_sess; struct ksmbd_user *prev_user; + int err; down_write(&sessions_table_lock); down_write(&conn->session_lock); @@ -325,8 +326,16 @@ void destroy_previous_session(struct ksmbd_conn *conn, memcmp(user->passkey, prev_user->passkey, user->passkey_sz)) goto out; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT); + err = ksmbd_conn_wait_idle_sess_id(conn, id); + if (err) { + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); + goto out; + } + ksmbd_destroy_file_table(&prev_sess->file_table); prev_sess->state = SMB2_SESSION_EXPIRED; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); ksmbd_launch_ksmbd_durable_scavenger(); out: up_write(&conn->session_lock); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 3f4c56a10a86..cb7f487c96af 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2213,7 +2213,7 
@@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_conn_unlock(conn); ksmbd_close_session_fds(work); - ksmbd_conn_wait_idle(conn, sess_id); + ksmbd_conn_wait_idle(conn); /* * Re-lookup session to validate if session is deleted From 4fdd8664c8a94411a01d11d5ed2f083f105f570a Mon Sep 17 00:00:00 2001 From: Victor Timofei Date: Fri, 16 Aug 2024 22:24:52 +0300 Subject: [PATCH 0979/1052] ksmbd: fix spelling mistakes in documentation There are a couple of spelling mistakes in the documentation. This patch fixes them. Signed-off-by: Victor Timofei Acked-by: Namjae Jeon Signed-off-by: Steve French --- Documentation/filesystems/smb/ksmbd.rst | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/filesystems/smb/ksmbd.rst b/Documentation/filesystems/smb/ksmbd.rst index 6b30e43a0d11..67cb68ea6e68 100644 --- a/Documentation/filesystems/smb/ksmbd.rst +++ b/Documentation/filesystems/smb/ksmbd.rst @@ -13,7 +13,7 @@ KSMBD architecture The subset of performance related operations belong in kernelspace and the other subset which belong to operations which are not really related with performance in userspace. So, DCE/RPC management that has historically resulted -into number of buffer overflow issues and dangerous security bugs and user +into a number of buffer overflow issues and dangerous security bugs and user account management are implemented in user space as ksmbd.mountd. File operations that are related with performance (open/read/write/close etc.) in kernel space (ksmbd). This also allows for easier integration with VFS @@ -24,8 +24,8 @@ ksmbd (kernel daemon) When the server daemon is started, It starts up a forker thread (ksmbd/interface name) at initialization time and open a dedicated port 445 -for listening to SMB requests. Whenever new clients make request, Forker -thread will accept the client connection and fork a new thread for dedicated +for listening to SMB requests. Whenever new clients make a request, the Forker +thread will accept the client connection and fork a new thread for a dedicated communication channel between the client and the server. It allows for parallel processing of SMB requests(commands) from clients as well as allowing for new clients to make new connections. Each instance is named ksmbd/1~n(port number) @@ -34,12 +34,12 @@ thread can decide to pass through the commands to the user space (ksmbd.mountd), currently DCE/RPC commands are identified to be handled through the user space. To further utilize the linux kernel, it has been chosen to process the commands as workitems and to be executed in the handlers of the ksmbd-io kworker threads. -It allows for multiplexing of the handlers as the kernel take care of initiating +It allows for multiplexing of the handlers as the kernel takes care of initiating extra worker threads if the load is increased and vice versa, if the load is -decreased it destroys the extra worker threads. So, after connection is -established with client. Dedicated ksmbd/1..n(port number) takes complete +decreased it destroys the extra worker threads. So, after the connection is +established with the client. Dedicated ksmbd/1..n(port number) takes complete ownership of receiving/parsing of SMB commands. Each received command is worked -in parallel i.e., There can be multiple clients commands which are worked in +in parallel i.e., there can be multiple client commands which are worked in parallel. 
After receiving each command a separated kernel workitem is prepared for each command which is further queued to be handled by ksmbd-io kworkers. So, each SMB workitem is queued to the kworkers. This allows the benefit of load @@ -49,9 +49,9 @@ performance by handling client commands in parallel. ksmbd.mountd (user space daemon) -------------------------------- -ksmbd.mountd is userspace process to, transfer user account and password that +ksmbd.mountd is a userspace process to, transfer the user account and password that are registered using ksmbd.adduser (part of utils for user space). Further it -allows sharing information parameters that parsed from smb.conf to ksmbd in +allows sharing information parameters that are parsed from smb.conf to ksmbd in kernel. For the execution part it has a daemon which is continuously running and connected to the kernel interface using netlink socket, it waits for the requests (dcerpc and share/user info). It handles RPC calls (at a minimum few @@ -124,7 +124,7 @@ How to run 1. Download ksmbd-tools(https://github.com/cifsd-team/ksmbd-tools/releases) and compile them. - - Refer README(https://github.com/cifsd-team/ksmbd-tools/blob/master/README.md) + - Refer to README(https://github.com/cifsd-team/ksmbd-tools/blob/master/README.md) to know how to use ksmbd.mountd/adduser/addshare/control utils $ ./autogen.sh @@ -133,7 +133,7 @@ How to run 2. Create /usr/local/etc/ksmbd/ksmbd.conf file, add SMB share in ksmbd.conf file. - - Refer ksmbd.conf.example in ksmbd-utils, See ksmbd.conf manpage + - Refer to ksmbd.conf.example in ksmbd-utils, See ksmbd.conf manpage for details to configure shares. $ man ksmbd.conf @@ -145,7 +145,7 @@ How to run $ man ksmbd.adduser $ sudo ksmbd.adduser -a -4. Insert ksmbd.ko module after build your kernel. No need to load module +4. Insert the ksmbd.ko module after you build your kernel. No need to load the module if ksmbd is built into the kernel. - Set ksmbd in menuconfig(e.g. $ make menuconfig) @@ -175,7 +175,7 @@ Each layer 1. Enable all component prints # sudo ksmbd.control -d "all" -2. Enable one of components (smb, auth, vfs, oplock, ipc, conn, rdma) +2. Enable one of the components (smb, auth, vfs, oplock, ipc, conn, rdma) # sudo ksmbd.control -d "smb" 3. Show what prints are enabled. From 7c525dddbee71880e654ad44f3917787a4f6042c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 16 Aug 2024 19:33:39 +0200 Subject: [PATCH 0980/1052] ksmbd: Replace one-element arrays with flexible-array members Replace the deprecated one-element arrays with flexible-array members in the structs filesystem_attribute_info and filesystem_device_info. There are no binary differences after this conversion. Link: https://github.com/KSPP/linux/issues/79 Signed-off-by: Thorsten Blum Reviewed-by: Gustavo A. R. 
Silva Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 4 ++-- fs/smb/server/smb_common.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index cb7f487c96af..0bc9edf22ba4 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5360,7 +5360,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, "NTFS", PATH_MAX, conn->local_nls, 0); len = len * 2; info->FileSystemNameLen = cpu_to_le32(len); - sz = sizeof(struct filesystem_attribute_info) - 2 + len; + sz = sizeof(struct filesystem_attribute_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } @@ -5386,7 +5386,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, len = len * 2; info->VolumeLabelSize = cpu_to_le32(len); info->Reserved = 0; - sz = sizeof(struct filesystem_vol_info) - 2 + len; + sz = sizeof(struct filesystem_vol_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index 4a3148b0167f..cc1d6dfe29d5 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -213,7 +213,7 @@ struct filesystem_attribute_info { __le32 Attributes; __le32 MaxPathNameComponentLength; __le32 FileSystemNameLen; - __le16 FileSystemName[1]; /* do not have to save this - get subset? */ + __le16 FileSystemName[]; /* do not have to save this - get subset? */ } __packed; struct filesystem_device_info { @@ -226,7 +226,7 @@ struct filesystem_vol_info { __le32 SerialNumber; __le32 VolumeLabelSize; __le16 Reserved; - __le16 VolumeLabel[1]; + __le16 VolumeLabel[]; } __packed; struct filesystem_info { From 5b5c96c63d5b6e91c622611e04b2b156bbae53f5 Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Thu, 1 Aug 2024 19:26:22 +0800 Subject: [PATCH 0981/1052] erofs: simplify readdir operation - Use i_size instead of i_size_read() due to immutable fses; - Get rid of an unneeded goto since erofs_fill_dentries() also works; - Remove unnecessary lines. Signed-off-by: Hongzhen Luo Link: https://lore.kernel.org/r/20240801112622.2164029-1-hongzhen@linux.alibaba.com Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/dir.c | 35 ++++++++++++----------------------- fs/erofs/internal.h | 2 +- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2193a6710c8f..c3b90abdee37 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -8,19 +8,15 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, - unsigned int nameoff, unsigned int maxsize) + unsigned int nameoff0, unsigned int maxsize) { - const struct erofs_dirent *end = dentry_blk + nameoff; + const struct erofs_dirent *end = dentry_blk + nameoff0; while (de < end) { - const char *de_name; + unsigned char d_type = fs_ftype_to_dtype(de->file_type); + unsigned int nameoff = le16_to_cpu(de->nameoff); + const char *de_name = (char *)dentry_blk + nameoff; unsigned int de_namelen; - unsigned char d_type; - - d_type = fs_ftype_to_dtype(de->file_type); - - nameoff = le16_to_cpu(de->nameoff); - de_name = (char *)dentry_blk + nameoff; /* the last dirent in the block? 
*/ if (de + 1 >= end) @@ -52,21 +48,20 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = dir->i_sb; unsigned long bsz = sb->s_blocksize; - const size_t dirsize = i_size_read(dir); - unsigned int i = erofs_blknr(sb, ctx->pos); unsigned int ofs = erofs_blkoff(sb, ctx->pos); int err = 0; bool initial = true; buf.mapping = dir->i_mapping; - while (ctx->pos < dirsize) { + while (ctx->pos < dir->i_size) { + erofs_off_t dbstart = ctx->pos - ofs; struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP); + de = erofs_bread(&buf, dbstart, EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", - i, EROFS_I(dir)->nid); + erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; } @@ -79,25 +74,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) break; } - maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz); - + maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz); /* search dirents at the arbitrary position */ if (initial) { initial = false; - ofs = roundup(ofs, sizeof(struct erofs_dirent)); - ctx->pos = erofs_pos(sb, i) + ofs; - if (ofs >= nameoff) - goto skip_this; + ctx->pos = dbstart + ofs; } err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); if (err) break; -skip_this: - ctx->pos = erofs_pos(sb, i) + maxsize; - ++i; + ctx->pos = dbstart + maxsize; ofs = 0; } erofs_put_metabuf(&buf); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 736607675396..45dc15ebd870 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -220,7 +220,7 @@ struct erofs_buf { }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) +#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits)) #define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) From 2c534624ae70100aeea0b5800b0f3768b2fd3cf0 Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Tue, 6 Aug 2024 19:22:08 +0800 Subject: [PATCH 0982/1052] erofs: get rid of check_layout_compatibility() Simple enough to just open-code it. 
Signed-off-by: Hongzhen Luo Reviewed-by: Sandeep Dhavale Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20240806112208.150323-1-hongzhen@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 32ce5b35e1df..6cb5c8916174 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -108,22 +108,6 @@ static void erofs_free_inode(struct inode *inode) kmem_cache_free(erofs_inode_cachep, vi); } -static bool check_layout_compatibility(struct super_block *sb, - struct erofs_super_block *dsb) -{ - const unsigned int feature = le32_to_cpu(dsb->feature_incompat); - - EROFS_SB(sb)->feature_incompat = feature; - - /* check if current kernel meets all mandatory requirements */ - if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", - feature & ~EROFS_ALL_FEATURE_INCOMPAT); - return false; - } - return true; -} - /* read variable-sized metadata, offset will be aligned by 4-byte */ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) @@ -279,7 +263,7 @@ static int erofs_scan_devices(struct super_block *sb, static int erofs_read_superblock(struct super_block *sb) { - struct erofs_sb_info *sbi; + struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; void *data; @@ -291,9 +275,7 @@ static int erofs_read_superblock(struct super_block *sb) return PTR_ERR(data); } - sbi = EROFS_SB(sb); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); - ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { erofs_err(sb, "cannot find valid erofs superblock"); @@ -318,8 +300,12 @@ static int erofs_read_superblock(struct super_block *sb) } ret = -EINVAL; - if (!check_layout_compatibility(sb, dsb)) + sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat); + if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) { + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", + sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT); goto out; + } sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { From e080a26725fb36f535f22ea42694c60ab005fb2e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 19 Aug 2024 10:52:07 +0800 Subject: [PATCH 0983/1052] erofs: allow large folios for compressed files As commit 2e6506e1c4ee ("mm/migrate: fix deadlock in migrate_pages_batch() on large folios") has landed upstream, large folios can be safely enabled for compressed inodes since all prerequisites have already landed in 6.11-rc1. Stress tests has been running on my fleet for over 20 days without any regression. Additionally, users [1] have requested it for months. Let's allow large folios for EROFS full cases upstream now for wider testing. [1] https://lore.kernel.org/r/CAGsJ_4wtE8OcpinuqVwG4jtdx6Qh5f+TON6wz+4HMCq=A2qFcA@mail.gmail.com Cc: Barry Song <21cnbao@gmail.com> Cc: Matthew Wilcox (Oracle) [ Gao Xiang: minor commit typo fixes. 
] Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240819025207.3808649-1-hsiangkao@linux.alibaba.com --- Documentation/filesystems/erofs.rst | 2 +- fs/erofs/inode.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index cc4626d6ee4f..c293f8e37468 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -75,7 +75,7 @@ Here are the main features of EROFS: - Support merging tail-end data into a special inode as fragments. - - Support large folios for uncompressed files. + - Support large folios to make use of THPs (Transparent Hugepages); - Support direct I/O on uncompressed files to avoid double caching for loop devices; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 43c09aae2afc..419432be3223 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -257,25 +257,23 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } + mapping_set_large_folios(inode->i_mapping); if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, erofs_info, inode->i_sb, "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); inode->i_mapping->a_ops = &z_erofs_aops; - err = 0; - goto out_unlock; -#endif +#else err = -EOPNOTSUPP; - goto out_unlock; - } - inode->i_mapping->a_ops = &erofs_raw_access_aops; - mapping_set_large_folios(inode->i_mapping); -#ifdef CONFIG_EROFS_FS_ONDEMAND - if (erofs_is_fscache_mode(inode->i_sb)) - inode->i_mapping->a_ops = &erofs_fscache_access_aops; #endif - + } else { + inode->i_mapping->a_ops = &erofs_raw_access_aops; +#ifdef CONFIG_EROFS_FS_ONDEMAND + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif + } out_unlock: erofs_put_metabuf(&buf); return err; From cf1e515c9a40caa8bddb920970d3257bb01c1421 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 19 Aug 2024 20:00:07 +0800 Subject: [PATCH 0984/1052] iommufd/selftest: Make dirty_ops static The sparse tool complains as follows: drivers/iommu/iommufd/selftest.c:277:30: warning: symbol 'dirty_ops' was not declared. Should it be static? This symbol is not used outside of selftest.c, so marks it static. Fixes: 266ce58989ba ("iommufd/selftest: Test IOMMU_HWPT_ALLOC_DIRTY_TRACKING") Link: https://patch.msgid.link/r/20240819120007.3884868-1-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index f95e32e29133..222cfc11ebfd 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -273,7 +273,7 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, return 0; } -const struct iommu_dirty_ops dirty_ops = { +static const struct iommu_dirty_ops dirty_ops = { .set_dirty_tracking = mock_domain_set_dirty_tracking, .read_and_clear_dirty = mock_domain_read_and_clear_dirty, }; From 67666479edf1e2b732f4d0ac797885e859a78de4 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 19 Aug 2024 18:28:04 +0200 Subject: [PATCH 0985/1052] bpf: Enable generic kfuncs for BPF_CGROUP_* programs These kfuncs are enabled even in BPF_PROG_TYPE_TRACING, so they should be safe also in BPF_CGROUP_* programs. 
Since all BPF_CGROUP_* programs share the same hook, call register_btf_kfunc_id_set() only once. In enum btf_kfunc_hook, rename BTF_KFUNC_HOOK_CGROUP_SKB to a more generic BTF_KFUNC_HOOK_CGROUP, since it's used for all the cgroup related program types. Signed-off-by: Matteo Croce Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819162805.78235-2-technoboy85@gmail.com --- kernel/bpf/btf.c | 8 ++++++-- kernel/bpf/helpers.c | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bfb0d89ccc8b..b12db397303e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -212,7 +212,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_FMODRET, - BTF_KFUNC_HOOK_CGROUP_SKB, + BTF_KFUNC_HOOK_CGROUP, BTF_KFUNC_HOOK_SCHED_ACT, BTF_KFUNC_HOOK_SK_SKB, BTF_KFUNC_HOOK_SOCKET_FILTER, @@ -8307,8 +8307,12 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - return BTF_KFUNC_HOOK_CGROUP_SKB; + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; case BPF_PROG_TYPE_SK_SKB: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d02ae323996b..26b9649ab4ce 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3052,6 +3052,7 @@ static int __init kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); From 7f6287417baf57754f47687c6ea1a749a0686ab0 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 19 Aug 2024 18:28:05 +0200 Subject: [PATCH 0986/1052] bpf: Allow bpf_current_task_under_cgroup() with BPF_CGROUP_* The helper bpf_current_task_under_cgroup() currently is only allowed for tracing programs, allow its usage also in the BPF_CGROUP_* program types. Move the code from kernel/trace/bpf_trace.c to kernel/bpf/helpers.c, so it compiles also without CONFIG_BPF_EVENTS. 
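To make the new availability concrete, here is a hedged sketch of the sysctl-filtering use case this enables (described just below); program and map names are invented, while the helper, its cgroup-array argument, and its "returns 1 when current is within the stored cgroup" behaviour follow the helper body moved into helpers.c in this patch.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, __u32);
} own_cgroup SEC(".maps");      /* userspace stores its cgroup fd at slot 0 */

SEC("cgroup/sysctl")
int filter_sysctl_writes(struct bpf_sysctl *ctx)
{
        /* 1 == current task runs inside the cgroup stored at index 0 */
        if (bpf_current_task_under_cgroup(&own_cgroup, 0) == 1)
                return 1;       /* our own write: allow quietly */

        bpf_printk("sysctl access from outside the managed cgroup");
        return 1;               /* still allowed, merely observed */
}

char _license[] SEC("license") = "GPL";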
This will be used in systemd-networkd to monitor the sysctl writes, and filter it's own writes from others: https://github.com/systemd/systemd/pull/32212 Signed-off-by: Matteo Croce Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819162805.78235-3-technoboy85@gmail.com --- include/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 2 ++ kernel/bpf/helpers.c | 23 +++++++++++++++++++++++ kernel/trace/bpf_trace.c | 27 ++------------------------- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b9425e410bcb..f0192c173ed8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3206,6 +3206,7 @@ extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; +extern const struct bpf_func_proto bpf_current_task_under_cgroup_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8ba73042a239..e7113d700b87 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2581,6 +2581,8 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; default: return NULL; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 26b9649ab4ce..12e3aa40b180 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2458,6 +2458,29 @@ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, return ret; } +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return task_under_cgroup_hierarchy(current, cgrp); +} + +const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { + .func = bpf_current_task_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + /** * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. 
The cgroup1 hierarchy is identified by its diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d557bb11e0ff..b69a39316c0c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -797,29 +797,6 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = { .ret_btf_id = &bpf_task_pt_regs_ids[0], }; -BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) -{ - struct bpf_array *array = container_of(map, struct bpf_array, map); - struct cgroup *cgrp; - - if (unlikely(idx >= array->map.max_entries)) - return -E2BIG; - - cgrp = READ_ONCE(array->ptrs[idx]); - if (unlikely(!cgrp)) - return -EAGAIN; - - return task_under_cgroup_hierarchy(current, cgrp); -} - -static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { - .func = bpf_current_task_under_cgroup, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; - struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; @@ -1480,8 +1457,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_write_user: @@ -1510,6 +1485,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; From 2aa93695081d4bd62350b41d58684d802be4563f Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 19 Aug 2024 16:11:27 +0100 Subject: [PATCH 0987/1052] selftests/bpf: Disable strict aliasing for verifier_nocsr.c verfifier_nocsr.c fails to compile in GCC. The reason behind it was initially explained in commit 27a90b14b93d3b2e1efd10764e456af7e2a42991. "A few BPF selftests perform type punning and they may break strict aliasing rules, which are exploited by both GCC and clang by default while optimizing. This can lead to broken compiled programs." 
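To make the failure mode concrete, the kind of construct meant by "type punning"
looks roughly like this (a generic illustration, not the actual code in
verifier_nocsr.c):

    unsigned int float_bits(float f)
    {
            unsigned int *p = (unsigned int *)&f;

            /* *p accesses a float object through an incompatible type,
             * which violates the C strict aliasing rules; an optimizing
             * compiler may assume *p and f never alias and reorder or
             * drop accesses around it.
             */
            return *p;
    }

Building the file with -fno-strict-aliasing simply tells the compiler not to
exploit those rules, so such accesses are compiled as written.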
Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819151129.1366484-2-cupertino.miranda@oracle.com --- tools/testing/selftests/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4eceb491a8ae..7bca9a2b8bc0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -58,6 +58,7 @@ progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing progs/timer_crash.c-CFLAGS := -fno-strict-aliasing progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing +progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing ifneq ($(LLVM),) # Silence some warnings when compiled with clang From d9075ac631ce0c5c2c17aa6221282a7c4ba8fa70 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 19 Aug 2024 16:11:29 +0100 Subject: [PATCH 0988/1052] selftest/bpf: Adapt inline asm operand constraint for GCC support GCC errors when compiling tailcall_bpf2bpf_hierarchy2.c and tailcall_bpf2bpf_hierarchy3.c with the following error: progs/tailcall_bpf2bpf_hierarchy2.c: In function 'tailcall_bpf2bpf_hierarchy_2': progs/tailcall_bpf2bpf_hierarchy2.c:66:9: error: input operand constraint contains '+' 66 | asm volatile (""::"r+"(ret)); | ^~~ Changed implementation to make use of __sink macro that abstracts the desired behaviour. The proposed change seems valid for both GCC and CLANG. Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819151129.1366484-4-cupertino.miranda@oracle.com --- .../testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c | 4 ++-- .../testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c index 37604b0b97af..72fd0d577506 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -58,12 +58,12 @@ __retval(33) SEC("tc") int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) { - volatile int ret = 0; + int ret = 0; subprog_tail0(skb); subprog_tail1(skb); - asm volatile (""::"r+"(ret)); + __sink(ret); return (count1 << 16) | count0; } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c index 0cdbb781fcbc..a7fb91cb05b7 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -51,11 +51,11 @@ __retval(33) SEC("tc") int tailcall_bpf2bpf_hierarchy_3(struct __sk_buff *skb) { - volatile int ret = 0; + int ret = 0; bpf_tail_call_static(skb, &jmp_table0, 0); - asm volatile (""::"r+"(ret)); + __sink(ret); return ret; } From 46ee21e9f59205e54943dfe51b2dc8a9352ca37d Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 16 Aug 2024 09:36:26 -0700 Subject: [PATCH 0989/1052] platform/x86: ISST: Fix return value on last invalid resource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When only the last resource is invalid, tpmi_sst_dev_add() is returing error even if there are other valid resources before. This function should return error when there are no valid resources. 
Here tpmi_sst_dev_add() is returning "ret" variable. But this "ret" variable contains the failure status of last call to sst_main(), which failed for the invalid resource. But there may be other valid resources before the last entry. To address this, do not update "ret" variable for sst_main() return status. If there are no valid resources, it is already checked for by !inst below the loop and -ENODEV is returned. Fixes: 9d1d36268f3d ("platform/x86: ISST: Support partitioned systems") Signed-off-by: Srinivas Pandruvada Cc: stable@vger.kernel.org # 6.10+ Link: https://lore.kernel.org/r/20240816163626.415762-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c index 7fa360073f6e..404582307109 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c @@ -1549,8 +1549,7 @@ int tpmi_sst_dev_add(struct auxiliary_device *auxdev) goto unlock_free; } - ret = sst_main(auxdev, &pd_info[i]); - if (ret) { + if (sst_main(auxdev, &pd_info[i])) { /* * This entry is not valid, hardware can partially * populate dies. In this case MMIO will have 0xFFs. From 0005e01e1e875c5e27130c5e2ed0189749d1e08a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 20 Aug 2024 16:56:19 +0800 Subject: [PATCH 0990/1052] erofs: fix out-of-bound access when z_erofs_gbuf_growsize() partially fails If z_erofs_gbuf_growsize() partially fails on a global buffer due to memory allocation failure or fault injection (as reported by syzbot [1]), new pages need to be freed by comparing to the existing pages to avoid memory leaks. However, the old gbuf->pages[] array may not be large enough, which can lead to null-ptr-deref or out-of-bound access. Fix this by checking against gbuf->nrpages in advance. [1] https://lore.kernel.org/r/000000000000f7b96e062018c6e3@google.com Reported-by: syzbot+242ee56aaa9585553766@syzkaller.appspotmail.com Fixes: d6db47e571dc ("erofs: do not use pagepool in z_erofs_gbuf_growsize()") Cc: # 6.10+ Reviewed-by: Chunhai Guo Reviewed-by: Sandeep Dhavale Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240820085619.1375963-1-hsiangkao@linux.alibaba.com --- fs/erofs/zutil.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 9b53883e5caf..37afe2024840 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -111,7 +111,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) out: if (i < z_erofs_gbuf_count && tmp_pages) { for (j = 0; j < nrpages; ++j) - if (tmp_pages[j] && tmp_pages[j] != gbuf->pages[j]) + if (tmp_pages[j] && (j >= gbuf->nrpages || + tmp_pages[j] != gbuf->pages[j])) __free_page(tmp_pages[j]); kfree(tmp_pages); } From 496ddd19a0fad22f250fc7a7b7a8000155418934 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 8 Aug 2024 16:22:28 -0700 Subject: [PATCH 0991/1052] bpf: extract iterator argument type and name validation logic Verifier enforces that all iterator structs are named `bpf_iter_` and that whenever iterator is passed to a kfunc it's passed as a valid PTR -> STRUCT chain (with potentially const modifiers in between). We'll need this check for upcoming changes, so instead of duplicating the logic, extract it into a helper function. 
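For reference, the convention being checked can be illustrated with the existing
in-tree numbers iterator (declarations paraphrased):

    /* iterator state: the type name must start with "bpf_iter_" and its
     * size must be a multiple of 8 so it fits into stack slots
     */
    struct bpf_iter_num {
            __u64 __opaque[1];
    } __attribute__((aligned(8)));

    int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end);
    int *bpf_iter_num_next(struct bpf_iter_num *it);
    void bpf_iter_num_destroy(struct bpf_iter_num *it);

In each kfunc the iterator state is passed as a pointer to the bpf_iter_num
struct (the PTR -> STRUCT chain mentioned above), which is exactly what the
extracted helper validates for an arbitrary argument index.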
Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240808232230.2848712-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ kernel/bpf/btf.c | 50 ++++++++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index cffb43133c68..b8a583194c4a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -580,6 +580,7 @@ bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); +int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -654,6 +655,10 @@ static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, { return false; } +static inline int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) +{ + return -EOPNOTSUPP; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b12db397303e..c9338fb397fc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8047,15 +8047,44 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE +/* Validate well-formedness of iter argument type. + * On success, return positive BTF ID of iter state's STRUCT type. + * On error, negative error is returned. + */ +int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) +{ + const struct btf_param *arg; + const struct btf_type *t; + const char *name; + int btf_id; + + if (btf_type_vlen(func) <= arg_idx) + return -EINVAL; + + arg = &btf_params(func)[arg_idx]; + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + t = btf_type_skip_modifiers(btf, t->type, &btf_id); + if (!t || !__btf_type_is_struct(t)) + return -EINVAL; + + name = btf_name_by_offset(btf, t->name_off); + if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) + return -EINVAL; + + return btf_id; +} + static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, const struct btf_type *func, u32 func_flags) { u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); - const char *name, *sfx, *iter_name; - const struct btf_param *arg; + const char *sfx, *iter_name; const struct btf_type *t; char exp_name[128]; u32 nr_args; + int btf_id; /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */ if (!flags || (flags & (flags - 1))) @@ -8066,28 +8095,21 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, if (nr_args < 1) return -EINVAL; - arg = &btf_params(func)[0]; - t = btf_type_skip_modifiers(btf, arg->type, NULL); - if (!t || !btf_type_is_ptr(t)) - return -EINVAL; - t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!t || !__btf_type_is_struct(t)) - return -EINVAL; - - name = btf_name_by_offset(btf, t->name_off); - if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) - return -EINVAL; + btf_id = btf_check_iter_arg(btf, func, 0); + if (btf_id < 0) + return btf_id; /* sizeof(struct bpf_iter_) should be a multiple of 8 to * fit nicely in stack slots */ + t = btf_type_by_id(btf, btf_id); if (t->size == 0 || (t->size % 8)) return -EINVAL; /* 
validate bpf_iter__{new,next,destroy}(struct bpf_iter_ *) * naming pattern */ - iter_name = name + sizeof(ITER_PREFIX) - 1; + iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1; if (flags & KF_ITER_NEW) sfx = "new"; else if (flags & KF_ITER_NEXT) From baebe9aaba1e59e34cd1fe6455bb4c3029ad3bc1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 8 Aug 2024 16:22:29 -0700 Subject: [PATCH 0992/1052] bpf: allow passing struct bpf_iter_ as kfunc arguments There are potentially useful cases where a specific iterator type might need to be passed into some kfunc. So, in addition to existing bpf_iter__{new,next,destroy}() kfuncs, allow to pass iterator pointer to any kfunc. We employ "__iter" naming suffix for arguments that are meant to accept iterators. We also enforce that they accept PTR -> STRUCT btf_iter_ type chain and point to a valid initialized on-the-stack iterator state. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240808232230.2848712-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df3be12096cf..920d7c5fe944 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7970,12 +7970,17 @@ static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_ITER_DESTROY; } -static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg) +static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, + const struct btf_param *arg) { /* btf_check_iter_kfuncs() guarantees that first argument of any iter * kfunc is iter state pointer */ - return arg == 0 && is_iter_kfunc(meta); + if (is_iter_kfunc(meta)) + return arg_idx == 0; + + /* iter passed as an argument to a generic kfunc */ + return btf_param_match_suffix(meta->btf, arg, "__iter"); } static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, @@ -7983,14 +7988,20 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; const struct btf_type *t; - const struct btf_param *arg; - int spi, err, i, nr_slots; - u32 btf_id; + int spi, err, i, nr_slots, btf_id; - /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */ - arg = &btf_params(meta->func_proto)[0]; - t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */ - t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */ + /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs() + * ensures struct convention, so we wouldn't need to do any BTF + * validation here. But given iter state can be passed as a parameter + * to any kfunc, if arg has "__iter" suffix, we need to be a bit more + * conservative here. 
+ */ + btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); + if (btf_id < 0) { + verbose(env, "expected valid iter pointer as arg #%d\n", regno); + return -EINVAL; + } + t = btf_type_by_id(meta->btf, btf_id); nr_slots = t->size / BPF_REG_SIZE; if (is_iter_new_kfunc(meta)) { @@ -8012,7 +8023,9 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id if (err) return err; } else { - /* iter_next() or iter_destroy() expect initialized iter state*/ + /* iter_next() or iter_destroy(), as well as any kfunc + * accepting iter argument, expect initialized iter state + */ err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots); switch (err) { case 0: @@ -11382,7 +11395,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_DYNPTR; - if (is_kfunc_arg_iter(meta, argno)) + if (is_kfunc_arg_iter(meta, argno, &args[argno])) return KF_ARG_PTR_TO_ITER; if (is_kfunc_arg_list_head(meta->btf, &args[argno])) From b0cd726f9a8279b33faa5b4b0011df14f331fb33 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 8 Aug 2024 16:22:30 -0700 Subject: [PATCH 0993/1052] selftests/bpf: test passing iterator to a kfunc Define BPF iterator "getter" kfunc, which accepts iterator pointer as one of the arguments. Make sure that argument passed doesn't have to be the very first argument (unlike new-next-destroy combo). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240808232230.2848712-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 16 ++++-- .../selftests/bpf/progs/iters_testmod_seq.c | 50 +++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 3687a40b61c6..c04b7dec2ab9 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -141,13 +141,12 @@ bpf_testmod_test_mod_kfunc(int i) __bpf_kfunc int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) { - if (cnt < 0) { - it->cnt = 0; + it->cnt = cnt; + + if (cnt < 0) return -EINVAL; - } it->value = value; - it->cnt = cnt; return 0; } @@ -162,6 +161,14 @@ __bpf_kfunc s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq* it) return &it->value; } +__bpf_kfunc s64 bpf_iter_testmod_seq_value(int val, struct bpf_iter_testmod_seq* it__iter) +{ + if (it__iter->cnt < 0) + return 0; + + return val + it__iter->value; +} + __bpf_kfunc void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) { it->cnt = 0; @@ -531,6 +538,7 @@ BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_testmod_seq_value) BTF_ID_FLAGS(func, bpf_kfunc_common_test) BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 3873fb6c292a..4a176e6aede8 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -12,6 +12,7 @@ struct bpf_iter_testmod_seq { extern int bpf_iter_testmod_seq_new(struct 
bpf_iter_testmod_seq *it, s64 value, int cnt) __ksym; extern s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq *it) __ksym; +extern s64 bpf_iter_testmod_seq_value(int blah, struct bpf_iter_testmod_seq *it) __ksym; extern void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) __ksym; const volatile __s64 exp_empty = 0 + 1; @@ -76,4 +77,53 @@ int testmod_seq_truncated(const void *ctx) return 0; } +SEC("?raw_tp") +__failure +__msg("expected an initialized iter_testmod_seq as arg #2") +int testmod_seq_getter_before_bad(const void *ctx) +{ + struct bpf_iter_testmod_seq it; + + return bpf_iter_testmod_seq_value(0, &it); +} + +SEC("?raw_tp") +__failure +__msg("expected an initialized iter_testmod_seq as arg #2") +int testmod_seq_getter_after_bad(const void *ctx) +{ + struct bpf_iter_testmod_seq it; + s64 sum = 0, *v; + + bpf_iter_testmod_seq_new(&it, 100, 100); + + while ((v = bpf_iter_testmod_seq_next(&it))) { + sum += *v; + } + + bpf_iter_testmod_seq_destroy(&it); + + return sum + bpf_iter_testmod_seq_value(0, &it); +} + +SEC("?socket") +__success __retval(1000000) +int testmod_seq_getter_good(const void *ctx) +{ + struct bpf_iter_testmod_seq it; + s64 sum = 0, *v; + + bpf_iter_testmod_seq_new(&it, 100, 100); + + while ((v = bpf_iter_testmod_seq_next(&it))) { + sum += *v; + } + + sum *= bpf_iter_testmod_seq_value(0, &it); + + bpf_iter_testmod_seq_destroy(&it); + + return sum; +} + char _license[] SEC("license") = "GPL"; From 7d41dad105b6667745f0984e386caf3289cef6db Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:49 -0700 Subject: [PATCH 0994/1052] selftests/bpf: less spam in the log for message matching When running test_loader based tests in the verbose mode each matched message leaves a trace in the stderr, e.g.: ./test_progs -vvv -t ... validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec This is not very helpful when debugging such tests and clobbers the log a lot. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 12b0c41e8d64..1b1290e090e7 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -531,7 +531,8 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, } } - if (!ASSERT_OK_PTR(match, "expect_msg")) { + if (!match) { + PRINT_FAIL("expect_msg\n"); if (env.verbosity == VERBOSE_NONE) emit_fn(log_buf, true /*force*/); for (j = 0; j <= i; j++) { From d0a29cdb6ef95d8a175e09ab2d1334271f047e60 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:50 -0700 Subject: [PATCH 0995/1052] selftests/bpf: correctly move 'log' upon successful match Suppose log="foo bar buz" and msg->substr="bar". In such case current match processing logic would update 'log' as follows: log += strlen(msg->substr); -> log += 3 -> log=" bar". However, the intent behind the 'log' update is to make it point after the successful match, e.g. to make log=" buz" in the example above. 
Fixes: 4ef5d6af4935 ("selftests/bpf: no need to track next_match_pos in struct test_loader") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 1b1290e090e7..f5f5d16ac550 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -522,7 +522,7 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, if (msg->substr) { match = strstr(log, msg->substr); if (match) - log += strlen(msg->substr); + log = match + strlen(msg->substr); } else { err = regexec(&msg->regex, log, 1, reg_match, 0); if (err == 0) { From f00bb757ed630affc951691ddaff206039cbb7ee Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:51 -0700 Subject: [PATCH 0996/1052] selftests/bpf: fix to avoid __msg tag de-duplication by clang __msg, __regex and __xlated tags are based on __attribute__((btf_decl_tag("..."))) annotations. Clang de-duplicates such annotations, e.g. the following two sequences of tags are identical in final BTF: /* seq A */ /* seq B */ __tag("foo") __tag("foo") __tag("bar") __tag("bar") __tag("foo") Fix this by adding a unique suffix for each tag using __COUNTER__ pre-processor macro. E.g. here is a new definition for __msg: #define __msg(msg) \ __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) Using this definition the "seq A" from example above is translated to BTF as follows: [..] DECL_TAG 'comment:test_expect_msg=0=foo' type_id=X component_idx=-1 [..] DECL_TAG 'comment:test_expect_msg=1=bar' type_id=X component_idx=-1 [..] DECL_TAG 'comment:test_expect_msg=2=foo' type_id=X component_idx=-1 Surprisingly, this bug affects a single existing test: verifier_spill_fill/old_stack_misc_vs_cur_ctx_ptr, where sequence of identical messages was expected in the log. Fixes: 537c3f66eac1 ("selftests/bpf: add generic BPF program tester-loader") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 15 +++--- .../selftests/bpf/progs/verifier_spill_fill.c | 8 ++-- tools/testing/selftests/bpf/test_loader.c | 47 ++++++++++++++----- 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index a225cd87897c..4f1029743734 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -2,6 +2,9 @@ #ifndef __BPF_MISC_H__ #define __BPF_MISC_H__ +#define XSTR(s) STR(s) +#define STR(s) #s + /* This set of attributes controls behavior of the * test_loader.c:test_loader__run_subtests(). * @@ -68,15 +71,15 @@ * Several __arch_* annotations could be specified at once. * When test case is not run on current arch it is marked as skipped. 
*/ -#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) -#define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" regex))) -#define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" msg))) +#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) +#define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" XSTR(__COUNTER__) "=" regex))) +#define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) -#define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" msg))) -#define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" regex))) -#define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" msg))) +#define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" XSTR(__COUNTER__) "=" regex))) +#define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 9d288ec7a168..671d9f415dbf 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -1213,10 +1213,10 @@ __success __log_level(2) * - once for path entry - label 2; * - once for path entry - label 1 - label 2. */ -__msg("r1 = *(u64 *)(r10 -8)") -__msg("exit") -__msg("r1 = *(u64 *)(r10 -8)") -__msg("exit") +__msg("8: (79) r1 = *(u64 *)(r10 -8)") +__msg("9: (95) exit") +__msg("from 2 to 7") +__msg("8: safe") __msg("processed 11 insns") __flag(BPF_F_TEST_STATE_FREQ) __naked void old_stack_misc_vs_cur_ctx_ptr(void) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index f5f5d16ac550..c604a82e03c4 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -215,6 +215,35 @@ static void update_flags(int *flags, int flag, bool clear) *flags |= flag; } +/* Matches a string of form '[^=]=.*' and returns it's suffix. + * Used to parse btf_decl_tag values. + * Such values require unique prefix because compiler does not add + * same __attribute__((btf_decl_tag(...))) twice. + * Test suite uses two-component tags for such cases: + * + * __COUNTER__ '=' + * + * For example, two consecutive __msg tags '__msg("foo") __msg("foo")' + * would be encoded as: + * + * [18] DECL_TAG 'comment:test_expect_msg=0=foo' type_id=15 component_idx=-1 + * [19] DECL_TAG 'comment:test_expect_msg=1=foo' type_id=15 component_idx=-1 + * + * And the purpose of this function is to extract 'foo' from the above. 
+ */ +static const char *skip_dynamic_pfx(const char *s, const char *pfx) +{ + const char *msg; + + if (strncmp(s, pfx, strlen(pfx)) != 0) + return NULL; + msg = s + strlen(pfx); + msg = strchr(msg, '='); + if (!msg) + return NULL; + return msg + 1; +} + enum arch { ARCH_X86_64 = 0x1, ARCH_ARM64 = 0x2, @@ -290,38 +319,32 @@ static int parse_test_spec(struct test_loader *tester, } else if (strcmp(s, TEST_TAG_AUXILIARY_UNPRIV) == 0) { spec->auxiliary = true; spec->mode_mask |= UNPRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) { - msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX))) { err = push_msg(msg, NULL, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV)) { - msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX_UNPRIV) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV))) { err = push_msg(msg, NULL, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX)) { - msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX))) { err = push_msg(NULL, msg, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV)) { - msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX_UNPRIV) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV))) { err = push_msg(NULL, msg, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX)) { - msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { err = push_msg(msg, NULL, &spec->priv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= PRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV)) { - msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX_UNPRIV) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV))) { err = push_msg(msg, NULL, &spec->unpriv.expect_xlated); if (err) goto cleanup; From f8d161756d422598e10a112171a73cf621e67fae Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:52 -0700 Subject: [PATCH 0997/1052] selftests/bpf: replace __regex macro with "{{...}}" patterns Upcoming changes require a notation to specify regular expression matches for regular verifier log messages, disassembly of BPF instructions, disassembly of jited instructions. Neither basic nor extended POSIX regular expressions w/o additional escaping are good for this role because of wide use of special characters in disassembly, for example: movq -0x10(%rbp), %rax ;; () are special characters cmpq $0x21, %rax ;; $ is a special character *(u64 *)(r10 -16) = r1 ;; * and () are special characters This commit borrows syntax from LLVM's FileCheck utility. It replaces __regex macro with ability to embed regular expressions in __msg patters using "{{" "}}" pairs for escaping. Syntax for __msg patterns: pattern := ( | regex)* regex := "{{" "}}" For example, pattern "foo{{[0-9]+}}" matches strings like "foo0", "foo007", etc. 
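For example, one of the conversions in this patch ends up looking like this
(test body elided):

    SEC("?tc")
    __failure __msg("Unreleased reference id=3 alloc_insn={{[0-9]+}}")
    long rbtree_api_remove_no_drop(void *ctx)
    {
            ...
    }

Everything outside the "{{" "}}" pair is matched verbatim (regex metacharacters
are escaped by the test loader), while the bracketed part is compiled as an
extended POSIX regular expression.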
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 9 +- .../testing/selftests/bpf/progs/dynptr_fail.c | 6 +- .../testing/selftests/bpf/progs/rbtree_fail.c | 2 +- .../bpf/progs/refcounted_kptr_fail.c | 4 +- tools/testing/selftests/bpf/test_loader.c | 164 +++++++++++------- 5 files changed, 115 insertions(+), 70 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 4f1029743734..cc3ef20a6490 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -25,12 +25,15 @@ * * __msg Message expected to be found in the verifier log. * Multiple __msg attributes could be specified. + * To match a regular expression use "{{" "}}" brackets, + * e.g. "foo{{[0-9]+}}" matches strings like "foo007". + * Extended POSIX regular expression syntax is allowed + * inside the brackets. * __msg_unpriv Same as __msg but for unprivileged mode. * - * __regex Same as __msg, but using a regular expression. - * __regex_unpriv Same as __msg_unpriv but using a regular expression. * __xlated Expect a line in a disassembly log after verifier applies rewrites. * Multiple __xlated attributes could be specified. + * Regular expressions could be specified same way as in __msg. * __xlated_unpriv Same as __xlated but for unprivileged mode. * * __success Expect program load success in privileged mode. @@ -72,13 +75,11 @@ * When test case is not run on current arch it is marked as skipped. */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) -#define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" XSTR(__COUNTER__) "=" regex))) #define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) -#define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" XSTR(__COUNTER__) "=" regex))) #define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index e35bc1eac52a..68b8c6eca508 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -964,7 +964,7 @@ int dynptr_invalidate_slice_reinit(void *ctx) * mem_or_null pointers. 
*/ SEC("?raw_tp") -__failure __regex("R[0-9]+ type=scalar expected=percpu_ptr_") +__failure __msg("R{{[0-9]+}} type=scalar expected=percpu_ptr_") int dynptr_invalidate_slice_or_null(void *ctx) { struct bpf_dynptr ptr; @@ -982,7 +982,7 @@ int dynptr_invalidate_slice_or_null(void *ctx) /* Destruction of dynptr should also any slices obtained from it */ SEC("?raw_tp") -__failure __regex("R[0-9]+ invalid mem access 'scalar'") +__failure __msg("R{{[0-9]+}} invalid mem access 'scalar'") int dynptr_invalidate_slice_failure(void *ctx) { struct bpf_dynptr ptr1; @@ -1069,7 +1069,7 @@ int dynptr_read_into_slot(void *ctx) /* bpf_dynptr_slice()s are read-only and cannot be written to */ SEC("?tc") -__failure __regex("R[0-9]+ cannot write into rdonly_mem") +__failure __msg("R{{[0-9]+}} cannot write into rdonly_mem") int skb_invalid_slice_write(struct __sk_buff *skb) { struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index b722a1e1ddef..dbd5eee8e25e 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -105,7 +105,7 @@ long rbtree_api_remove_unadded_node(void *ctx) } SEC("?tc") -__failure __regex("Unreleased reference id=3 alloc_insn=[0-9]+") +__failure __msg("Unreleased reference id=3 alloc_insn={{[0-9]+}}") long rbtree_api_remove_no_drop(void *ctx) { struct bpf_rb_node *res; diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c index f8d4b7cfcd68..836c8ab7b908 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c @@ -32,7 +32,7 @@ static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b) } SEC("?tc") -__failure __regex("Unreleased reference id=4 alloc_insn=[0-9]+") +__failure __msg("Unreleased reference id=4 alloc_insn={{[0-9]+}}") long rbtree_refcounted_node_ref_escapes(void *ctx) { struct node_acquire *n, *m; @@ -73,7 +73,7 @@ long refcount_acquire_maybe_null(void *ctx) } SEC("?tc") -__failure __regex("Unreleased reference id=3 alloc_insn=[0-9]+") +__failure __msg("Unreleased reference id=3 alloc_insn={{[0-9]+}}") long rbtree_refcounted_node_ref_escapes_owning_input(void *ctx) { struct node_acquire *n, *m; diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index c604a82e03c4..b0d7158e00c1 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -19,12 +19,10 @@ #define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" #define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" -#define TEST_TAG_EXPECT_REGEX_PFX "comment:test_expect_regex=" #define TEST_TAG_EXPECT_XLATED_PFX "comment:test_expect_xlated=" #define TEST_TAG_EXPECT_FAILURE_UNPRIV "comment:test_expect_failure_unpriv" #define TEST_TAG_EXPECT_SUCCESS_UNPRIV "comment:test_expect_success_unpriv" #define TEST_TAG_EXPECT_MSG_PFX_UNPRIV "comment:test_expect_msg_unpriv=" -#define TEST_TAG_EXPECT_REGEX_PFX_UNPRIV "comment:test_expect_regex_unpriv=" #define TEST_TAG_EXPECT_XLATED_PFX_UNPRIV "comment:test_expect_xlated_unpriv=" #define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" #define TEST_TAG_PROG_FLAGS_PFX "comment:test_prog_flags=" @@ -55,8 +53,9 @@ enum mode { struct expect_msg { const char *substr; /* substring match */ - const char *regex_str; /* regex-based match */ 
regex_t regex; + bool is_regex; + bool on_next_line; }; struct expected_msgs { @@ -111,7 +110,7 @@ static void free_msgs(struct expected_msgs *msgs) int i; for (i = 0; i < msgs->cnt; i++) - if (msgs->patterns[i].regex_str) + if (msgs->patterns[i].is_regex) regfree(&msgs->patterns[i].regex); free(msgs->patterns); msgs->patterns = NULL; @@ -132,12 +131,71 @@ static void free_test_spec(struct test_spec *spec) spec->unpriv.name = NULL; } -static int push_msg(const char *substr, const char *regex_str, struct expected_msgs *msgs) +/* Compiles regular expression matching pattern. + * Pattern has a special syntax: + * + * pattern := ( | regex)* + * regex := "{{" "}}" + * + * In other words, pattern is a verbatim text with inclusion + * of regular expressions enclosed in "{{" "}}" pairs. + * For example, pattern "foo{{[0-9]+}}" matches strings like + * "foo0", "foo007", etc. + */ +static int compile_regex(const char *pattern, regex_t *regex) +{ + char err_buf[256], buf[256] = {}, *ptr, *buf_end; + const char *original_pattern = pattern; + bool in_regex = false; + int err; + + buf_end = buf + sizeof(buf); + ptr = buf; + while (*pattern && ptr < buf_end - 2) { + if (!in_regex && str_has_pfx(pattern, "{{")) { + in_regex = true; + pattern += 2; + continue; + } + if (in_regex && str_has_pfx(pattern, "}}")) { + in_regex = false; + pattern += 2; + continue; + } + if (in_regex) { + *ptr++ = *pattern++; + continue; + } + /* list of characters that need escaping for extended posix regex */ + if (strchr(".[]\\()*+?{}|^$", *pattern)) { + *ptr++ = '\\'; + *ptr++ = *pattern++; + continue; + } + *ptr++ = *pattern++; + } + if (*pattern) { + PRINT_FAIL("Regexp too long: '%s'\n", original_pattern); + return -EINVAL; + } + if (in_regex) { + PRINT_FAIL("Regexp has open '{{' but no closing '}}': '%s'\n", original_pattern); + return -EINVAL; + } + err = regcomp(regex, buf, REG_EXTENDED | REG_NEWLINE); + if (err != 0) { + regerror(err, regex, err_buf, sizeof(err_buf)); + PRINT_FAIL("Regexp compilation error in '%s': '%s'\n", buf, err_buf); + return -EINVAL; + } + return 0; +} + +static int __push_msg(const char *pattern, bool on_next_line, struct expected_msgs *msgs) { - void *tmp; - int regcomp_res; - char error_msg[100]; struct expect_msg *msg; + void *tmp; + int err; tmp = realloc(msgs->patterns, (1 + msgs->cnt) * sizeof(struct expect_msg)); @@ -147,26 +205,38 @@ static int push_msg(const char *substr, const char *regex_str, struct expected_m } msgs->patterns = tmp; msg = &msgs->patterns[msgs->cnt]; - - if (substr) { - msg->substr = substr; - msg->regex_str = NULL; - } else { - msg->regex_str = regex_str; - msg->substr = NULL; - regcomp_res = regcomp(&msg->regex, regex_str, REG_EXTENDED|REG_NEWLINE); - if (regcomp_res != 0) { - regerror(regcomp_res, &msg->regex, error_msg, sizeof(error_msg)); - PRINT_FAIL("Regexp compilation error in '%s': '%s'\n", - regex_str, error_msg); - return -EINVAL; - } + msg->on_next_line = on_next_line; + msg->substr = pattern; + msg->is_regex = false; + if (strstr(pattern, "{{")) { + err = compile_regex(pattern, &msg->regex); + if (err) + return err; + msg->is_regex = true; } - msgs->cnt += 1; return 0; } +static int clone_msgs(struct expected_msgs *from, struct expected_msgs *to) +{ + struct expect_msg *msg; + int i, err; + + for (i = 0; i < from->cnt; i++) { + msg = &from->patterns[i]; + err = __push_msg(msg->substr, msg->on_next_line, to); + if (err) + return err; + } + return 0; +} + +static int push_msg(const char *substr, struct expected_msgs *msgs) +{ + return __push_msg(substr, 
false, msgs); +} + static int parse_int(const char *str, int *val, const char *name) { char *end; @@ -320,32 +390,22 @@ static int parse_test_spec(struct test_loader *tester, spec->auxiliary = true; spec->mode_mask |= UNPRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX))) { - err = push_msg(msg, NULL, &spec->priv.expect_msgs); + err = push_msg(msg, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV))) { - err = push_msg(msg, NULL, &spec->unpriv.expect_msgs); - if (err) - goto cleanup; - spec->mode_mask |= UNPRIV; - } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX))) { - err = push_msg(NULL, msg, &spec->priv.expect_msgs); - if (err) - goto cleanup; - spec->mode_mask |= PRIV; - } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV))) { - err = push_msg(NULL, msg, &spec->unpriv.expect_msgs); + err = push_msg(msg, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { - err = push_msg(msg, NULL, &spec->priv.expect_xlated); + err = push_msg(msg, &spec->priv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV))) { - err = push_msg(msg, NULL, &spec->unpriv.expect_xlated); + err = push_msg(msg, &spec->unpriv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= UNPRIV; @@ -457,26 +517,10 @@ static int parse_test_spec(struct test_loader *tester, spec->unpriv.execute = spec->priv.execute; } - if (spec->unpriv.expect_msgs.cnt == 0) { - for (i = 0; i < spec->priv.expect_msgs.cnt; i++) { - struct expect_msg *msg = &spec->priv.expect_msgs.patterns[i]; - - err = push_msg(msg->substr, msg->regex_str, - &spec->unpriv.expect_msgs); - if (err) - goto cleanup; - } - } - if (spec->unpriv.expect_xlated.cnt == 0) { - for (i = 0; i < spec->priv.expect_xlated.cnt; i++) { - struct expect_msg *msg = &spec->priv.expect_xlated.patterns[i]; - - err = push_msg(msg->substr, msg->regex_str, - &spec->unpriv.expect_xlated); - if (err) - goto cleanup; - } - } + if (spec->unpriv.expect_msgs.cnt == 0) + clone_msgs(&spec->priv.expect_msgs, &spec->unpriv.expect_msgs); + if (spec->unpriv.expect_xlated.cnt == 0) + clone_msgs(&spec->priv.expect_xlated, &spec->unpriv.expect_xlated); } spec->valid = true; @@ -542,7 +586,7 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, struct expect_msg *msg = &msgs->patterns[i]; const char *match = NULL; - if (msg->substr) { + if (!msg->is_regex) { match = strstr(log, msg->substr); if (match) log = match + strlen(msg->substr); @@ -562,8 +606,8 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", j < i ? "MATCHED " : "EXPECTED", - msg->substr ? "SUBSTR" : " REGEX", - msg->substr ?: msg->regex_str); + msg->is_regex ? " REGEX" : "SUBSTR", + msg->substr); } return; } From b991fc52070042468f31b70fb8ccab96ba351f8a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:53 -0700 Subject: [PATCH 0998/1052] selftests/bpf: utility function to get program disassembly after jit This commit adds a utility function to get disassembled text for jited representation of a BPF program designated by file descriptor. 
Function prototype looks as follows: int get_jited_program_text(int fd, char *text, size_t text_sz) Where 'fd' is a file descriptor for the program, 'text' and 'text_sz' refer to a destination buffer for disassembled text. Output format looks as follows: 18: 77 06 ja L0 1a: 50 pushq %rax 1b: 48 89 e0 movq %rsp, %rax 1e: eb 01 jmp L1 20: 50 L0: pushq %rax 21: 50 L1: pushq %rax ^ ^^^^^^^^ ^ ^^^^^^^^^^^^^^^^^^ | binary insn | textual insn | representation | representation | | instruction offset inferred local label name The code and makefile changes are inspired by jit_disasm.c from bpftool. Use llvm libraries to disassemble BPF program instead of libbfd to avoid issues with disassembly output stability pointed out in [1]. Selftests makefile uses Makefile.feature to detect if LLVM libraries are available. If that is not the case selftests build proceeds but the function returns -EOPNOTSUPP at runtime. [1] commit eb9d1acf634b ("bpftool: Add LLVM as default library for disassembling JIT-ed programs") Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 48 +++- .../selftests/bpf/jit_disasm_helpers.c | 234 ++++++++++++++++++ .../selftests/bpf/jit_disasm_helpers.h | 10 + 4 files changed, 291 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/jit_disasm_helpers.c create mode 100644 tools/testing/selftests/bpf/jit_disasm_helpers.h diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 99ffea1fa5c6..e6533b3400de 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -8,6 +8,7 @@ test_lru_map test_lpm_map test_tag FEATURE-DUMP.libbpf +FEATURE-DUMP.selftests fixdep /test_progs /test_progs-no_alu32 diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7bca9a2b8bc0..6d6d0c29db47 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -33,6 +33,13 @@ OPT_FLAGS ?= $(if $(RELEASE),-O2,-O0) LIBELF_CFLAGS := $(shell $(PKG_CONFIG) libelf --cflags 2>/dev/null) LIBELF_LIBS := $(shell $(PKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf) +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + CFLAGS += -g $(OPT_FLAGS) -rdynamic \ -Wall -Werror -fno-omit-frame-pointer \ $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ @@ -60,6 +67,9 @@ progs/timer_crash.c-CFLAGS := -fno-strict-aliasing progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing +# Some utility functions use LLVM libraries +jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) + ifneq ($(LLVM),) # Silence some warnings when compiled with clang CFLAGS += -Wno-unused-command-line-argument @@ -168,6 +178,31 @@ endef include ../lib.mk +NON_CHECK_FEAT_TARGETS := clean docs-clean +CHECK_FEAT := $(filter-out $(NON_CHECK_FEAT_TARGETS),$(or $(MAKECMDGOALS), "none")) +ifneq ($(CHECK_FEAT),) +FEATURE_USER := .selftests +FEATURE_TESTS := llvm +FEATURE_DISPLAY := $(FEATURE_TESTS) + +# Makefile.feature expects OUTPUT to end with a slash +$(let OUTPUT,$(OUTPUT)/,\ + $(eval include ../../../build/Makefile.feature)) +endif + +ifeq ($(feature-llvm),1) + LLVM_CFLAGS += 
-DHAVE_LLVM_SUPPORT + LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets + # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict + LLVM_CFLAGS += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags)) + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LLVM_LDLIBS += -lstdc++ + endif + LLVM_LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags) +endif + SCRATCH_DIR := $(OUTPUT)/tools BUILD_DIR := $(SCRATCH_DIR)/build INCLUDE_DIR := $(SCRATCH_DIR)/include @@ -612,6 +647,10 @@ ifeq ($(filter clean docs-clean,$(MAKECMDGOALS)),) include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) endif +# add per extra obj CFGLAGS definitions +$(foreach N,$(patsubst $(TRUNNER_OUTPUT)/%.o,%,$(TRUNNER_EXTRA_OBJS)), \ + $(eval $(TRUNNER_OUTPUT)/$(N).o: CFLAGS += $($(N).c-CFLAGS))) + $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ $(TRUNNER_EXTRA_HDRS) \ @@ -628,6 +667,9 @@ ifneq ($2:$(OUTPUT),:$(shell pwd)) $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif +$(OUTPUT)/$(TRUNNER_BINARY): LDLIBS += $$(LLVM_LDLIBS) +$(OUTPUT)/$(TRUNNER_BINARY): LDFLAGS += $$(LLVM_LDFLAGS) + # some X.test.o files have runtime dependencies on Y.bpf.o files $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) @@ -637,7 +679,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_BPFTOOL) \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) - $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ + $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LDFLAGS) -o $$@ $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \ $(OUTPUT)/$(if $2,$2/)bpftool @@ -656,6 +698,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ cap_helpers.c \ unpriv_helpers.c \ netlink_helpers.c \ + jit_disasm_helpers.c \ test_loader.c \ xsk.c \ disasm.c \ @@ -798,7 +841,8 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ no_alu32 cpuv4 bpf_gcc bpf_testmod.ko \ bpf_test_no_cfi.ko \ - liburandom_read.so) + liburandom_read.so) \ + $(OUTPUT)/FEATURE-DUMP.selftests .PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c new file mode 100644 index 000000000000..1b0f1fd267c0 --- /dev/null +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include +#include +#include + +#ifdef HAVE_LLVM_SUPPORT + +#include +#include +#include +#include + +/* The intent is to use get_jited_program_text() for small test + * programs written in BPF assembly, thus assume that 32 local labels + * would be sufficient. 
+ */ +#define MAX_LOCAL_LABELS 32 + +static bool llvm_initialized; + +struct local_labels { + bool print_phase; + __u32 prog_len; + __u32 cnt; + __u32 pcs[MAX_LOCAL_LABELS]; + char names[MAX_LOCAL_LABELS][4]; +}; + +static const char *lookup_symbol(void *data, uint64_t ref_value, uint64_t *ref_type, + uint64_t ref_pc, const char **ref_name) +{ + struct local_labels *labels = data; + uint64_t type = *ref_type; + int i; + + *ref_type = LLVMDisassembler_ReferenceType_InOut_None; + *ref_name = NULL; + if (type != LLVMDisassembler_ReferenceType_In_Branch) + return NULL; + /* Depending on labels->print_phase either discover local labels or + * return a name assigned with local jump target: + * - if print_phase is true and ref_value is in labels->pcs, + * return corresponding labels->name. + * - if print_phase is false, save program-local jump targets + * in labels->pcs; + */ + if (labels->print_phase) { + for (i = 0; i < labels->cnt; ++i) + if (labels->pcs[i] == ref_value) + return labels->names[i]; + } else { + if (labels->cnt < MAX_LOCAL_LABELS && ref_value < labels->prog_len) + labels->pcs[labels->cnt++] = ref_value; + } + return NULL; +} + +static int disasm_insn(LLVMDisasmContextRef ctx, uint8_t *image, __u32 len, __u32 pc, + char *buf, __u32 buf_sz) +{ + int i, cnt; + + cnt = LLVMDisasmInstruction(ctx, image + pc, len - pc, pc, + buf, buf_sz); + if (cnt > 0) + return cnt; + PRINT_FAIL("Can't disasm instruction at offset %d:", pc); + for (i = 0; i < 16 && pc + i < len; ++i) + printf(" %02x", image[pc + i]); + printf("\n"); + return -EINVAL; +} + +static int cmp_u32(const void *_a, const void *_b) +{ + __u32 a = *(__u32 *)_a; + __u32 b = *(__u32 *)_b; + + if (a < b) + return -1; + if (a > b) + return 1; + return 0; +} + +static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) +{ + char *label, *colon, *triple = NULL; + LLVMDisasmContextRef ctx = NULL; + struct local_labels labels = {}; + __u32 *label_pc, pc; + int i, cnt, err = 0; + char buf[64]; + + triple = LLVMGetDefaultTargetTriple(); + ctx = LLVMCreateDisasm(triple, &labels, 0, NULL, lookup_symbol); + if (!ASSERT_OK_PTR(ctx, "LLVMCreateDisasm")) { + err = -EINVAL; + goto out; + } + + cnt = LLVMSetDisasmOptions(ctx, LLVMDisassembler_Option_PrintImmHex); + if (!ASSERT_EQ(cnt, 1, "LLVMSetDisasmOptions")) { + err = -EINVAL; + goto out; + } + + /* discover labels */ + labels.prog_len = len; + pc = 0; + while (pc < len) { + cnt = disasm_insn(ctx, image, len, pc, buf, 1); + if (cnt < 0) { + err = cnt; + goto out; + } + pc += cnt; + } + qsort(labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); + for (i = 0; i < labels.cnt; ++i) + /* use (i % 100) to avoid format truncation warning */ + snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % 100); + + /* now print with labels */ + labels.print_phase = true; + pc = 0; + while (pc < len) { + cnt = disasm_insn(ctx, image, len, pc, buf, sizeof(buf)); + if (cnt < 0) { + err = cnt; + goto out; + } + label_pc = bsearch(&pc, labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); + label = ""; + colon = ""; + if (label_pc) { + label = labels.names[label_pc - labels.pcs]; + colon = ":"; + } + fprintf(text_out, "%x:\t", pc); + for (i = 0; i < cnt; ++i) + fprintf(text_out, "%02x ", image[pc + i]); + for (i = cnt * 3; i < 12 * 3; ++i) + fputc(' ', text_out); + fprintf(text_out, "%s%s%s\n", label, colon, buf); + pc += cnt; + } + +out: + if (triple) + LLVMDisposeMessage(triple); + if (ctx) + LLVMDisasmDispose(ctx); + return err; +} + +int get_jited_program_text(int fd, char *text, size_t 
text_sz) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + __u32 jited_funcs, len, pc; + __u32 *func_lens = NULL; + FILE *text_out = NULL; + uint8_t *image = NULL; + int i, err = 0; + + if (!llvm_initialized) { + LLVMInitializeAllTargetInfos(); + LLVMInitializeAllTargetMCs(); + LLVMInitializeAllDisassemblers(); + llvm_initialized = 1; + } + + text_out = fmemopen(text, text_sz, "w"); + if (!ASSERT_OK_PTR(text_out, "open_memstream")) { + err = -errno; + goto out; + } + + /* first call is to find out jited program len */ + err = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd #1")) + goto out; + + len = info.jited_prog_len; + image = malloc(len); + if (!ASSERT_OK_PTR(image, "malloc(info.jited_prog_len)")) { + err = -ENOMEM; + goto out; + } + + jited_funcs = info.nr_jited_func_lens; + func_lens = malloc(jited_funcs * sizeof(__u32)); + if (!ASSERT_OK_PTR(func_lens, "malloc(info.nr_jited_func_lens)")) { + err = -ENOMEM; + goto out; + } + + memset(&info, 0, sizeof(info)); + info.jited_prog_insns = (__u64)image; + info.jited_prog_len = len; + info.jited_func_lens = (__u64)func_lens; + info.nr_jited_func_lens = jited_funcs; + err = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd #2")) + goto out; + + for (pc = 0, i = 0; i < jited_funcs; ++i) { + fprintf(text_out, "func #%d:\n", i); + disasm_one_func(text_out, image + pc, func_lens[i]); + fprintf(text_out, "\n"); + pc += func_lens[i]; + } + +out: + if (text_out) + fclose(text_out); + if (image) + free(image); + if (func_lens) + free(func_lens); + return err; +} + +#else /* HAVE_LLVM_SUPPORT */ + +int get_jited_program_text(int fd, char *text, size_t text_sz) +{ + if (env.verbosity >= VERBOSE_VERY) + printf("compiled w/o llvm development libraries, can't dis-assembly binary code"); + return -EOPNOTSUPP; +} + +#endif /* HAVE_LLVM_SUPPORT */ diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.h b/tools/testing/selftests/bpf/jit_disasm_helpers.h new file mode 100644 index 000000000000..e6924fd65ecf --- /dev/null +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __JIT_DISASM_HELPERS_H +#define __JIT_DISASM_HELPERS_H + +#include + +int get_jited_program_text(int fd, char *text, size_t text_sz); + +#endif /* __JIT_DISASM_HELPERS_H */ From 7d743e4c759c6bff0131d638158c3472358eda2b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:54 -0700 Subject: [PATCH 0999/1052] selftests/bpf: __jited test tag to check disassembly after jit Allow to verify jit behaviour by writing tests as below: SEC("tp") __arch_x86_64 __jited(" endbr64") __jited(" nopl (%rax,%rax)") __jited(" xorq %rax, %rax") ... __naked void some_test(void) { asm volatile (... ::: __clobber_all); } Allow regular expressions in patterns, same way as in __msg. By default assume that each __jited pattern has to be matched on the next consecutive line of the disassembly, e.g.: __jited(" endbr64") # matched on line N __jited(" nopl (%rax,%rax)") # matched on line N+1 If match occurs on a wrong line an error is reported. 
To override this behaviour use __jited("..."), e.g.: __jited(" endbr64") # matched on line N __jited("...") # not matched __jited(" nopl (%rax,%rax)") # matched on any line >= N Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 35 +++++ tools/testing/selftests/bpf/test_loader.c | 149 ++++++++++++++++--- 2 files changed, 161 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index cc3ef20a6490..eccaf955e394 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -36,6 +36,39 @@ * Regular expressions could be specified same way as in __msg. * __xlated_unpriv Same as __xlated but for unprivileged mode. * + * __jited Match a line in a disassembly of the jited BPF program. + * Has to be used after __arch_* macro. + * For example: + * + * __arch_x86_64 + * __jited(" endbr64") + * __jited(" nopl (%rax,%rax)") + * __jited(" xorq %rax, %rax") + * ... + * __naked void some_test(void) + * { + * asm volatile (... ::: __clobber_all); + * } + * + * Regular expressions could be included in patterns same way + * as in __msg. + * + * By default assume that each pattern has to be matched on the + * next consecutive line of disassembly, e.g.: + * + * __jited(" endbr64") # matched on line N + * __jited(" nopl (%rax,%rax)") # matched on line N+1 + * + * If match occurs on a wrong line an error is reported. + * To override this behaviour use literal "...", e.g.: + * + * __jited(" endbr64") # matched on line N + * __jited("...") # not matched + * __jited(" nopl (%rax,%rax)") # matched on any line >= N + * + * __jited_unpriv Same as __jited but for unprivileged mode. + * + * * __success Expect program load success in privileged mode. * __success_unpriv Expect program load success in unprivileged mode. 
* @@ -76,11 +109,13 @@ */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) #define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) +#define __jited(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __jited_unpriv(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index b0d7158e00c1..d588c612ac03 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -10,6 +10,7 @@ #include "disasm_helpers.h" #include "unpriv_helpers.h" #include "cap_helpers.h" +#include "jit_disasm_helpers.h" #define str_has_pfx(str, pfx) \ (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0) @@ -33,6 +34,8 @@ #define TEST_TAG_AUXILIARY_UNPRIV "comment:test_auxiliary_unpriv" #define TEST_BTF_PATH "comment:test_btf_path=" #define TEST_TAG_ARCH "comment:test_arch=" +#define TEST_TAG_JITED_PFX "comment:test_jited=" +#define TEST_TAG_JITED_PFX_UNPRIV "comment:test_jited_unpriv=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xcafe4all @@ -68,6 +71,7 @@ struct test_subspec { bool expect_failure; struct expected_msgs expect_msgs; struct expected_msgs expect_xlated; + struct expected_msgs jited; int retval; bool execute; }; @@ -124,6 +128,8 @@ static void free_test_spec(struct test_spec *spec) free_msgs(&spec->unpriv.expect_msgs); free_msgs(&spec->priv.expect_xlated); free_msgs(&spec->unpriv.expect_xlated); + free_msgs(&spec->priv.jited); + free_msgs(&spec->unpriv.jited); free(spec->priv.name); free(spec->unpriv.name); @@ -237,6 +243,21 @@ static int push_msg(const char *substr, struct expected_msgs *msgs) return __push_msg(substr, false, msgs); } +static int push_disasm_msg(const char *regex_str, bool *on_next_line, struct expected_msgs *msgs) +{ + int err; + + if (strcmp(regex_str, "...") == 0) { + *on_next_line = false; + return 0; + } + err = __push_msg(regex_str, *on_next_line, msgs); + if (err) + return err; + *on_next_line = true; + return 0; +} + static int parse_int(const char *str, int *val, const char *name) { char *end; @@ -320,6 +341,18 @@ enum arch { ARCH_RISCV64 = 0x4, }; +static int get_current_arch(void) +{ +#if defined(__x86_64__) + return ARCH_X86_64; +#elif defined(__aarch64__) + return ARCH_ARM64; +#elif defined(__riscv) && __riscv_xlen == 64 + return ARCH_RISCV64; +#endif + return 0; +} + /* Uses btf_decl_tag attributes to describe the expected test * behavior, see bpf_misc.h for detailed description of each attribute * and attribute combinations. 
@@ -332,9 +365,13 @@ static int parse_test_spec(struct test_loader *tester, const char *description = NULL; bool has_unpriv_result = false; bool has_unpriv_retval = false; + bool unpriv_jit_on_next_line; + bool jit_on_next_line; + bool collect_jit = false; int func_id, i, err = 0; u32 arch_mask = 0; struct btf *btf; + enum arch arch; memset(spec, 0, sizeof(*spec)); @@ -399,6 +436,30 @@ static int parse_test_spec(struct test_loader *tester, if (err) goto cleanup; spec->mode_mask |= UNPRIV; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_JITED_PFX))) { + if (arch_mask == 0) { + PRINT_FAIL("__jited used before __arch_*"); + goto cleanup; + } + if (collect_jit) { + err = push_disasm_msg(msg, &jit_on_next_line, + &spec->priv.jited); + if (err) + goto cleanup; + spec->mode_mask |= PRIV; + } + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_JITED_PFX_UNPRIV))) { + if (arch_mask == 0) { + PRINT_FAIL("__unpriv_jited used before __arch_*"); + goto cleanup; + } + if (collect_jit) { + err = push_disasm_msg(msg, &unpriv_jit_on_next_line, + &spec->unpriv.jited); + if (err) + goto cleanup; + spec->mode_mask |= UNPRIV; + } } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { err = push_msg(msg, &spec->priv.expect_xlated); if (err) @@ -459,16 +520,20 @@ static int parse_test_spec(struct test_loader *tester, } else if (str_has_pfx(s, TEST_TAG_ARCH)) { val = s + sizeof(TEST_TAG_ARCH) - 1; if (strcmp(val, "X86_64") == 0) { - arch_mask |= ARCH_X86_64; + arch = ARCH_X86_64; } else if (strcmp(val, "ARM64") == 0) { - arch_mask |= ARCH_ARM64; + arch = ARCH_ARM64; } else if (strcmp(val, "RISCV64") == 0) { - arch_mask |= ARCH_RISCV64; + arch = ARCH_RISCV64; } else { PRINT_FAIL("bad arch spec: '%s'", val); err = -EINVAL; goto cleanup; } + arch_mask |= arch; + collect_jit = get_current_arch() == arch; + unpriv_jit_on_next_line = true; + jit_on_next_line = true; } else if (str_has_pfx(s, TEST_BTF_PATH)) { spec->btf_custom_path = s + sizeof(TEST_BTF_PATH) - 1; } @@ -521,6 +586,8 @@ static int parse_test_spec(struct test_loader *tester, clone_msgs(&spec->priv.expect_msgs, &spec->unpriv.expect_msgs); if (spec->unpriv.expect_xlated.cnt == 0) clone_msgs(&spec->priv.expect_xlated, &spec->unpriv.expect_xlated); + if (spec->unpriv.jited.cnt == 0) + clone_msgs(&spec->priv.jited, &spec->unpriv.jited); } spec->valid = true; @@ -575,16 +642,29 @@ static void emit_xlated(const char *xlated, bool force) fprintf(stdout, "XLATED:\n=============\n%s=============\n", xlated); } +static void emit_jited(const char *jited, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "JITED:\n=============\n%s=============\n", jited); +} + static void validate_msgs(char *log_buf, struct expected_msgs *msgs, void (*emit_fn)(const char *buf, bool force)) { + const char *log = log_buf, *prev_match; regmatch_t reg_match[1]; - const char *log = log_buf; + int prev_match_line; + int match_line; int i, j, err; + prev_match_line = -1; + match_line = 0; + prev_match = log; for (i = 0; i < msgs->cnt; i++) { struct expect_msg *msg = &msgs->patterns[i]; - const char *match = NULL; + const char *match = NULL, *pat_status; + bool wrong_line = false; if (!msg->is_regex) { match = strstr(log, msg->substr); @@ -598,19 +678,41 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, } } - if (!match) { + if (match) { + for (; prev_match < match; ++prev_match) + if (*prev_match == '\n') + ++match_line; + wrong_line = msg->on_next_line && prev_match_line >= 0 && + prev_match_line + 1 != match_line; + 
} + + if (!match || wrong_line) { PRINT_FAIL("expect_msg\n"); if (env.verbosity == VERBOSE_NONE) emit_fn(log_buf, true /*force*/); for (j = 0; j <= i; j++) { msg = &msgs->patterns[j]; + if (j < i) + pat_status = "MATCHED "; + else if (wrong_line) + pat_status = "WRONG LINE"; + else + pat_status = "EXPECTED "; + msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", - j < i ? "MATCHED " : "EXPECTED", + pat_status, msg->is_regex ? " REGEX" : "SUBSTR", msg->substr); } - return; + if (wrong_line) { + fprintf(stderr, + "expecting match at line %d, actual match is at line %d\n", + prev_match_line + 1, match_line); + } + break; } + + prev_match_line = match_line; } } @@ -769,20 +871,6 @@ static int get_xlated_program_text(int prog_fd, char *text, size_t text_sz) return err; } -static bool run_on_current_arch(int arch_mask) -{ - if (arch_mask == 0) - return true; -#if defined(__x86_64__) - return arch_mask & ARCH_X86_64; -#elif defined(__aarch64__) - return arch_mask & ARCH_ARM64; -#elif defined(__riscv) && __riscv_xlen == 64 - return arch_mask & ARCH_RISCV64; -#endif - return false; -} - /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -807,7 +895,7 @@ void run_subtest(struct test_loader *tester, if (!test__start_subtest(subspec->name)) return; - if (!run_on_current_arch(spec->arch_mask)) { + if ((get_current_arch() & spec->arch_mask) == 0) { test__skip(); return; } @@ -884,6 +972,21 @@ void run_subtest(struct test_loader *tester, validate_msgs(tester->log_buf, &subspec->expect_xlated, emit_xlated); } + if (subspec->jited.cnt) { + err = get_jited_program_text(bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err == -EOPNOTSUPP) { + printf("%s:SKIP: jited programs disassembly is not supported,\n", __func__); + printf("%s:SKIP: tests are built w/o LLVM development libs\n", __func__); + test__skip(); + goto tobj_cleanup; + } + if (!ASSERT_EQ(err, 0, "get_jited_program_text")) + goto tobj_cleanup; + emit_jited(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->jited, emit_jited); + } + if (should_do_test_run(spec, subspec)) { /* For some reason test_verifier executes programs * with all capabilities restored. Do the same here. From e5bdd6a8be783eb0c960723d9f37df7ee931d6d6 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:56 -0700 Subject: [PATCH 1000/1052] selftests/bpf: validate jit behaviour for tail calls A program calling sub-program which does a tail call. The idea is to verify instructions generated by jit for tail calls: - in program and sub-program prologues; - for subprogram call instruction; - for tail call itself. 
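As an illustration, the control flow being exercised corresponds roughly to the plain
BPF C below. This is a sketch only, with made-up names, not the test itself: the real
test further down is written in __naked inline asm (presumably so the generated
instruction sequence stays predictable) and it seeds slot 0 of the program array with
the entry program itself so that the tail call actually fires; here the array is left
empty and the tail call would simply fall through.

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
            __uint(max_entries, 1);
            __uint(key_size, sizeof(__u32));
            __uint(value_size, sizeof(__u32));
    } jmp_table SEC(".maps");

    static __noinline int sub_prog(struct __sk_buff *skb)
    {
            /* "call 12" in the asm version is bpf_tail_call() */
            bpf_tail_call(skb, &jmp_table, 0);
            return 0;
    }

    SEC("tc")
    int entry_prog(struct __sk_buff *skb)
    {
            sub_prog(skb);
            return 0;
    }

    char _license[] SEC("license") = "GPL";

The __jited annotations in the test then pin down the exact x86-64 instructions the
JIT emits for the entry prologue, the subprogram call and the tail call dispatch.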
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../bpf/progs/verifier_tailcall_jit.c | 105 ++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index f8f546eba488..cf3662dbd24f 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -75,6 +75,7 @@ #include "verifier_stack_ptr.skel.h" #include "verifier_subprog_precision.skel.h" #include "verifier_subreg.skel.h" +#include "verifier_tailcall_jit.skel.h" #include "verifier_typedef.skel.h" #include "verifier_uninit.skel.h" #include "verifier_unpriv.skel.h" @@ -198,6 +199,7 @@ void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } void test_verifier_subprog_precision(void) { RUN(verifier_subprog_precision); } void test_verifier_subreg(void) { RUN(verifier_subreg); } +void test_verifier_tailcall_jit(void) { RUN(verifier_tailcall_jit); } void test_verifier_typedef(void) { RUN(verifier_typedef); } void test_verifier_uninit(void) { RUN(verifier_uninit); } void test_verifier_unpriv(void) { RUN(verifier_unpriv); } diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c new file mode 100644 index 000000000000..06d327cf1e1f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" + +int main(void); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps") = { + .values = { + [0] = (void *) &main, + }, +}; + +__noinline __auxiliary +static __naked int sub(void) +{ + asm volatile ( + "r2 = %[jmp_table] ll;" + "r3 = 0;" + "call 12;" + "exit;" + : + : __imm_addr(jmp_table) + : __clobber_all); +} + +__success +__arch_x86_64 +/* program entry for main(), regular function prologue */ +__jited(" endbr64") +__jited(" nopl (%rax,%rax)") +__jited(" xorq %rax, %rax") +__jited(" pushq %rbp") +__jited(" movq %rsp, %rbp") +/* tail call prologue for program: + * - establish memory location for tail call counter at &rbp[-8]; + * - spill tail_call_cnt_ptr at &rbp[-16]; + * - expect tail call counter to be passed in rax; + * - for entry program rax is a raw counter, value < 33; + * - for tail called program rax is tail_call_cnt_ptr (value > 33). 
+ */ +__jited(" endbr64") +__jited(" cmpq $0x21, %rax") +__jited(" ja L0") +__jited(" pushq %rax") +__jited(" movq %rsp, %rax") +__jited(" jmp L1") +__jited("L0: pushq %rax") /* rbp[-8] = rax */ +__jited("L1: pushq %rax") /* rbp[-16] = rax */ +/* on subprogram call restore rax to be tail_call_cnt_ptr from rbp[-16] + * (cause original rax might be clobbered by this point) + */ +__jited(" movq -0x10(%rbp), %rax") +__jited(" callq 0x{{.*}}") /* call to sub() */ +__jited(" xorl %eax, %eax") +__jited(" leave") +__jited(" retq") +__jited("...") +/* subprogram entry for sub(), regular function prologue */ +__jited(" endbr64") +__jited(" nopl (%rax,%rax)") +__jited(" nopl (%rax)") +__jited(" pushq %rbp") +__jited(" movq %rsp, %rbp") +/* tail call prologue for subprogram address of tail call counter + * stored at rbp[-16]. + */ +__jited(" endbr64") +__jited(" pushq %rax") /* rbp[-8] = rax */ +__jited(" pushq %rax") /* rbp[-16] = rax */ +__jited(" movabsq ${{.*}}, %rsi") /* r2 = &jmp_table */ +__jited(" xorl %edx, %edx") /* r3 = 0 */ +/* bpf_tail_call implementation: + * - load tail_call_cnt_ptr from rbp[-16]; + * - if *tail_call_cnt_ptr < 33, increment it and jump to target; + * - otherwise do nothing. + */ +__jited(" movq -0x10(%rbp), %rax") +__jited(" cmpq $0x21, (%rax)") +__jited(" jae L0") +__jited(" nopl (%rax,%rax)") +__jited(" addq $0x1, (%rax)") /* *tail_call_cnt_ptr += 1 */ +__jited(" popq %rax") +__jited(" popq %rax") +__jited(" jmp {{.*}}") /* jump to tail call tgt */ +__jited("L0: leave") +__jited(" retq") +SEC("tc") +__naked int main(void) +{ + asm volatile ( + "call %[sub];" + "r0 = 0;" + "exit;" + : + : __imm(sub) + : __clobber_all); +} + +char __license[] SEC("license") = "GPL"; From a038eacdbf59e6024c9e8da5df7f293e64cf20de Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:57 -0700 Subject: [PATCH 1001/1052] selftests/bpf: validate __xlated same way as __jited Both __xlated and __jited work with disassembly. It is logical to have both work in a similar manner. This commit updates __xlated macro handling in test_loader.c by making it expect matches on sequential lines, same way as __jited operates. 
For example: __xlated("1: *(u64 *)(r10 -16) = r1") ;; matched on line N __xlated("3: r0 = &(void __percpu *)(r0)") ;; matched on line N+1 Also: __xlated("1: *(u64 *)(r10 -16) = r1") ;; matched on line N __xlated("...") ;; not matched __xlated("3: r0 = &(void __percpu *)(r0)") ;; mantched on any ;; line >= N Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_nocsr.c | 53 ++++++++++++++++++- tools/testing/selftests/bpf/test_loader.c | 8 ++- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_nocsr.c b/tools/testing/selftests/bpf/progs/verifier_nocsr.c index a7fe277e5167..666c736d196f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_nocsr.c +++ b/tools/testing/selftests/bpf/progs/verifier_nocsr.c @@ -78,6 +78,7 @@ __naked void canary_arm64_riscv64(void) SEC("raw_tp") __arch_x86_64 __xlated("1: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("3: exit") __success __naked void canary_zero_spills(void) @@ -94,7 +95,9 @@ SEC("raw_tp") __arch_x86_64 __log_level(4) __msg("stack depth 16") __xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r2 = *(u64 *)(r10 -16)") __success __naked void wrong_reg_in_pattern1(void) @@ -113,7 +116,9 @@ __naked void wrong_reg_in_pattern1(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -16) = r6") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r6 = *(u64 *)(r10 -16)") __success __naked void wrong_reg_in_pattern2(void) @@ -132,7 +137,9 @@ __naked void wrong_reg_in_pattern2(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -16) = r0") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r0 = *(u64 *)(r10 -16)") __success __naked void wrong_reg_in_pattern3(void) @@ -151,7 +158,9 @@ __naked void wrong_reg_in_pattern3(void) SEC("raw_tp") __arch_x86_64 __xlated("2: *(u64 *)(r2 -16) = r1") +__xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("6: r1 = *(u64 *)(r10 -16)") __success __naked void wrong_base_in_pattern(void) @@ -171,7 +180,9 @@ __naked void wrong_base_in_pattern(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r2 = 1") __success __naked void wrong_insn_in_pattern(void) @@ -191,7 +202,9 @@ __naked void wrong_insn_in_pattern(void) SEC("raw_tp") __arch_x86_64 __xlated("2: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("6: r1 = *(u64 *)(r10 -8)") __success __naked void wrong_off_in_pattern1(void) @@ -211,7 +224,9 @@ __naked void wrong_off_in_pattern1(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u32 *)(r10 -4) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u32 *)(r10 -4)") __success __naked void wrong_off_in_pattern2(void) @@ -230,7 +245,9 @@ __naked void wrong_off_in_pattern2(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u32 *)(r10 -16) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u32 *)(r10 -16)") __success __naked void wrong_size_in_pattern(void) @@ -249,7 +266,9 @@ __naked void wrong_size_in_pattern(void) SEC("raw_tp") __arch_x86_64 __xlated("2: *(u32 *)(r10 -8) = r1") 
+__xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("6: r1 = *(u32 *)(r10 -8)") __success __naked void partial_pattern(void) @@ -275,11 +294,15 @@ __xlated("1: r2 = 2") /* not patched, spills for -8, -16 not removed */ __xlated("2: *(u64 *)(r10 -8) = r1") __xlated("3: *(u64 *)(r10 -16) = r2") +__xlated("...") __xlated("5: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("7: r2 = *(u64 *)(r10 -16)") __xlated("8: r1 = *(u64 *)(r10 -8)") /* patched, spills for -24, -32 removed */ +__xlated("...") __xlated("10: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("12: exit") __success __naked void min_stack_offset(void) @@ -308,7 +331,9 @@ __naked void min_stack_offset(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_fixed_read(void) @@ -330,7 +355,9 @@ __naked void bad_fixed_read(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_fixed_write(void) @@ -352,7 +379,9 @@ __naked void bad_fixed_write(void) SEC("raw_tp") __arch_x86_64 __xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("10: r1 = *(u64 *)(r10 -16)") __success __naked void bad_varying_read(void) @@ -379,7 +408,9 @@ __naked void bad_varying_read(void) SEC("raw_tp") __arch_x86_64 __xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("10: r1 = *(u64 *)(r10 -16)") __success __naked void bad_varying_write(void) @@ -406,7 +437,9 @@ __naked void bad_varying_write(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_write_in_subprog(void) @@ -438,7 +471,9 @@ __naked static void bad_write_in_subprog_aux(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_helper_write(void) @@ -466,13 +501,19 @@ SEC("raw_tp") __arch_x86_64 /* main, not patched */ __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") +__xlated("...") __xlated("9: call pc+1") +__xlated("...") __xlated("10: exit") /* subprogram, patched */ __xlated("11: r1 = 1") +__xlated("...") __xlated("13: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("15: exit") __success __naked void invalidate_one_subprog(void) @@ -510,12 +551,16 @@ SEC("raw_tp") __arch_x86_64 /* main */ __xlated("0: r1 = 1") +__xlated("...") __xlated("2: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("4: call pc+1") __xlated("5: exit") /* subprogram */ __xlated("6: r1 = 1") +__xlated("...") __xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("10: *(u64 *)(r10 -16) = r1") __xlated("11: exit") __success @@ -576,7 +621,9 @@ __log_level(4) __msg("stack depth 16") /* may_goto counter at -16 */ __xlated("0: *(u64 *)(r10 -16) =") __xlated("1: r1 = 1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") /* may_goto expansion starts */ __xlated("5: r11 = *(u64 
*)(r10 -16)") __xlated("6: if r11 == 0x0 goto pc+3") @@ -623,13 +670,15 @@ __xlated("5: r0 = *(u32 *)(r0 +0)") __xlated("6: r2 =") __xlated("7: r3 = 0") __xlated("8: r4 = 0") +__xlated("...") /* ... part of the inlined bpf_loop */ __xlated("12: *(u64 *)(r10 -32) = r6") __xlated("13: *(u64 *)(r10 -24) = r7") __xlated("14: *(u64 *)(r10 -16) = r8") -/* ... */ +__xlated("...") __xlated("21: call pc+8") /* dummy_loop_callback */ /* ... last insns of the bpf_loop_interaction1 */ +__xlated("...") __xlated("28: r0 = 0") __xlated("29: exit") /* dummy_loop_callback */ @@ -670,7 +719,7 @@ __xlated("5: r0 = *(u32 *)(r0 +0)") __xlated("6: *(u64 *)(r10 -16) = r1") __xlated("7: call") __xlated("8: r1 = *(u64 *)(r10 -16)") -/* ... */ +__xlated("...") /* ... part of the inlined bpf_loop */ __xlated("15: *(u64 *)(r10 -40) = r6") __xlated("16: *(u64 *)(r10 -32) = r7") diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index d588c612ac03..b229dd013355 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -365,6 +365,8 @@ static int parse_test_spec(struct test_loader *tester, const char *description = NULL; bool has_unpriv_result = false; bool has_unpriv_retval = false; + bool unpriv_xlated_on_next_line = true; + bool xlated_on_next_line = true; bool unpriv_jit_on_next_line; bool jit_on_next_line; bool collect_jit = false; @@ -461,12 +463,14 @@ static int parse_test_spec(struct test_loader *tester, spec->mode_mask |= UNPRIV; } } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { - err = push_msg(msg, &spec->priv.expect_xlated); + err = push_disasm_msg(msg, &xlated_on_next_line, + &spec->priv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV))) { - err = push_msg(msg, &spec->unpriv.expect_xlated); + err = push_disasm_msg(msg, &unpriv_xlated_on_next_line, + &spec->unpriv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= UNPRIV; From b6ab50902724a27f1fc7136927c27d29f9ba01c6 Mon Sep 17 00:00:00 2001 From: Yu Jiaoliang Date: Wed, 21 Aug 2024 15:37:08 +0800 Subject: [PATCH 1002/1052] bpf: Use kmemdup_array instead of kmemdup for multiple allocation Let the kmemdup_array() take care about multiplication and possible overflows. Signed-off-by: Yu Jiaoliang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240821073709.4067177-1-yujiaoliang@vivo.com --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 78a6f746ea0b..ecf2ddf633bf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1265,8 +1265,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. 
*/ - old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), - GFP_KERNEL | __GFP_NOWARN); + old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter), + GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; From 3d2786d65aaa954ebd3fcc033ada433e10da21c4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:01:23 -0700 Subject: [PATCH 1003/1052] bpf: correctly handle malformed BPF_CORE_TYPE_ID_LOCAL relos In case of malformed relocation record of kind BPF_CORE_TYPE_ID_LOCAL referencing a non-existing BTF type, function bpf_core_calc_relo_insn would cause a null pointer deference. Fix this by adding a proper check upper in call stack, as malformed relocation records could be passed from user space. Simplest reproducer is a program: r0 = 0 exit With a single relocation record: .insn_off = 0, /* patch first instruction */ .type_id = 100500, /* this type id does not exist */ .access_str_off = 6, /* offset of string "0" */ .kind = BPF_CORE_TYPE_ID_LOCAL, See the link for original reproducer or next commit for a test case. Fixes: 74753e1462e7 ("libbpf: Replace btf__type_by_id() with btf_type_by_id().") Reported-by: Liu RuiTong Closes: https://lore.kernel.org/bpf/CAK55_s6do7C+DVwbwY_7nKfUz0YLDoiA1v6X3Y9+p0sWzipFSA@mail.gmail.com/ Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822080124.2995724-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c9338fb397fc..5de424d3a795 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8910,6 +8910,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, struct bpf_core_cand_list cands = {}; struct bpf_core_relo_res targ_res; struct bpf_core_spec *specs; + const struct btf_type *type; int err; /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" @@ -8919,6 +8920,13 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, if (!specs) return -ENOMEM; + type = btf_type_by_id(ctx->btf, relo->type_id); + if (!type) { + bpf_log(ctx->log, "relo #%u: bad type id %u\n", + relo_idx, relo->type_id); + return -EINVAL; + } + if (need_cands) { struct bpf_cand_cache *cc; int i; From 110bbd3a2ed71f3cf4d9438d62f22f591952fc3b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:01:24 -0700 Subject: [PATCH 1004/1052] selftests/bpf: test for malformed BPF_CORE_TYPE_ID_LOCAL relocation Check that verifier rejects BPF program containing relocation pointing to non-existent BTF type. To force relocation resolution on kernel side test case uses bpf_attr->core_relos field. This field is not exposed by libbpf, so directly do BPF system call in the test. 
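For reference, the records wired into bpf_attr->core_relos follow the UAPI
struct bpf_core_relo layout from include/uapi/linux/bpf.h (field comments below are
paraphrased, not the kernel's own):

    struct bpf_core_relo {
            __u32 insn_off;                 /* instruction the relocation patches */
            __u32 type_id;                  /* BTF type id the access spec is relative to */
            __u32 access_str_off;           /* offset of the access spec string in BTF strings */
            enum bpf_core_relo_kind kind;   /* e.g. BPF_CORE_TYPE_ID_LOCAL */
    };

The test below fills in one such record whose type_id deliberately names a BTF type
that does not exist, and expects program load to fail with the new error message.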
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822080124.2995724-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/core_reloc_raw.c | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c b/tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c new file mode 100644 index 000000000000..a18d3680fb16 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Test cases that can't load programs using libbpf and need direct + * BPF syscall access + */ + +#include +#include +#include + +#include "test_progs.h" +#include "test_btf.h" +#include "bpf/libbpf_internal.h" + +static char log[16 * 1024]; + +/* Check that verifier rejects BPF program containing relocation + * pointing to non-existent BTF type. + */ +static void test_bad_local_id(void) +{ + struct test_btf { + struct btf_header hdr; + __u32 types[15]; + char strings[128]; + } raw_btf = { + .hdr = { + .magic = BTF_MAGIC, + .version = BTF_VERSION, + .hdr_len = sizeof(struct btf_header), + .type_off = 0, + .type_len = sizeof(raw_btf.types), + .str_off = offsetof(struct test_btf, strings) - + offsetof(struct test_btf, types), + .str_len = sizeof(raw_btf.strings), + }, + .types = { + BTF_PTR_ENC(0), /* [1] void* */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [2] int */ + BTF_FUNC_PROTO_ENC(2, 1), /* [3] int (*)(void*) */ + BTF_FUNC_PROTO_ARG_ENC(8, 1), + BTF_FUNC_ENC(8, 3) /* [4] FUNC 'foo' type_id=2 */ + }, + .strings = "\0int\0 0\0foo\0" + }; + __u32 log_level = 1 | 2 | 4; + LIBBPF_OPTS(bpf_btf_load_opts, opts, + .log_buf = log, + .log_size = sizeof(log), + .log_level = log_level, + ); + struct bpf_insn insns[] = { + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + struct bpf_func_info funcs[] = { + { + .insn_off = 0, + .type_id = 4, + } + }; + struct bpf_core_relo relos[] = { + { + .insn_off = 0, /* patch first instruction (r0 = 0) */ + .type_id = 100500, /* !!! 
this type id does not exist */ + .access_str_off = 6, /* offset of "0" */ + .kind = BPF_CORE_TYPE_ID_LOCAL, + } + }; + union bpf_attr attr; + int saved_errno; + int prog_fd = -1; + int btf_fd = -1; + + btf_fd = bpf_btf_load(&raw_btf, sizeof(raw_btf), &opts); + saved_errno = errno; + if (btf_fd < 0 || env.verbosity > VERBOSE_NORMAL) { + printf("-------- BTF load log start --------\n"); + printf("%s", log); + printf("-------- BTF load log end ----------\n"); + } + if (btf_fd < 0) { + PRINT_FAIL("bpf_btf_load() failed, errno=%d\n", saved_errno); + return; + } + + log[0] = 0; + memset(&attr, 0, sizeof(attr)); + attr.prog_btf_fd = btf_fd; + attr.prog_type = BPF_TRACE_RAW_TP; + attr.license = (__u64)"GPL"; + attr.insns = (__u64)&insns; + attr.insn_cnt = sizeof(insns) / sizeof(*insns); + attr.log_buf = (__u64)log; + attr.log_size = sizeof(log); + attr.log_level = log_level; + attr.func_info = (__u64)funcs; + attr.func_info_cnt = sizeof(funcs) / sizeof(*funcs); + attr.func_info_rec_size = sizeof(*funcs); + attr.core_relos = (__u64)relos; + attr.core_relo_cnt = sizeof(relos) / sizeof(*relos); + attr.core_relo_rec_size = sizeof(*relos); + prog_fd = sys_bpf_prog_load(&attr, sizeof(attr), 1); + saved_errno = errno; + if (prog_fd < 0 || env.verbosity > VERBOSE_NORMAL) { + printf("-------- program load log start --------\n"); + printf("%s", log); + printf("-------- program load log end ----------\n"); + } + if (prog_fd >= 0) { + PRINT_FAIL("sys_bpf_prog_load() expected to fail\n"); + goto out; + } + ASSERT_HAS_SUBSTR(log, "relo #0: bad type id 100500", "program load log"); + +out: + close(prog_fd); + close(btf_fd); +} + +void test_core_reloc_raw(void) +{ + if (test__start_subtest("bad_local_id")) + test_bad_local_id(); +} From 6d641ca50d7ec7d5e4e889c3f8ea22afebc2a403 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 11 Aug 2024 18:13:33 +0200 Subject: [PATCH 1005/1052] bpf: Fix percpu address space issues In arraymap.c: In bpf_array_map_seq_start() and bpf_array_map_seq_next() cast return values from the __percpu address space to the generic address space via uintptr_t [1]. Correct the declaration of pptr pointer in __bpf_array_map_seq_show() to void __percpu * and cast the value from the generic address space to the __percpu address space via uintptr_t [1]. In hashtab.c: Assign the return value from bpf_mem_cache_alloc() to void pointer and cast the value to void __percpu ** (void pointer to percpu void pointer) before dereferencing. In memalloc.c: Explicitly declare __percpu variables. Cast obj to void __percpu **. In helpers.c: Cast ptr in BPF_CALL_1 and BPF_CALL_2 from generic address space to __percpu address space via const uintptr_t [1]. Found by GCC's named address space checks. There were no changes in the resulting object files. 
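A minimal sketch of the conversion idiom the hunks below rely on (illustration only,
not part of the patch): sparse and GCC's named-address-space checker treat __percpu
pointers as living in a separate address space, so converting between them and generic
pointers goes through an integer type instead of a direct pointer cast:

    /* Kernel context assumed: __percpu comes from <linux/compiler_types.h>,
     * which kbuild force-includes for kernel C files.
     */
    #include <linux/types.h>        /* uintptr_t */

    /* generic -> __percpu, as in bpf_per_cpu_ptr()/bpf_this_cpu_ptr() */
    static inline void __percpu *to_percpu(void *p)
    {
            return (void __percpu *)(uintptr_t)p;
    }

    /* __percpu -> generic, as in the array map iterator in arraymap.c */
    static inline void *from_percpu(void __percpu *p)
    {
            return (void *)(uintptr_t)p;
    }
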
[1] https://sparse.docs.kernel.org/en/latest/annotations.html#address-space-name Signed-off-by: Uros Bizjak Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Eduard Zingerman Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240811161414.56744-1-ubizjak@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 8 ++++---- kernel/bpf/hashtab.c | 9 +++++---- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/memalloc.c | 12 ++++++------ 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 188e3c2effb2..a43e62e2a8bb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -600,7 +600,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) - return array->pptrs[index]; + return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } @@ -619,7 +619,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) - return array->pptrs[index]; + return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } @@ -632,7 +632,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) struct bpf_iter_meta meta; struct bpf_prog *prog; int off = 0, cpu = 0; - void __percpu **pptr; + void __percpu *pptr; u32 size; meta.seq = seq; @@ -648,7 +648,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) if (!info->percpu_value_buf) { ctx.value = v; } else { - pptr = v; + pptr = (void __percpu *)(uintptr_t)v; size = array->elem_size; for_each_possible_cpu(cpu) { copy_map_value_long(map, info->percpu_value_buf + off, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index be1f64c20125..45c7195b65ba 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1049,14 +1049,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); - if (!pptr) { + void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma); + + if (!ptr) { bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } - l_new->ptr_to_pptr = pptr; - pptr = *(void **)pptr; + l_new->ptr_to_pptr = ptr; + pptr = *(void __percpu **)ptr; } pcpu_init_value(htab, pptr, value, onallcpus); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 12e3aa40b180..ccca6fe0367c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -715,7 +715,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) if (cpu >= nr_cpu_ids) return (unsigned long)NULL; - return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); + return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu); } const struct bpf_func_proto bpf_per_cpu_ptr_proto = { @@ -728,7 +728,7 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = { BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) { - return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); + return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr); } const struct bpf_func_proto 
bpf_this_cpu_ptr_proto = { diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index dec892ded031..b3858a76e0b3 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -138,8 +138,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head) static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) { if (c->percpu_size) { - void **obj = kmalloc_node(c->percpu_size, flags, node); - void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); + void __percpu **obj = kmalloc_node(c->percpu_size, flags, node); + void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); if (!obj || !pptr) { free_percpu(pptr); @@ -253,7 +253,7 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) static void free_one(void *obj, bool percpu) { if (percpu) { - free_percpu(((void **)obj)[1]); + free_percpu(((void __percpu **)obj)[1]); kfree(obj); return; } @@ -509,8 +509,8 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { - struct bpf_mem_caches *cc, __percpu *pcc; - struct bpf_mem_cache *c, __percpu *pc; + struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; + struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc; struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; @@ -591,7 +591,7 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) { - struct bpf_mem_caches *cc, __percpu *pcc; + struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; int cpu, i, unit_size, percpu_size; struct obj_cgroup *objcg; struct bpf_mem_cache *c; From ae010757a55b57c8b82628e8ea9b7da2269131d9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:07 -0700 Subject: [PATCH 1006/1052] bpf: rename nocsr -> bpf_fastcall in verifier Attribute used by LLVM implementation of the feature had been changed from no_caller_saved_registers to bpf_fastcall (see [1]). This commit replaces references to nocsr by references to bpf_fastcall to keep LLVM and Kernel parts in sync. 
[1] https://github.com/llvm/llvm-project/pull/105417 Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +- include/linux/bpf_verifier.h | 18 ++--- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 143 +++++++++++++++++------------------ 4 files changed, 84 insertions(+), 85 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f0192c173ed8..00dc4dd28cbd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -808,12 +808,12 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; bool might_sleep; - /* set to true if helper follows contract for gcc/llvm - * attribute no_caller_saved_registers: + /* set to true if helper follows contract for llvm + * attribute bpf_fastcall: * - void functions do not scratch r0 * - functions taking N arguments scratch only registers r1-rN */ - bool allow_nocsr; + bool allow_fastcall; enum bpf_return_type ret_type; union { struct { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5cea15c81b8a..634a302a39e3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -577,13 +577,13 @@ struct bpf_insn_aux_data { bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ /* true if STX or LDX instruction is a part of a spill/fill - * pattern for a no_caller_saved_registers call. + * pattern for a bpf_fastcall call. */ - u8 nocsr_pattern:1; + u8 fastcall_pattern:1; /* for CALL instructions, a number of spill/fill pairs in the - * no_caller_saved_registers pattern. + * bpf_fastcall pattern. */ - u8 nocsr_spills_num:3; + u8 fastcall_spills_num:3; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ @@ -653,10 +653,10 @@ struct bpf_subprog_info { u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; - /* offsets in range [stack_depth .. nocsr_stack_off) - * are used for no_caller_saved_registers spills and fills. + /* offsets in range [stack_depth .. fastcall_stack_off) + * are used for bpf_fastcall spills and fills. 
*/ - s16 nocsr_stack_off; + s16 fastcall_stack_off; bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; @@ -664,8 +664,8 @@ struct bpf_subprog_info { bool is_async_cb: 1; bool is_exception_cb: 1; bool args_cached: 1; - /* true if nocsr stack region is used by functions that can't be inlined */ - bool keep_nocsr_stack: 1; + /* true if bpf_fastcall stack region is used by functions that can't be inlined */ + bool keep_fastcall_stack: 1; u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ccca6fe0367c..4105b4af6e03 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -158,7 +158,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .func = bpf_get_smp_processor_id, .gpl_only = false, .ret_type = RET_INTEGER, - .allow_nocsr = true, + .allow_fastcall = true, }; BPF_CALL_0(bpf_get_numa_node_id) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 920d7c5fe944..0dfd91f36417 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4579,28 +4579,28 @@ static int get_reg_width(struct bpf_reg_state *reg) return fls64(reg->umax_value); } -/* See comment for mark_nocsr_pattern_for_call() */ -static void check_nocsr_stack_contract(struct bpf_verifier_env *env, struct bpf_func_state *state, - int insn_idx, int off) +/* See comment for mark_fastcall_pattern_for_call() */ +static void check_fastcall_stack_contract(struct bpf_verifier_env *env, + struct bpf_func_state *state, int insn_idx, int off) { struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; struct bpf_insn_aux_data *aux = env->insn_aux_data; int i; - if (subprog->nocsr_stack_off <= off || aux[insn_idx].nocsr_pattern) + if (subprog->fastcall_stack_off <= off || aux[insn_idx].fastcall_pattern) return; - /* access to the region [max_stack_depth .. nocsr_stack_off) - * from something that is not a part of the nocsr pattern, - * disable nocsr rewrites for current subprogram by setting - * nocsr_stack_off to a value smaller than any possible offset. + /* access to the region [max_stack_depth .. fastcall_stack_off) + * from something that is not a part of the fastcall pattern, + * disable fastcall rewrites for current subprogram by setting + * fastcall_stack_off to a value smaller than any possible offset. */ - subprog->nocsr_stack_off = S16_MIN; - /* reset nocsr aux flags within subprogram, + subprog->fastcall_stack_off = S16_MIN; + /* reset fastcall aux flags within subprogram, * happens at most once per subprogram */ for (i = subprog->start; i < (subprog + 1)->start; ++i) { - aux[i].nocsr_spills_num = 0; - aux[i].nocsr_pattern = 0; + aux[i].fastcall_spills_num = 0; + aux[i].fastcall_pattern = 0; } } @@ -4652,7 +4652,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, if (err) return err; - check_nocsr_stack_contract(env, state, insn_idx, off); + check_fastcall_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { bool reg_value_fits; @@ -4787,7 +4787,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, return err; } - check_nocsr_stack_contract(env, state, insn_idx, min_off); + check_fastcall_stack_contract(env, state, insn_idx, min_off); /* Variable offset writes destroy any spilled pointers in range. 
*/ for (i = min_off; i < max_off; i++) { u8 new_type, *stype; @@ -4926,7 +4926,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, reg = ®_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); - check_nocsr_stack_contract(env, state, env->insn_idx, off); + check_fastcall_stack_contract(env, state, env->insn_idx, off); if (is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; @@ -5087,7 +5087,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, min_off = reg->smin_value + off; max_off = reg->smax_value + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); - check_nocsr_stack_contract(env, ptr_state, env->insn_idx, min_off); + check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; } @@ -6804,13 +6804,13 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; int min_valid_off, max_bpf_stack; - /* If accessing instruction is a spill/fill from nocsr pattern, + /* If accessing instruction is a spill/fill from bpf_fastcall pattern, * add room for all caller saved registers below MAX_BPF_STACK. - * In case if nocsr rewrite won't happen maximal stack depth + * In case if bpf_fastcall rewrite won't happen maximal stack depth * would be checked by check_max_stack_depth_subprog(). */ max_bpf_stack = MAX_BPF_STACK; - if (aux->nocsr_pattern) + if (aux->fastcall_pattern) max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; if (t == BPF_WRITE || env->allow_uninit_stack) @@ -16118,12 +16118,12 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, /* Return a bitmask specifying which caller saved registers are * clobbered by a call to a helper *as if* this helper follows - * no_caller_saved_registers contract: + * bpf_fastcall contract: * - includes R0 if function is non-void; * - includes R1-R5 if corresponding parameter has is described * in the function prototype. */ -static u32 helper_nocsr_clobber_mask(const struct bpf_func_proto *fn) +static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) { u8 mask; int i; @@ -16138,8 +16138,8 @@ static u32 helper_nocsr_clobber_mask(const struct bpf_func_proto *fn) } /* True if do_misc_fixups() replaces calls to helper number 'imm', - * replacement patch is presumed to follow no_caller_saved_registers contract - * (see mark_nocsr_pattern_for_call() below). + * replacement patch is presumed to follow bpf_fastcall contract + * (see mark_fastcall_pattern_for_call() below). */ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) { @@ -16153,7 +16153,7 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -/* GCC and LLVM define a no_caller_saved_registers function attribute. +/* LLVM define a bpf_fastcall function attribute. * This attribute means that function scratches only some of * the caller saved registers defined by ABI. 
* For BPF the set of such registers could be defined as follows: @@ -16163,13 +16163,12 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) * * The contract between kernel and clang allows to simultaneously use * such functions and maintain backwards compatibility with old - * kernels that don't understand no_caller_saved_registers calls - * (nocsr for short): + * kernels that don't understand bpf_fastcall calls: * - * - for nocsr calls clang allocates registers as-if relevant r0-r5 + * - for bpf_fastcall calls clang allocates registers as-if relevant r0-r5 * registers are not scratched by the call; * - * - as a post-processing step, clang visits each nocsr call and adds + * - as a post-processing step, clang visits each bpf_fastcall call and adds * spill/fill for every live r0-r5; * * - stack offsets used for the spill/fill are allocated as lowest @@ -16177,11 +16176,11 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) * purposes; * * - when kernel loads a program, it looks for such patterns - * (nocsr function surrounded by spills/fills) and checks if - * spill/fill stack offsets are used exclusively in nocsr patterns; + * (bpf_fastcall function surrounded by spills/fills) and checks if + * spill/fill stack offsets are used exclusively in fastcall patterns; * * - if so, and if verifier or current JIT inlines the call to the - * nocsr function (e.g. a helper call), kernel removes unnecessary + * bpf_fastcall function (e.g. a helper call), kernel removes unnecessary * spill/fill pairs; * * - when old kernel loads a program, presence of spill/fill pairs @@ -16200,22 +16199,22 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) * r0 += r2; * exit; * - * The purpose of mark_nocsr_pattern_for_call is to: + * The purpose of mark_fastcall_pattern_for_call is to: * - look for such patterns; - * - mark spill and fill instructions in env->insn_aux_data[*].nocsr_pattern; - * - mark set env->insn_aux_data[*].nocsr_spills_num for call instruction; - * - update env->subprog_info[*]->nocsr_stack_off to find an offset - * at which nocsr spill/fill stack slots start; - * - update env->subprog_info[*]->keep_nocsr_stack. + * - mark spill and fill instructions in env->insn_aux_data[*].fastcall_pattern; + * - mark set env->insn_aux_data[*].fastcall_spills_num for call instruction; + * - update env->subprog_info[*]->fastcall_stack_off to find an offset + * at which bpf_fastcall spill/fill stack slots start; + * - update env->subprog_info[*]->keep_fastcall_stack. * - * The .nocsr_pattern and .nocsr_stack_off are used by - * check_nocsr_stack_contract() to check if every stack access to - * nocsr spill/fill stack slot originates from spill/fill - * instructions, members of nocsr patterns. + * The .fastcall_pattern and .fastcall_stack_off are used by + * check_fastcall_stack_contract() to check if every stack access to + * fastcall spill/fill stack slot originates from spill/fill + * instructions, members of fastcall patterns. * - * If such condition holds true for a subprogram, nocsr patterns could - * be rewritten by remove_nocsr_spills_fills(). - * Otherwise nocsr patterns are not changed in the subprogram + * If such condition holds true for a subprogram, fastcall patterns could + * be rewritten by remove_fastcall_spills_fills(). + * Otherwise bpf_fastcall patterns are not changed in the subprogram * (code, presumably, generated by an older clang version). 
* * For example, it is *not* safe to remove spill/fill below: @@ -16228,9 +16227,9 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) * r0 += r1; exit; * exit; */ -static void mark_nocsr_pattern_for_call(struct bpf_verifier_env *env, - struct bpf_subprog_info *subprog, - int insn_idx, s16 lowest_off) +static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, + struct bpf_subprog_info *subprog, + int insn_idx, s16 lowest_off) { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; @@ -16245,8 +16244,8 @@ static void mark_nocsr_pattern_for_call(struct bpf_verifier_env *env, if (get_helper_proto(env, call->imm, &fn) < 0) /* error would be reported later */ return; - clobbered_regs_mask = helper_nocsr_clobber_mask(fn); - can_be_inlined = fn->allow_nocsr && + clobbered_regs_mask = helper_fastcall_clobber_mask(fn); + can_be_inlined = fn->allow_fastcall && (verifier_inlines_helper_call(env, call->imm) || bpf_jit_inlines_helper_call(call->imm)); } @@ -16289,36 +16288,36 @@ static void mark_nocsr_pattern_for_call(struct bpf_verifier_env *env, if (stx->off != off || ldx->off != off) break; expected_regs_mask &= ~BIT(stx->src_reg); - env->insn_aux_data[insn_idx - i].nocsr_pattern = 1; - env->insn_aux_data[insn_idx + i].nocsr_pattern = 1; + env->insn_aux_data[insn_idx - i].fastcall_pattern = 1; + env->insn_aux_data[insn_idx + i].fastcall_pattern = 1; } if (i == 1) return; - /* Conditionally set 'nocsr_spills_num' to allow forward + /* Conditionally set 'fastcall_spills_num' to allow forward * compatibility when more helper functions are marked as - * nocsr at compile time than current kernel supports, e.g: + * bpf_fastcall at compile time than current kernel supports, e.g: * * 1: *(u64 *)(r10 - 8) = r1 - * 2: call A ;; assume A is nocsr for current kernel + * 2: call A ;; assume A is bpf_fastcall for current kernel * 3: r1 = *(u64 *)(r10 - 8) * 4: *(u64 *)(r10 - 8) = r1 - * 5: call B ;; assume B is not nocsr for current kernel + * 5: call B ;; assume B is not bpf_fastcall for current kernel * 6: r1 = *(u64 *)(r10 - 8) * - * There is no need to block nocsr rewrite for such program. - * Set 'nocsr_pattern' for both calls to keep check_nocsr_stack_contract() happy, - * don't set 'nocsr_spills_num' for call B so that remove_nocsr_spills_fills() + * There is no need to block bpf_fastcall rewrite for such program. + * Set 'fastcall_pattern' for both calls to keep check_fastcall_stack_contract() happy, + * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. 
*/ if (can_be_inlined) - env->insn_aux_data[insn_idx].nocsr_spills_num = i - 1; + env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else - subprog->keep_nocsr_stack = 1; - subprog->nocsr_stack_off = min(subprog->nocsr_stack_off, off); + subprog->keep_fastcall_stack = 1; + subprog->fastcall_stack_off = min(subprog->fastcall_stack_off, off); } -static int mark_nocsr_patterns(struct bpf_verifier_env *env) +static int mark_fastcall_patterns(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn; @@ -16335,12 +16334,12 @@ static int mark_nocsr_patterns(struct bpf_verifier_env *env) continue; lowest_off = min(lowest_off, insn->off); } - /* use this offset to find nocsr patterns */ + /* use this offset to find fastcall patterns */ for (i = subprog->start; i < (subprog + 1)->start; ++i) { insn = env->prog->insnsi + i; if (insn->code != (BPF_JMP | BPF_CALL)) continue; - mark_nocsr_pattern_for_call(env, subprog, i, lowest_off); + mark_fastcall_pattern_for_call(env, subprog, i, lowest_off); } } return 0; @@ -21244,10 +21243,10 @@ static int optimize_bpf_loop(struct bpf_verifier_env *env) return 0; } -/* Remove unnecessary spill/fill pairs, members of nocsr pattern, +/* Remove unnecessary spill/fill pairs, members of fastcall pattern, * adjust subprograms stack depth when possible. */ -static int remove_nocsr_spills_fills(struct bpf_verifier_env *env) +static int remove_fastcall_spills_fills(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn_aux_data *aux = env->insn_aux_data; @@ -21258,8 +21257,8 @@ static int remove_nocsr_spills_fills(struct bpf_verifier_env *env) int i, j; for (i = 0; i < insn_cnt; i++, insn++) { - if (aux[i].nocsr_spills_num > 0) { - spills_num = aux[i].nocsr_spills_num; + if (aux[i].fastcall_spills_num > 0) { + spills_num = aux[i].fastcall_spills_num; /* NOPs would be removed by opt_remove_nops() */ for (j = 1; j <= spills_num; ++j) { *(insn - j) = NOP; @@ -21268,8 +21267,8 @@ static int remove_nocsr_spills_fills(struct bpf_verifier_env *env) modified = true; } if ((subprog + 1)->start == i + 1) { - if (modified && !subprog->keep_nocsr_stack) - subprog->stack_depth = -subprog->nocsr_stack_off; + if (modified && !subprog->keep_fastcall_stack) + subprog->stack_depth = -subprog->fastcall_stack_off; subprog++; modified = false; } @@ -22192,7 +22191,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = mark_nocsr_patterns(env); + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; @@ -22209,7 +22208,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 * allocate additional slots. */ if (ret == 0) - ret = remove_nocsr_spills_fills(env); + ret = remove_fastcall_spills_fills(env); if (ret == 0) ret = check_max_stack_depth(env); From adec67d372fecd97585d76dbebb610d62ec9d2cc Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:08 -0700 Subject: [PATCH 1007/1052] selftests/bpf: rename nocsr -> bpf_fastcall in selftests Attribute used by LLVM implementation of the feature had been changed from no_caller_saved_registers to bpf_fastcall (see [1]). This commit replaces references to nocsr by references to bpf_fastcall to keep LLVM and selftests parts in sync. 
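For orientation (not part of this patch), a declaration opting into the renamed attribute at the source level would look roughly like the sketch below; the function name is a placeholder, only the attribute spelling follows the rename described in [1]:

   /* hypothetical kfunc declaration using the renamed attribute */
   extern __u64 bpf_some_kfunc(void *p) __attribute__((bpf_fastcall)) __ksym;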
[1] https://github.com/llvm/llvm-project/pull/105417 Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 4 +-- ...rifier_nocsr.c => verifier_bpf_fastcall.c} | 26 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) rename tools/testing/selftests/bpf/progs/{verifier_nocsr.c => verifier_bpf_fastcall.c} (95%) diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index cf3662dbd24f..80a90c627182 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -53,7 +53,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" -#include "verifier_nocsr.skel.h" +#include "verifier_bpf_fastcall.skel.h" #include "verifier_or_jmp32_k.skel.h" #include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" @@ -177,7 +177,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } -void test_verifier_nocsr(void) { RUN(verifier_nocsr); } +void test_verifier_bpf_fastcall(void) { RUN(verifier_bpf_fastcall); } void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); } void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } diff --git a/tools/testing/selftests/bpf/progs/verifier_nocsr.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c similarity index 95% rename from tools/testing/selftests/bpf/progs/verifier_nocsr.c rename to tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index 666c736d196f..e30ab9fe5096 100644 --- a/tools/testing/selftests/bpf/progs/verifier_nocsr.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -39,7 +39,7 @@ __naked void simple(void) : __clobber_all); } -/* The logic for detecting and verifying nocsr pattern is the same for +/* The logic for detecting and verifying bpf_fastcall pattern is the same for * any arch, however x86 differs from arm64 or riscv64 in a way * bpf_get_smp_processor_id is rewritten: * - on x86 it is done by verifier @@ -52,7 +52,7 @@ __naked void simple(void) * * It is really desirable to check instruction indexes in the xlated * patterns, so add this canary test to check that function rewrite by - * jit is correctly processed by nocsr logic, keep the rest of the + * jit is correctly processed by bpf_fastcall logic, keep the rest of the * tests as x86. 
*/ SEC("raw_tp") @@ -463,7 +463,7 @@ __naked static void bad_write_in_subprog_aux(void) { asm volatile ( "r0 = 1;" - "*(u64 *)(r1 - 0) = r0;" /* invalidates nocsr contract for caller: */ + "*(u64 *)(r1 - 0) = r0;" /* invalidates bpf_fastcall contract for caller: */ "exit;" /* caller stack at -8 used outside of the pattern */ ::: __clobber_all); } @@ -480,7 +480,7 @@ __naked void bad_helper_write(void) { asm volatile ( "r1 = 1;" - /* nocsr pattern with stack offset -8 */ + /* bpf_fastcall pattern with stack offset -8 */ "*(u64 *)(r10 - 8) = r1;" "call %[bpf_get_smp_processor_id];" "r1 = *(u64 *)(r10 - 8);" @@ -488,7 +488,7 @@ __naked void bad_helper_write(void) "r1 += -8;" "r2 = 1;" "r3 = 42;" - /* read dst is fp[-8], thus nocsr rewrite not applied */ + /* read dst is fp[-8], thus bpf_fastcall rewrite not applied */ "call %[bpf_probe_read_kernel];" "exit;" : @@ -598,7 +598,7 @@ __arch_x86_64 __log_level(4) __msg("stack depth 8") __xlated("2: r0 = &(void __percpu *)(r0)") __success -__naked void helper_call_does_not_prevent_nocsr(void) +__naked void helper_call_does_not_prevent_bpf_fastcall(void) { asm volatile ( "r1 = 1;" @@ -689,7 +689,7 @@ __naked int bpf_loop_interaction1(void) { asm volatile ( "r1 = 1;" - /* nocsr stack region at -16, but could be removed */ + /* bpf_fastcall stack region at -16, but could be removed */ "*(u64 *)(r10 - 16) = r1;" "call %[bpf_get_smp_processor_id];" "r1 = *(u64 *)(r10 - 16);" @@ -729,7 +729,7 @@ __naked int bpf_loop_interaction2(void) { asm volatile ( "r1 = 42;" - /* nocsr stack region at -16, cannot be removed */ + /* bpf_fastcall stack region at -16, cannot be removed */ "*(u64 *)(r10 - 16) = r1;" "call %[bpf_get_smp_processor_id];" "r1 = *(u64 *)(r10 - 16);" @@ -759,8 +759,8 @@ __msg("stack depth 512+0") __xlated("r0 = &(void __percpu *)(r0)") __success /* cumulative_stack_depth() stack usage is MAX_BPF_STACK, - * called subprogram uses an additional slot for nocsr spill/fill, - * since nocsr spill/fill could be removed the program still fits + * called subprogram uses an additional slot for bpf_fastcall spill/fill, + * since bpf_fastcall spill/fill could be removed the program still fits * in MAX_BPF_STACK and should be accepted. */ __naked int cumulative_stack_depth(void) @@ -798,7 +798,7 @@ __xlated("3: r0 = &(void __percpu *)(r0)") __xlated("4: r0 = *(u32 *)(r0 +0)") __xlated("5: exit") __success -__naked int nocsr_max_stack_ok(void) +__naked int bpf_fastcall_max_stack_ok(void) { asm volatile( "r1 = 42;" @@ -820,7 +820,7 @@ __arch_x86_64 __log_level(4) __msg("stack depth 520") __failure -__naked int nocsr_max_stack_fail(void) +__naked int bpf_fastcall_max_stack_fail(void) { asm volatile( "r1 = 42;" @@ -828,7 +828,7 @@ __naked int nocsr_max_stack_fail(void) "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" "call %[bpf_get_smp_processor_id];" "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" - /* call to prandom blocks nocsr rewrite */ + /* call to prandom blocks bpf_fastcall rewrite */ "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" "call %[bpf_get_prandom_u32];" "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" From b2ee6d27e9c6be0409e96591dcee62032a8e0156 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:09 -0700 Subject: [PATCH 1008/1052] bpf: support bpf_fastcall patterns for kfuncs Recognize bpf_fastcall patterns around kfunc calls. 
For example, suppose bpf_cast_to_kern_ctx() follows bpf_fastcall contract (which it does), in such a case allow verifier to rewrite BPF program below: r2 = 1; *(u64 *)(r10 - 32) = r2; call %[bpf_cast_to_kern_ctx]; r2 = *(u64 *)(r10 - 32); r0 = r2; By removing the spill/fill pair: r2 = 1; call %[bpf_cast_to_kern_ctx]; r0 = r2; Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0dfd91f36417..94308cc7c503 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16125,7 +16125,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, */ static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) { - u8 mask; + u32 mask; int i; mask = 0; @@ -16153,6 +16153,26 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } +/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */ +static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) +{ + u32 vlen, i, mask; + + vlen = btf_type_vlen(meta->func_proto); + mask = 0; + if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type))) + mask |= BIT(BPF_REG_0); + for (i = 0; i < vlen; ++i) + mask |= BIT(BPF_REG_1 + i); + return mask; +} + +/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ +static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) +{ + return false; +} + /* LLVM define a bpf_fastcall function attribute. * This attribute means that function scratches only some of * the caller saved registers defined by ABI. @@ -16250,6 +16270,19 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, bpf_jit_inlines_helper_call(call->imm)); } + if (bpf_pseudo_kfunc_call(call)) { + struct bpf_kfunc_call_arg_meta meta; + int err; + + err = fetch_kfunc_meta(env, call, &meta, NULL); + if (err < 0) + /* error would be reported later */ + return; + + clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta); + can_be_inlined = is_fastcall_kfunc_call(&meta); + } + if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) return; From 40609093247b586ee6f8ca8f2b30c7d583c6fd25 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:10 -0700 Subject: [PATCH 1009/1052] bpf: allow bpf_fastcall for bpf_cast_to_kern_ctx and bpf_rdonly_cast do_misc_fixups() relaces bpf_cast_to_kern_ctx() and bpf_rdonly_cast() by a single instruction "r0 = r1". This follows bpf_fastcall contract. This commit allows bpf_fastcall pattern rewrite for these two functions in order to use them in bpf_fastcall selftests. 
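A rough C-level sketch of code that benefits (not from this patch; it assumes the vmlinux.h plus bpf_kfuncs.h setup used by the selftests later in this series, and hypothetical names):

   /* 'v' is live across the kfunc call, so clang emits a spill/fill
    * pair around it; once the call is replaced by "r0 = r1", this
    * patch lets the verifier drop that pair.
    */
   SEC("cgroup/getsockname_unix")
   int cast_ctx_example(struct bpf_sock_addr *ctx)
   {
           __u64 v = bpf_get_prandom_u32();
           struct bpf_sock_addr_kern *kctx;

           kctx = bpf_cast_to_kern_ctx(ctx);
           if (!kctx)
                   return 1;
           return v & 1;
   }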
Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 94308cc7c503..543b8c9edac7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16170,6 +16170,9 @@ static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) /* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) { + if (meta->btf == btf_vmlinux) + return meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]; return false; } From f406026fefa745ff9a3153507cc3b9a87371d954 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:11 -0700 Subject: [PATCH 1010/1052] selftests/bpf: by default use arch mask allowing all archs If test case does not specify architecture via __arch_* macro consider that it should be run for all architectures. Fixes: 7d743e4c759c ("selftests/bpf: __jited test tag to check disassembly after jit") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index b229dd013355..2ca9b73e5a6b 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -543,7 +543,7 @@ static int parse_test_spec(struct test_loader *tester, } } - spec->arch_mask = arch_mask; + spec->arch_mask = arch_mask ?: -1; if (spec->mode_mask == 0) spec->mode_mask = PRIV; From 8c2e043daadad021fc501ac64cce131f48c3ca46 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:12 -0700 Subject: [PATCH 1011/1052] selftests/bpf: check if bpf_fastcall is recognized for kfuncs Use kfunc_bpf_cast_to_kern_ctx() and kfunc_bpf_rdonly_cast() to verify that bpf_fastcall pattern is recognized for kfunc calls. 
Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_bpf_fastcall.c | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index e30ab9fe5096..9da97d2efcd9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -2,8 +2,11 @@ #include #include +#include #include "../../../include/linux/filter.h" #include "bpf_misc.h" +#include +#include "bpf_kfuncs.h" SEC("raw_tp") __arch_x86_64 @@ -842,4 +845,56 @@ __naked int bpf_fastcall_max_stack_fail(void) ); } +SEC("cgroup/getsockname_unix") +__xlated("0: r2 = 1") +/* bpf_cast_to_kern_ctx is replaced by a single assignment */ +__xlated("1: r0 = r1") +__xlated("2: r0 = r2") +__xlated("3: exit") +__success +__naked void kfunc_bpf_cast_to_kern_ctx(void) +{ + asm volatile ( + "r2 = 1;" + "*(u64 *)(r10 - 32) = r2;" + "call %[bpf_cast_to_kern_ctx];" + "r2 = *(u64 *)(r10 - 32);" + "r0 = r2;" + "exit;" + : + : __imm(bpf_cast_to_kern_ctx) + : __clobber_all); +} + +SEC("raw_tp") +__xlated("3: r3 = 1") +/* bpf_rdonly_cast is replaced by a single assignment */ +__xlated("4: r0 = r1") +__xlated("5: r0 = r3") +void kfunc_bpf_rdonly_cast(void) +{ + asm volatile ( + "r2 = %[btf_id];" + "r3 = 1;" + "*(u64 *)(r10 - 32) = r3;" + "call %[bpf_rdonly_cast];" + "r3 = *(u64 *)(r10 - 32);" + "r0 = r3;" + : + : __imm(bpf_rdonly_cast), + [btf_id]"r"(bpf_core_type_id_kernel(union bpf_attr)) + : __clobber_common); +} + +/* BTF FUNC records are not generated for kfuncs referenced + * from inline assembly. These records are necessary for + * libbpf to link the program. The function below is a hack + * to ensure that BTF FUNC records are generated. + */ +void kfunc_root(void) +{ + bpf_cast_to_kern_ctx(0); + bpf_rdonly_cast(0, 0); +} + char _license[] SEC("license") = "GPL"; From 7559a7a84ef83a2dd86caf623430b8d834843cec Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 28 Jul 2024 19:46:12 +0800 Subject: [PATCH 1012/1052] selftests/bpf: Add testcase for updating attached freplace prog to prog_array map Add a selftest to confirm the issue, which gets -EINVAL when update attached freplace prog to prog_array map, has been fixed. 
cd tools/testing/selftests/bpf; ./test_progs -t tailcalls 328/25 tailcalls/tailcall_freplace:OK 328 tailcalls:OK Summary: 1/25 PASSED, 0 SKIPPED, 0 FAILED Acked-by: Yonghong Song Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240728114612.48486-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tailcalls.c | 65 ++++++++++++++++++- .../selftests/bpf/progs/tailcall_freplace.c | 23 +++++++ .../testing/selftests/bpf/progs/tc_bpf2bpf.c | 22 +++++++ 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_freplace.c create mode 100644 tools/testing/selftests/bpf/progs/tc_bpf2bpf.c diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index e01fabb8cc41..21c5a37846ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -5,7 +5,8 @@ #include "tailcall_poke.skel.h" #include "tailcall_bpf2bpf_hierarchy2.skel.h" #include "tailcall_bpf2bpf_hierarchy3.skel.h" - +#include "tailcall_freplace.skel.h" +#include "tc_bpf2bpf.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations * in a single program for a single tail call slot with nop->jmp, jmp->nop @@ -1495,6 +1496,66 @@ static void test_tailcall_bpf2bpf_hierarchy_3(void) RUN_TESTS(tailcall_bpf2bpf_hierarchy3); } +/* test_tailcall_freplace checks that the attached freplace prog is OK to + * update the prog_array map. + */ +static void test_tailcall_freplace(void) +{ + struct tailcall_freplace *freplace_skel = NULL; + struct bpf_link *freplace_link = NULL; + struct bpf_program *freplace_prog; + struct tc_bpf2bpf *tc_skel = NULL; + int prog_fd, map_fd; + char buff[128] = {}; + int err, key; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = buff, + .data_size_in = sizeof(buff), + .repeat = 1, + ); + + freplace_skel = tailcall_freplace__open(); + if (!ASSERT_OK_PTR(freplace_skel, "tailcall_freplace__open")) + return; + + tc_skel = tc_bpf2bpf__open_and_load(); + if (!ASSERT_OK_PTR(tc_skel, "tc_bpf2bpf__open_and_load")) + goto out; + + prog_fd = bpf_program__fd(tc_skel->progs.entry_tc); + freplace_prog = freplace_skel->progs.entry_freplace; + err = bpf_program__set_attach_target(freplace_prog, prog_fd, "subprog"); + if (!ASSERT_OK(err, "set_attach_target")) + goto out; + + err = tailcall_freplace__load(freplace_skel); + if (!ASSERT_OK(err, "tailcall_freplace__load")) + goto out; + + freplace_link = bpf_program__attach_freplace(freplace_prog, prog_fd, + "subprog"); + if (!ASSERT_OK_PTR(freplace_link, "attach_freplace")) + goto out; + + map_fd = bpf_map__fd(freplace_skel->maps.jmp_table); + prog_fd = bpf_program__fd(freplace_prog); + key = 0; + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + prog_fd = bpf_program__fd(tc_skel->progs.entry_tc); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 34, "test_run retval"); + +out: + bpf_link__destroy(freplace_link); + tc_bpf2bpf__destroy(tc_skel); + tailcall_freplace__destroy(freplace_skel); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1543,4 +1604,6 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_hierarchy_fentry_entry(); test_tailcall_bpf2bpf_hierarchy_2(); test_tailcall_bpf2bpf_hierarchy_3(); + if (test__start_subtest("tailcall_freplace")) + test_tailcall_freplace(); } diff --git 
a/tools/testing/selftests/bpf/progs/tailcall_freplace.c b/tools/testing/selftests/bpf/progs/tailcall_freplace.c new file mode 100644 index 000000000000..6713b809df44 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_freplace.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int count = 0; + +SEC("freplace") +int entry_freplace(struct __sk_buff *skb) +{ + count++; + bpf_tail_call_static(skb, &jmp_table, 0); + return count; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c new file mode 100644 index 000000000000..8a0632c37839 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +__noinline +int subprog(struct __sk_buff *skb) +{ + int ret = 1; + + __sink(ret); + return ret; +} + +SEC("tc") +int entry_tc(struct __sk_buff *skb) +{ + return subprog(skb); +} + +char __license[] SEC("license") = "GPL"; From ec1f77f6557b46639fa47c6980ef9d38995c1e05 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 01:06:42 -0700 Subject: [PATCH 1013/1052] selftests/bpf: test_loader.c:get_current_arch() should not return 0 At the moment, when test_loader.c:get_current_arch() can't determine the arch, it returns 0. The arch check in run_subtest() looks as follows: if ((get_current_arch() & spec->arch_mask) == 0) { test__skip(); return; } Which means that all test_loader based tests would be skipped if arch could not be determined. get_current_arch() recognizes x86_64, arm64 and riscv64. Which means that CI skips test_loader tests for s390. Fix this by making sure that get_current_arch() always returns non-zero value. In combination with default spec->arch_mask == -1 this should cover all possibilities. 
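To make the interaction with the previous commit's default mask concrete (a restatement of the reasoning above, not new code):

   unannotated test on s390:    ARCH_UNKNOWN & -1           != 0  ->  test runs
   __arch_x86_64 test on s390:  ARCH_UNKNOWN & ARCH_X86_64  == 0  ->  test is skipped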
Fixes: f406026fefa7 ("selftests/bpf: by default use arch mask allowing all archs") Fixes: 7d743e4c759c ("selftests/bpf: __jited test tag to check disassembly after jit") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823080644.263943-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 2ca9b73e5a6b..4223cffc090e 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -336,9 +336,10 @@ static const char *skip_dynamic_pfx(const char *s, const char *pfx) } enum arch { - ARCH_X86_64 = 0x1, - ARCH_ARM64 = 0x2, - ARCH_RISCV64 = 0x4, + ARCH_UNKNOWN = 0x1, + ARCH_X86_64 = 0x2, + ARCH_ARM64 = 0x4, + ARCH_RISCV64 = 0x8, }; static int get_current_arch(void) @@ -350,7 +351,7 @@ static int get_current_arch(void) #elif defined(__riscv) && __riscv_xlen == 64 return ARCH_RISCV64; #endif - return 0; + return ARCH_UNKNOWN; } /* Uses btf_decl_tag attributes to describe the expected test From c52a1e6eb74ffe4f217e9b53ed75440a45b08c4c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 01:06:43 -0700 Subject: [PATCH 1014/1052] selftests/bpf: match both retq/rethunk in verifier_tailcall_jit Depending on kernel parameters, x86 jit generates either retq or jump to rethunk for 'exit' instruction. The difference could be seen when kernel is booted with and without mitigations=off parameter. Relax the verifier_tailcall_jit test case to match both variants. Fixes: e5bdd6a8be78 ("selftests/bpf: validate jit behaviour for tail calls") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823080644.263943-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c index 06d327cf1e1f..8d60c634a114 100644 --- a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c +++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c @@ -59,7 +59,7 @@ __jited(" movq -0x10(%rbp), %rax") __jited(" callq 0x{{.*}}") /* call to sub() */ __jited(" xorl %eax, %eax") __jited(" leave") -__jited(" retq") +__jited(" {{(retq|jmp 0x)}}") /* return or jump to rethunk */ __jited("...") /* subprogram entry for sub(), regular function prologue */ __jited(" endbr64") @@ -89,7 +89,7 @@ __jited(" popq %rax") __jited(" popq %rax") __jited(" jmp {{.*}}") /* jump to tail call tgt */ __jited("L0: leave") -__jited(" retq") +__jited(" {{(retq|jmp 0x)}}") /* return or jump to rethunk */ SEC("tc") __naked int main(void) { From 21a56fc503faae7589167fcd2771ab6dce36ab39 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 01:06:44 -0700 Subject: [PATCH 1015/1052] selftests/bpf: #define LOCAL_LABEL_LEN for jit_disasm_helpers.c Extract local label length as a #define directive and elaborate why 'i % MAX_LOCAL_LABELS' expression is needed for local labels array initialization. 
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823080644.263943-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/jit_disasm_helpers.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c index 1b0f1fd267c0..febd6b12e372 100644 --- a/tools/testing/selftests/bpf/jit_disasm_helpers.c +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -16,6 +16,11 @@ */ #define MAX_LOCAL_LABELS 32 +/* Local labels are encoded as 'L42', this requires 4 bytes of storage: + * 3 characters + zero byte + */ +#define LOCAL_LABEL_LEN 4 + static bool llvm_initialized; struct local_labels { @@ -23,7 +28,7 @@ struct local_labels { __u32 prog_len; __u32 cnt; __u32 pcs[MAX_LOCAL_LABELS]; - char names[MAX_LOCAL_LABELS][4]; + char names[MAX_LOCAL_LABELS][LOCAL_LABEL_LEN]; }; static const char *lookup_symbol(void *data, uint64_t ref_value, uint64_t *ref_type, @@ -118,8 +123,14 @@ static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) } qsort(labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); for (i = 0; i < labels.cnt; ++i) - /* use (i % 100) to avoid format truncation warning */ - snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % 100); + /* gcc is unable to infer upper bound for labels.cnt and assumes + * it to be U32_MAX. U32_MAX takes 10 decimal digits. + * snprintf below prints into labels.names[*], + * which has space only for two digits and a letter. + * To avoid truncation warning use (i % MAX_LOCAL_LABELS), + * which informs gcc about printed value upper bound. + */ + snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % MAX_LOCAL_LABELS); /* now print with labels */ labels.print_phase = true; From 4e9e07603ecdfd0816838132f354239771c51edd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Aug 2024 16:03:19 -0700 Subject: [PATCH 1016/1052] selftests/bpf: make use of PROCMAP_QUERY ioctl if available Instead of parsing text-based /proc//maps file, try to use PROCMAP_QUERY ioctl() to simplify and speed up data fetching. This logic is used to do uprobe file offset calculation, so any bugs in this logic would manifest as failing uprobe BPF selftests. This also serves as a simple demonstration of one of the intended uses. 
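For readers who have not seen the new interface, a minimal stand-alone sketch of the query (assuming a kernel whose <linux/fs.h> exports PROCMAP_QUERY and struct procmap_query, as required by the change below; query_vma is a hypothetical helper name) looks like this:

   #include <fcntl.h>
   #include <stdio.h>
   #include <unistd.h>
   #include <sys/ioctl.h>
   #include <linux/fs.h>

   /* ask the kernel which VMA covers 'addr' in the current process */
   static int query_vma(const void *addr)
   {
           struct procmap_query q = { .size = sizeof(q) };
           int fd, err;

           q.query_addr = (__u64)(unsigned long)addr;
           fd = open("/proc/self/maps", O_RDONLY);
           if (fd < 0)
                   return -1;
           err = ioctl(fd, PROCMAP_QUERY, &q);
           if (!err)
                   printf("vma %llx-%llx offset %llx\n",
                          q.vma_start, q.vma_end, q.vma_offset);
           close(fd);
           return err;
   }

Leaving query_flags at zero matches any VMA containing the address, mirroring the get_rel_offset() usage below.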
Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240806230319.869734-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_progs.c | 3 + tools/testing/selftests/bpf/test_progs.h | 2 + tools/testing/selftests/bpf/trace_helpers.c | 104 +++++++++++++++++--- 3 files changed, 94 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index f45b06791444..83f390a31681 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -35,6 +35,8 @@ __weak void backtrace_symbols_fd(void *const *buffer, int size, int fd) dprintf(fd, "\n"); } +int env_verbosity = 0; + static bool verbose(void) { return env.verbosity > VERBOSE_NONE; @@ -973,6 +975,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return -EINVAL; } } + env_verbosity = env->verbosity; if (verbose()) { if (setenv("SELFTESTS_VERBOSE", "1", 1) == -1) { diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 3ad131de14c6..7767d9a825ae 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -96,6 +96,8 @@ struct test_state { FILE *stdout_saved; }; +extern int env_verbosity; + struct test_env { struct test_selector test_selector; struct test_selector subtest_selector; diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 465d196c7165..1bfd881c0e07 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include "trace_helpers.h" #include @@ -244,29 +246,91 @@ int kallsyms_find(const char *sym, unsigned long long *addr) return err; } +#ifdef PROCMAP_QUERY +int env_verbosity __weak = 0; + +int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) +{ + char path_buf[PATH_MAX], build_id_buf[20]; + struct procmap_query q; + int err; + + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_flags = query_flags; + q.query_addr = (__u64)addr; + q.vma_name_addr = (__u64)path_buf; + q.vma_name_size = sizeof(path_buf); + q.build_id_addr = (__u64)build_id_buf; + q.build_id_size = sizeof(build_id_buf); + + err = ioctl(fd, PROCMAP_QUERY, &q); + if (err < 0) { + err = -errno; + if (err == -ENOTTY) + return -EOPNOTSUPP; /* ioctl() not implemented yet */ + if (err == -ENOENT) + return -ESRCH; /* vma not found */ + return err; + } + + if (env_verbosity >= 1) { + printf("VMA FOUND (addr %08lx): %08lx-%08lx %c%c%c%c %08lx %02x:%02x %ld %s (build ID: %s, %d bytes)\n", + (long)addr, (long)q.vma_start, (long)q.vma_end, + (q.vma_flags & PROCMAP_QUERY_VMA_READABLE) ? 'r' : '-', + (q.vma_flags & PROCMAP_QUERY_VMA_WRITABLE) ? 'w' : '-', + (q.vma_flags & PROCMAP_QUERY_VMA_EXECUTABLE) ? 'x' : '-', + (q.vma_flags & PROCMAP_QUERY_VMA_SHARED) ? 's' : 'p', + (long)q.vma_offset, q.dev_major, q.dev_minor, (long)q.inode, + q.vma_name_size ? path_buf : "", + q.build_id_size ? 
"YES" : "NO", + q.build_id_size); + } + + *start = q.vma_start; + *offset = q.vma_offset; + *flags = q.vma_flags; + return 0; +} +#else +int procmap_query(int fd, const void *addr, size_t *start, size_t *offset, int *flags) +{ + return -EOPNOTSUPP; +} +#endif + ssize_t get_uprobe_offset(const void *addr) { - size_t start, end, base; - char buf[256]; - bool found = false; + size_t start, base, end; FILE *f; + char buf[256]; + int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; - while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { - if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { - found = true; - break; + /* requested executable VMA only */ + err = procmap_query(fileno(f), addr, PROCMAP_QUERY_VMA_EXECUTABLE, &start, &base, &flags); + if (err == -EOPNOTSUPP) { + bool found = false; + + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { + if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { + found = true; + break; + } } + if (!found) { + fclose(f); + return -ESRCH; + } + } else if (err) { + fclose(f); + return err; } - fclose(f); - if (!found) - return -ESRCH; - #if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 #define OP_RT_RA_MASK 0xffff0000UL @@ -307,15 +371,25 @@ ssize_t get_rel_offset(uintptr_t addr) size_t start, end, offset; char buf[256]; FILE *f; + int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; - while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { - if (addr >= start && addr < end) { - fclose(f); - return (size_t)addr - start + offset; + err = procmap_query(fileno(f), (const void *)addr, 0, &start, &offset, &flags); + if (err == 0) { + fclose(f); + return (size_t)addr - start + offset; + } else if (err != -EOPNOTSUPP) { + fclose(f); + return err; + } else if (err) { + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { + if (addr >= start && addr < end) { + fclose(f); + return (size_t)addr - start + offset; + } } } From f727b13dbea16c5e117e263aa8aea59d632d5660 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 5 Aug 2024 21:29:35 -0700 Subject: [PATCH 1017/1052] selftests/bpf: add multi-uprobe benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add multi-uprobe and multi-uretprobe benchmarks to bench tool. Multi- and classic uprobes/uretprobes have different low-level triggering code paths, so it's sometimes important to be able to benchmark both flavors of uprobes/uretprobes. Sample examples from my dev machine below. Single-threaded peformance almost doesn't differ, but with more parallel CPUs triggering the same uprobe/uretprobe the difference grows. This might be due to [0], but given the code is slightly different, there could be other sources of slowdown. Note, all these numbers will change due to ongoing work to improve uprobe/uretprobe scalability (e.g., [1]), but having benchmark like this is useful for measurements and debugging nevertheless. 
\#!/bin/bash set -eufo pipefail for p in 1 8 16 32; do for i in uprobe-nop uretprobe-nop uprobe-multi-nop uretprobe-multi-nop; do summary=$(sudo ./bench -w1 -d3 -p$p -a trig-$i | tail -n1) total=$(echo "$summary" | cut -d'(' -f1 | cut -d' ' -f3-) percpu=$(echo "$summary" | cut -d'(' -f2 | cut -d')' -f1 | cut -d'/' -f1) printf "%-21s (%2d cpus): %s (%s/s/cpu)\n" $i $p "$total" "$percpu" done echo done uprobe-nop ( 1 cpus): 1.020 ± 0.005M/s ( 1.020M/s/cpu) uretprobe-nop ( 1 cpus): 0.515 ± 0.009M/s ( 0.515M/s/cpu) uprobe-multi-nop ( 1 cpus): 1.036 ± 0.004M/s ( 1.036M/s/cpu) uretprobe-multi-nop ( 1 cpus): 0.512 ± 0.005M/s ( 0.512M/s/cpu) uprobe-nop ( 8 cpus): 3.481 ± 0.030M/s ( 0.435M/s/cpu) uretprobe-nop ( 8 cpus): 2.222 ± 0.008M/s ( 0.278M/s/cpu) uprobe-multi-nop ( 8 cpus): 3.769 ± 0.094M/s ( 0.471M/s/cpu) uretprobe-multi-nop ( 8 cpus): 2.482 ± 0.007M/s ( 0.310M/s/cpu) uprobe-nop (16 cpus): 2.968 ± 0.011M/s ( 0.185M/s/cpu) uretprobe-nop (16 cpus): 1.870 ± 0.002M/s ( 0.117M/s/cpu) uprobe-multi-nop (16 cpus): 3.541 ± 0.037M/s ( 0.221M/s/cpu) uretprobe-multi-nop (16 cpus): 2.123 ± 0.026M/s ( 0.133M/s/cpu) uprobe-nop (32 cpus): 2.524 ± 0.026M/s ( 0.079M/s/cpu) uretprobe-nop (32 cpus): 1.572 ± 0.003M/s ( 0.049M/s/cpu) uprobe-multi-nop (32 cpus): 2.717 ± 0.003M/s ( 0.085M/s/cpu) uretprobe-multi-nop (32 cpus): 1.687 ± 0.007M/s ( 0.053M/s/cpu) [0] https://lore.kernel.org/linux-trace-kernel/20240805202803.1813090-1-andrii@kernel.org/ [1] https://lore.kernel.org/linux-trace-kernel/20240731214256.3588718-1-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240806042935.3867862-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 12 +++ .../selftests/bpf/benchs/bench_trigger.c | 81 +++++++++++++++---- .../selftests/bpf/progs/trigger_bench.c | 7 ++ 3 files changed, 85 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 90dc3aca32bd..1bd403a5ef7b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -520,6 +520,12 @@ extern const struct bench bench_trig_uprobe_push; extern const struct bench bench_trig_uretprobe_push; extern const struct bench bench_trig_uprobe_ret; extern const struct bench bench_trig_uretprobe_ret; +extern const struct bench bench_trig_uprobe_multi_nop; +extern const struct bench bench_trig_uretprobe_multi_nop; +extern const struct bench bench_trig_uprobe_multi_push; +extern const struct bench bench_trig_uretprobe_multi_push; +extern const struct bench bench_trig_uprobe_multi_ret; +extern const struct bench bench_trig_uretprobe_multi_ret; extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; @@ -574,6 +580,12 @@ static const struct bench *benchs[] = { &bench_trig_uretprobe_push, &bench_trig_uprobe_ret, &bench_trig_uretprobe_ret, + &bench_trig_uprobe_multi_nop, + &bench_trig_uretprobe_multi_nop, + &bench_trig_uprobe_multi_push, + &bench_trig_uretprobe_multi_push, + &bench_trig_uprobe_multi_ret, + &bench_trig_uretprobe_multi_ret, /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 4b05539f167d..a220545a3238 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -332,7 +332,7 @@ static void *uprobe_producer_ret(void *input) return NULL; } -static 
void usetup(bool use_retprobe, void *target_addr) +static void usetup(bool use_retprobe, bool use_multi, void *target_addr) { size_t uprobe_offset; struct bpf_link *link; @@ -346,7 +346,10 @@ static void usetup(bool use_retprobe, void *target_addr) exit(1); } - bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); + if (use_multi) + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe_multi, true); + else + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); err = trigger_bench__load(ctx.skel); if (err) { @@ -355,16 +358,28 @@ static void usetup(bool use_retprobe, void *target_addr) } uprobe_offset = get_uprobe_offset(target_addr); - link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, - use_retprobe, - -1 /* all PIDs */, - "/proc/self/exe", - uprobe_offset); + if (use_multi) { + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .retprobe = use_retprobe, + .cnt = 1, + .offsets = &uprobe_offset, + ); + link = bpf_program__attach_uprobe_multi( + ctx.skel->progs.bench_trigger_uprobe_multi, + -1 /* all PIDs */, "/proc/self/exe", NULL, &opts); + ctx.skel->links.bench_trigger_uprobe_multi = link; + } else { + link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, + use_retprobe, + -1 /* all PIDs */, + "/proc/self/exe", + uprobe_offset); + ctx.skel->links.bench_trigger_uprobe = link; + } if (!link) { - fprintf(stderr, "failed to attach uprobe!\n"); + fprintf(stderr, "failed to attach %s!\n", use_multi ? "multi-uprobe" : "uprobe"); exit(1); } - ctx.skel->links.bench_trigger_uprobe = link; } static void usermode_count_setup(void) @@ -374,32 +389,62 @@ static void usermode_count_setup(void) static void uprobe_nop_setup(void) { - usetup(false, &uprobe_target_nop); + usetup(false, false /* !use_multi */, &uprobe_target_nop); } static void uretprobe_nop_setup(void) { - usetup(true, &uprobe_target_nop); + usetup(true, false /* !use_multi */, &uprobe_target_nop); } static void uprobe_push_setup(void) { - usetup(false, &uprobe_target_push); + usetup(false, false /* !use_multi */, &uprobe_target_push); } static void uretprobe_push_setup(void) { - usetup(true, &uprobe_target_push); + usetup(true, false /* !use_multi */, &uprobe_target_push); } static void uprobe_ret_setup(void) { - usetup(false, &uprobe_target_ret); + usetup(false, false /* !use_multi */, &uprobe_target_ret); } static void uretprobe_ret_setup(void) { - usetup(true, &uprobe_target_ret); + usetup(true, false /* !use_multi */, &uprobe_target_ret); +} + +static void uprobe_multi_nop_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_nop); +} + +static void uretprobe_multi_nop_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_nop); +} + +static void uprobe_multi_push_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_push); +} + +static void uretprobe_multi_push_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_push); +} + +static void uprobe_multi_ret_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_ret); +} + +static void uretprobe_multi_ret_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_ret); } const struct bench bench_trig_syscall_count = { @@ -454,3 +499,9 @@ BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret"); BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret"); +BENCH_TRIG_USERMODE(uprobe_multi_nop, nop, "uprobe-multi-nop"); 
+BENCH_TRIG_USERMODE(uprobe_multi_push, push, "uprobe-multi-push"); +BENCH_TRIG_USERMODE(uprobe_multi_ret, ret, "uprobe-multi-ret"); +BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop"); +BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push"); +BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret"); diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 2619ed193c65..044a6d78923e 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -32,6 +32,13 @@ int bench_trigger_uprobe(void *ctx) return 0; } +SEC("?uprobe.multi") +int bench_trigger_uprobe_multi(void *ctx) +{ + inc_counter(); + return 0; +} + const volatile int batch_iters = 0; SEC("?raw_tp") From c5ef53420f46c9ca6badca4f4cabacd76de8091e Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Tue, 13 Aug 2024 21:24:20 +0000 Subject: [PATCH 1018/1052] bpf: Let callers of btf_parse_kptr() track life cycle of prog btf btf_parse_kptr() and btf_record_free() do btf_get() and btf_put() respectively when working on btf_record in program and map if there are kptr fields. If the kptr is from program BTF, since both callers has already tracked the life cycle of program BTF, it is safe to remove the btf_get() and btf_put(). This change prevents memory leak of program BTF later when we start searching for kptr fields when building btf_record for program. It can happen when the btf fd is closed. The btf_put() corresponding to the btf_get() in btf_parse_kptr() was supposed to be called by btf_record_free() in btf_free_struct_meta_tab() in btf_free(). However, it will never happen since the invocation of btf_free() depends on the refcount of the btf to become 0 in the first place. 
Acked-by: Martin KaFai Lau Acked-by: Hou Tao Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20240813212424.2871455-2-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 2 +- kernel/bpf/syscall.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5de424d3a795..a4c6538b0ee7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3754,6 +3754,7 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, return -EINVAL; } +/* Callers have to ensure the life cycle of btf if it is program BTF */ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { @@ -3782,7 +3783,6 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, field->kptr.dtor = NULL; id = info->kptr.type_id; kptr_btf = (struct btf *)btf; - btf_get(kptr_btf); goto found_dtor; } if (id < 0) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bf6c5f685ea2..fc62f5c4faf9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -550,7 +550,8 @@ void btf_record_free(struct btf_record *rec) case BPF_KPTR_PERCPU: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); - btf_put(rec->fields[i].kptr.btf); + if (btf_is_kernel(rec->fields[i].kptr.btf)) + btf_put(rec->fields[i].kptr.btf); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: @@ -596,7 +597,8 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: - btf_get(fields[i].kptr.btf); + if (btf_is_kernel(fields[i].kptr.btf)) + btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; goto free; From 7a851ecb180686f9e50110247b0fcd537e772e63 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 13 Aug 2024 21:24:21 +0000 Subject: [PATCH 1019/1052] bpf: Search for kptrs in prog BTF structs Currently btf_parse_fields is used in two places to create struct btf_record's for structs: when looking at mapval type, and when looking at any struct in program BTF. The former looks for kptr fields while the latter does not. This patch modifies the btf_parse_fields call made when looking at prog BTF struct types to search for kptrs as well. Before this series there was no reason to search for kptrs in non-mapval types: a referenced kptr needs some owner to guarantee resource cleanup, and map values were the only owner that supported this. If a struct with a kptr field were to have some non-kptr-aware owner, the kptr field might not be properly cleaned up and result in resources leaking. Only searching for kptr fields in mapval was a simple way to avoid this problem. 
In practice, though, searching for BPF_KPTR when populating struct_meta_tab does not expose us to this risk, as struct_meta_tab is only accessed through btf_find_struct_meta helper, and that helper is only called in contexts where recognizing the kptr field is safe: * PTR_TO_BTF_ID reg w/ MEM_ALLOC flag * Such a reg is a local kptr and must be free'd via bpf_obj_drop, which will correctly handle kptr field * When handling specific kfuncs which either expect MEM_ALLOC input or return MEM_ALLOC output (obj_{new,drop}, percpu_obj_{new,drop}, list+rbtree funcs, refcount_acquire) * Will correctly handle kptr field for same reasons as above * When looking at kptr pointee type * Called by functions which implement "correct kptr resource handling" * In btf_check_and_fixup_fields * Helper that ensures no ownership loops for lists and rbtrees, doesn't care about kptr field existence So we should be able to find BPF_KPTR fields in all prog BTF structs without leaking resources. Further patches in the series will build on this change to support kptr_xchg into non-mapval local kptr. Without this change there would be no kptr field found in such a type. Acked-by: Martin KaFai Lau Acked-by: Hou Tao Signed-off-by: Dave Marchevsky Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20240813212424.2871455-3-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 72 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a4c6538b0ee7..edad152cee8e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5512,36 +5512,70 @@ static const char *alloc_obj_fields[] = { static struct btf_struct_metas * btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) { - union { - struct btf_id_set set; - struct { - u32 _cnt; - u32 _ids[ARRAY_SIZE(alloc_obj_fields)]; - } _arr; - } aof; struct btf_struct_metas *tab = NULL; + struct btf_id_set *aof; int i, n, id, ret; BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0); BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32)); - memset(&aof, 0, sizeof(aof)); + aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN); + if (!aof) + return ERR_PTR(-ENOMEM); + aof->cnt = 0; + for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) { /* Try to find whether this special type exists in user BTF, and * if so remember its ID so we can easily find it among members * of structs that we iterate in the next loop. 
*/ + struct btf_id_set *new_aof; + id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT); if (id < 0) continue; - aof.set.ids[aof.set.cnt++] = id; + + new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + GFP_KERNEL | __GFP_NOWARN); + if (!new_aof) { + ret = -ENOMEM; + goto free_aof; + } + aof = new_aof; + aof->ids[aof->cnt++] = id; } - if (!aof.set.cnt) - return NULL; - sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL); - n = btf_nr_types(btf); + for (i = 1; i < n; i++) { + /* Try to find if there are kptrs in user BTF and remember their ID */ + struct btf_id_set *new_aof; + struct btf_field_info tmp; + const struct btf_type *t; + + t = btf_type_by_id(btf, i); + if (!t) { + ret = -EINVAL; + goto free_aof; + } + + ret = btf_find_kptr(btf, t, 0, 0, &tmp); + if (ret != BTF_FIELD_FOUND) + continue; + + new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + GFP_KERNEL | __GFP_NOWARN); + if (!new_aof) { + ret = -ENOMEM; + goto free_aof; + } + aof = new_aof; + aof->ids[aof->cnt++] = i; + } + + if (!aof->cnt) + return NULL; + sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL); + for (i = 1; i < n; i++) { struct btf_struct_metas *new_tab; const struct btf_member *member; @@ -5551,17 +5585,13 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) int j, tab_cnt; t = btf_type_by_id(btf, i); - if (!t) { - ret = -EINVAL; - goto free; - } if (!__btf_type_is_struct(t)) continue; cond_resched(); for_each_member(j, t, member) { - if (btf_id_set_contains(&aof.set, member->type)) + if (btf_id_set_contains(aof, member->type)) goto parse; } continue; @@ -5580,7 +5610,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type = &tab->types[tab->cnt]; type->btf_id = i; record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | - BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size); + BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | + BPF_KPTR, t->size); /* The record cannot be unset, treat it as an error if so */ if (IS_ERR_OR_NULL(record)) { ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; @@ -5589,9 +5620,12 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type->record = record; tab->cnt++; } + kfree(aof); return tab; free: btf_struct_metas_free(tab); +free_aof: + kfree(aof); return ERR_PTR(ret); } From d59232afb0344e33e9399f308d9b4a03876e7676 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 13 Aug 2024 21:24:22 +0000 Subject: [PATCH 1020/1052] bpf: Rename ARG_PTR_TO_KPTR -> ARG_KPTR_XCHG_DEST ARG_PTR_TO_KPTR is currently only used by the bpf_kptr_xchg helper. Although it limits reg types for that helper's first arg to PTR_TO_MAP_VALUE, any arbitrary mapval won't do: further custom verification logic ensures that the mapval reg being xchgd-into is pointing to a kptr field. If this is not the case, it's not safe to xchg into that reg's pointee. Let's rename the bpf_arg_type to more accurately describe the fairly specific expectations that this arg type encodes. This is a nonfunctional change. 
Acked-by: Martin KaFai Lau Signed-off-by: Dave Marchevsky Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20240813212424.2871455-4-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 00dc4dd28cbd..dc63083f76b7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -744,7 +744,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ - ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ + ARG_KPTR_XCHG_DEST, /* pointer to destination that kptrs are bpf_kptr_xchg'd into */ ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4105b4af6e03..bf55b81b3e06 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1636,7 +1636,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = BPF_PTR_POISON, - .arg1_type = ARG_PTR_TO_KPTR, + .arg1_type = ARG_KPTR_XCHG_DEST, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, .arg2_btf_id = BPF_PTR_POISON, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 148536446457..5fe2a8978bb8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8412,7 +8412,7 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types kptr_xchg_dest_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, @@ -8444,7 +8444,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, - [ARG_PTR_TO_KPTR] = &kptr_types, + [ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types, [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; @@ -9044,7 +9044,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; break; } - case ARG_PTR_TO_KPTR: + case ARG_KPTR_XCHG_DEST: err = process_kptr_func(env, regno, meta); if (err) return err; From b0966c724584a5a9fd7fb529de19807c31f27a45 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 13 Aug 2024 21:24:23 +0000 Subject: [PATCH 1021/1052] bpf: Support bpf_kptr_xchg into local kptr Currently, users can only stash kptr into map values with bpf_kptr_xchg(). This patch further supports stashing kptr into local kptr by adding local kptr as a valid destination type. When stashing into local kptr, btf_record in program BTF is used instead of btf_record in map to search for the btf_field of the local kptr. The local kptr specific checks in check_reg_type() only apply when the source argument of bpf_kptr_xchg() is local kptr. Therefore, we make the scope of the check explicit as the destination now can also be local kptr. 
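A rough usage sketch (not taken from the patch; it mirrors the selftest added at the end of this series, uses placeholder struct and function names, and assumes the bpf_obj_new()/bpf_obj_drop() helpers from bpf_experimental.h):

   struct inner_obj { long v; };
   struct outer_obj { struct inner_obj __kptr *stashed; };

   SEC("tc")
   long stash_into_local_kptr(void *ctx)
   {
           struct outer_obj *o;
           struct inner_obj *in;

           o = bpf_obj_new(typeof(*o));
           if (!o)
                   return 0;
           in = bpf_obj_new(typeof(*in));
           if (in) {
                   /* destination is a local kptr, not a map value */
                   in = bpf_kptr_xchg(&o->stashed, in);
                   if (in)              /* old value; NULL for a fresh object */
                           bpf_obj_drop(in);
           }
           bpf_obj_drop(o);             /* releases anything still stashed */
           return 0;
   }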
Acked-by: Martin KaFai Lau Signed-off-by: Dave Marchevsky Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20240813212424.2871455-5-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 ++++---- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++------------- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 35bcf52dbc65..e2629457d72d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5519,11 +5519,12 @@ union bpf_attr { * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * - * void *bpf_kptr_xchg(void *map_value, void *ptr) + * void *bpf_kptr_xchg(void *dst, void *ptr) * Description - * Exchange kptr at pointer *map_value* with *ptr*, and return the - * old value. *ptr* can be NULL, otherwise it must be a referenced - * pointer which will be released when this helper is called. + * Exchange kptr at pointer *dst* with *ptr*, and return the old value. + * *dst* can be map value or local kptr. *ptr* can be NULL, otherwise + * it must be a referenced pointer which will be released when this helper + * is called. * Return * The old value of kptr (which can be NULL). The returned pointer * if not NULL, is a reference which must be released using its diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bf55b81b3e06..f12f075b70c5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1619,9 +1619,9 @@ void bpf_wq_cancel_and_free(void *val) schedule_work(&work->delete_work); } -BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) +BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) { - unsigned long *kptr = map_value; + unsigned long *kptr = dst; /* This helper may be inlined by verifier. */ return xchg(kptr, (unsigned long)ptr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5fe2a8978bb8..33270b363689 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7803,29 +7803,38 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map *map_ptr = reg->map_ptr; struct btf_field *kptr_field; + struct bpf_map *map_ptr; + struct btf_record *rec; u32 kptr_off; + if (type_is_ptr_alloc_obj(reg->type)) { + rec = reg_btf_record(reg); + } else { /* PTR_TO_MAP_VALUE */ + map_ptr = reg->map_ptr; + if (!map_ptr->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", + map_ptr->name); + return -EINVAL; + } + rec = map_ptr->record; + meta->map_ptr = map_ptr; + } + if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d doesn't have constant offset. 
kptr has to be at the constant offset\n", regno); return -EINVAL; } - if (!map_ptr->btf) { - verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", - map_ptr->name); - return -EINVAL; - } - if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) { - verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + + if (!btf_record_has_field(rec, BPF_KPTR)) { + verbose(env, "R%d has no valid kptr\n", regno); return -EINVAL; } - meta->map_ptr = map_ptr; kptr_off = reg->off + reg->var_off.value; - kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR); + kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR); if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; @@ -8412,7 +8421,12 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types kptr_xchg_dest_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types kptr_xchg_dest_types = { + .types = { + PTR_TO_MAP_VALUE, + PTR_TO_BTF_ID | MEM_ALLOC + } +}; static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, @@ -8483,7 +8497,8 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (base_type(arg_type) == ARG_PTR_TO_MEM) type &= ~DYNPTR_TYPE_FLAG_MASK; - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) { + /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */ + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) { type &= ~MEM_ALLOC; type &= ~MEM_PERCPU; } @@ -8576,7 +8591,8 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); return -EFAULT; } - if (meta->func_id == BPF_FUNC_kptr_xchg) { + /* Check if local kptr in src arg matches kptr in dst arg */ + if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) { if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } @@ -8887,7 +8903,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, meta->release_regno = regno; } - if (reg->ref_obj_id) { + if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", regno, reg->ref_obj_id, From 91c96842ab1e9159c8129ab5ddfeb7dd97bf840e Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 13 Aug 2024 21:24:24 +0000 Subject: [PATCH 1022/1052] selftests/bpf: Test bpf_kptr_xchg stashing into local kptr Test stashing both referenced kptr and local kptr into local kptrs. Then, test unstashing them. 
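For the referenced-kptr case, the distilled shape of what the test below exercises looks roughly like this (names are illustrative; the ownership rule is that whatever bpf_kptr_xchg() returns belongs to the program and must be released or dropped):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_experimental.h"	/* selftests' header for bpf_obj_new()/bpf_obj_drop() */

char _license[] SEC("license") = "GPL";

struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
void bpf_task_release(struct task_struct *p) __ksym;

struct task_stash {
	struct task_struct __kptr *task;
};

SEC("tp_btf/task_newtask")
int BPF_PROG(stash_task_in_local_kptr, struct task_struct *task, u64 clone_flags)
{
	struct task_stash *stash;
	struct task_struct *t;

	stash = bpf_obj_new(typeof(*stash));
	if (!stash)
		return 0;

	t = bpf_task_acquire(task);
	if (!t) {
		bpf_obj_drop(stash);
		return 0;
	}

	t = bpf_kptr_xchg(&stash->task, t);	/* stash: dst is a local kptr */
	if (t)
		bpf_task_release(t);

	t = bpf_kptr_xchg(&stash->task, NULL);	/* unstash: we own the result */
	if (t)
		bpf_task_release(t);

	bpf_obj_drop(stash);
	return 0;
}
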
Acked-by: Martin KaFai Lau Acked-by: Hou Tao Signed-off-by: Dave Marchevsky Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20240813212424.2871455-6-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/local_kptr_stash.c | 30 +++++++++++++++++-- .../selftests/bpf/progs/task_kfunc_success.c | 26 +++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/local_kptr_stash.c b/tools/testing/selftests/bpf/progs/local_kptr_stash.c index 75043ffc5dad..b092a72b2c9d 100644 --- a/tools/testing/selftests/bpf/progs/local_kptr_stash.c +++ b/tools/testing/selftests/bpf/progs/local_kptr_stash.c @@ -8,9 +8,12 @@ #include "../bpf_experimental.h" #include "../bpf_testmod/bpf_testmod_kfunc.h" +struct plain_local; + struct node_data { long key; long data; + struct plain_local __kptr * stashed_in_local_kptr; struct bpf_rb_node node; }; @@ -85,6 +88,7 @@ static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b) static int create_and_stash(int idx, int val) { + struct plain_local *inner_local_kptr; struct map_value *mapval; struct node_data *res; @@ -92,11 +96,25 @@ static int create_and_stash(int idx, int val) if (!mapval) return 1; + inner_local_kptr = bpf_obj_new(typeof(*inner_local_kptr)); + if (!inner_local_kptr) + return 2; + res = bpf_obj_new(typeof(*res)); - if (!res) - return 1; + if (!res) { + bpf_obj_drop(inner_local_kptr); + return 3; + } res->key = val; + inner_local_kptr = bpf_kptr_xchg(&res->stashed_in_local_kptr, inner_local_kptr); + if (inner_local_kptr) { + /* Should never happen, we just obj_new'd res */ + bpf_obj_drop(inner_local_kptr); + bpf_obj_drop(res); + return 4; + } + res = bpf_kptr_xchg(&mapval->node, res); if (res) bpf_obj_drop(res); @@ -169,6 +187,7 @@ long stash_local_with_root(void *ctx) SEC("tc") long unstash_rb_node(void *ctx) { + struct plain_local *inner_local_kptr = NULL; struct map_value *mapval; struct node_data *res; long retval; @@ -180,6 +199,13 @@ long unstash_rb_node(void *ctx) res = bpf_kptr_xchg(&mapval->node, NULL); if (res) { + inner_local_kptr = bpf_kptr_xchg(&res->stashed_in_local_kptr, inner_local_kptr); + if (!inner_local_kptr) { + bpf_obj_drop(res); + return 1; + } + bpf_obj_drop(inner_local_kptr); + retval = res->key; bpf_obj_drop(res); return retval; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 70df695312dc..3138bb689b0b 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -5,6 +5,7 @@ #include #include +#include "../bpf_experimental.h" #include "task_kfunc_common.h" char _license[] SEC("license") = "GPL"; @@ -143,7 +144,7 @@ SEC("tp_btf/task_newtask") int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) { struct task_struct *kptr; - struct __tasks_kfunc_map_value *v; + struct __tasks_kfunc_map_value *v, *local; long status; if (!is_test_kfunc_task()) @@ -167,6 +168,29 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) return 0; } + local = bpf_obj_new(typeof(*local)); + if (!local) { + err = 4; + bpf_task_release(kptr); + return 0; + } + + kptr = bpf_kptr_xchg(&local->task, kptr); + if (kptr) { + err = 5; + bpf_obj_drop(local); + bpf_task_release(kptr); + return 0; + } + + kptr = bpf_kptr_xchg(&local->task, NULL); + if (!kptr) { + err = 6; + bpf_obj_drop(local); + return 0; + } + + bpf_obj_drop(local); bpf_task_release(kptr); 
return 0; From 5772c3458bb8d17d763e0f411e1bae1bf4eda88d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 12:44:09 -0700 Subject: [PATCH 1023/1052] selftests/bpf: use simply-expanded variables for libpcap flags Save pkg-config output for libpcap as simply-expanded variables. For an obscure reason 'shell' call in LDLIBS/CFLAGS recursively expanded variables makes *.test.o files compilation non-parallel when make is executed with -j option. While at it, reuse 'pkg-config --cflags' call to define -DTRAFFIC_MONITOR=1 option, it's exit status is the same as for 'pkg-config --exists'. Fixes: f52403b6bfea ("selftests/bpf: Add traffic monitor functions.") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823194409.774815-1-eddyz87@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ec7d425c4022..c120617b64ad 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -48,9 +48,10 @@ CFLAGS += -g $(OPT_FLAGS) -rdynamic \ LDFLAGS += $(SAN_LDFLAGS) LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread -LDLIBS += $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) -CFLAGS += $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null) -CFLAGS += $(shell $(PKG_CONFIG) --exists libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") +PCAP_CFLAGS := $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") +PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) +LDLIBS += $(PCAP_LIBS) +CFLAGS += $(PCAP_CFLAGS) # The following tests perform type punning and they may break strict # aliasing rules, which are exploited by both GCC and clang by default From 65ab5ac4df012388481d0414fcac1d5ac1721fb3 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Fri, 23 Aug 2024 12:51:00 -0700 Subject: [PATCH 1024/1052] bpf: Add bpf_copy_from_user_str kfunc This adds a kfunc wrapper around strncpy_from_user, which can be called from sleepable BPF programs. This matches the non-sleepable 'bpf_probe_read_user_str' helper except it includes an additional 'flags' param, which allows consumers to clear the entire destination buffer on success or failure. Signed-off-by: Jordan Rome Link: https://lore.kernel.org/r/20240823195101.3621028-1-linux@jordanrome.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 ++++++++ kernel/bpf/helpers.c | 42 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 ++++++++ 3 files changed, 60 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2629457d72d..c3a5728db115 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7513,4 +7513,13 @@ struct bpf_iter_num { __u64 __opaque[1]; } __attribute__((aligned(8))); +/* + * Flags to control BPF kfunc behaviour. + * - BPF_F_PAD_ZEROS: Pad destination buffer with zeros. (See the respective + * helper documentation for details.) 
+ */ +enum bpf_kfunc_flags { + BPF_F_PAD_ZEROS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f12f075b70c5..7e7761c537f8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2962,6 +2962,47 @@ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) bpf_mem_free(&bpf_global_ma, kit->bits); } +/** + * bpf_copy_from_user_str() - Copy a string from an unsafe user address + * @dst: Destination address, in kernel space. This buffer must be + * at least @dst__sz bytes long. + * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. + * @unsafe_ptr__ign: Source address, in user space. + * @flags: The only supported flag is BPF_F_PAD_ZEROS + * + * Copies a NUL-terminated string from userspace to BPF space. If user string is + * too long this will still ensure zero termination in the dst buffer unless + * buffer size is 0. + * + * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and + * memset all of @dst on failure. + */ +__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags) +{ + int ret; + + if (unlikely(flags & ~BPF_F_PAD_ZEROS)) + return -EINVAL; + + if (unlikely(!dst__sz)) + return 0; + + ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1); + if (ret < 0) { + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst, 0, dst__sz); + + return ret; + } + + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst + ret, 0, dst__sz - ret); + else + ((char *)dst)[ret] = '\0'; + + return ret + 1; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3047,6 +3088,7 @@ BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 35bcf52dbc65..f329ee44627a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7512,4 +7512,13 @@ struct bpf_iter_num { __u64 __opaque[1]; } __attribute__((aligned(8))); +/* + * Flags to control BPF kfunc behaviour. + * - BPF_F_PAD_ZEROS: Pad destination buffer with zeros. (See the respective + * helper documentation for details.) + */ +enum bpf_kfunc_flags { + BPF_F_PAD_ZEROS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ From ddc3d98807dca05ad10f4c76d13019c428302719 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Fri, 23 Aug 2024 12:51:01 -0700 Subject: [PATCH 1025/1052] selftests/bpf: Add tests for bpf_copy_from_user_str kfunc. This adds tests for both the happy path and the error path. 
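A minimal sleepable-program sketch of calling the new kfunc (the attach target, buffer size and user_ptr global are illustrative; the __ksym declaration mirrors the one the selftests below use, and vmlinux.h is assumed to be generated from a kernel carrying this series so BPF_F_PAD_ZEROS is visible):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

void *user_ptr;		/* user space sets this to a string address before attaching */
int copied_len;

int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void *unsafe_ptr__ign,
			   u64 flags) __weak __ksym;

SEC("uprobe.s//proc/self/exe:trigger_func")	/* any sleepable program type works */
int copy_str_example(struct pt_regs *ctx)
{
	char buf[64];

	/* Returns the number of bytes written to buf including the trailing
	 * NUL; with BPF_F_PAD_ZEROS the tail of buf is zeroed on success and
	 * all of buf is zeroed on failure.
	 */
	copied_len = bpf_copy_from_user_str(buf, sizeof(buf), user_ptr, BPF_F_PAD_ZEROS);
	return 0;
}
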
Signed-off-by: Jordan Rome Link: https://lore.kernel.org/r/20240823195101.3621028-2-linux@jordanrome.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/attach_probe.c | 8 ++- .../selftests/bpf/prog_tests/read_vsyscall.c | 1 + .../selftests/bpf/progs/read_vsyscall.c | 9 ++- .../selftests/bpf/progs/test_attach_probe.c | 64 ++++++++++++++++++- 4 files changed, 75 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 7175af39134f..329c7862b52d 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -283,9 +283,11 @@ static void test_uprobe_sleepable(struct test_attach_probe *skel) trigger_func3(); ASSERT_EQ(skel->bss->uprobe_byname3_sleepable_res, 9, "check_uprobe_byname3_sleepable_res"); - ASSERT_EQ(skel->bss->uprobe_byname3_res, 10, "check_uprobe_byname3_res"); - ASSERT_EQ(skel->bss->uretprobe_byname3_sleepable_res, 11, "check_uretprobe_byname3_sleepable_res"); - ASSERT_EQ(skel->bss->uretprobe_byname3_res, 12, "check_uretprobe_byname3_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_str_sleepable_res, 10, "check_uprobe_byname3_str_sleepable_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_res, 11, "check_uprobe_byname3_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_sleepable_res, 12, "check_uretprobe_byname3_sleepable_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_str_sleepable_res, 13, "check_uretprobe_byname3_str_sleepable_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_res, 14, "check_uretprobe_byname3_res"); } void test_attach_probe(void) diff --git a/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c index 3405923fe4e6..c7b9ba8b1d06 100644 --- a/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c +++ b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c @@ -23,6 +23,7 @@ struct read_ret_desc { { .name = "probe_read_user_str", .ret = -EFAULT }, { .name = "copy_from_user", .ret = -EFAULT }, { .name = "copy_from_user_task", .ret = -EFAULT }, + { .name = "copy_from_user_str", .ret = -EFAULT }, }; void test_read_vsyscall(void) diff --git a/tools/testing/selftests/bpf/progs/read_vsyscall.c b/tools/testing/selftests/bpf/progs/read_vsyscall.c index 986f96687ae1..39ebef430059 100644 --- a/tools/testing/selftests/bpf/progs/read_vsyscall.c +++ b/tools/testing/selftests/bpf/progs/read_vsyscall.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2024. 
Huawei Technologies Co., Ltd */ +#include "vmlinux.h" #include #include @@ -7,10 +8,15 @@ int target_pid = 0; void *user_ptr = 0; -int read_ret[8]; +int read_ret[9]; char _license[] SEC("license") = "GPL"; +/* + * This is the only kfunc, the others are helpers + */ +int bpf_copy_from_user_str(void *dst, u32, const void *, u64) __weak __ksym; + SEC("fentry/" SYS_PREFIX "sys_nanosleep") int do_probe_read(void *ctx) { @@ -40,6 +46,7 @@ int do_copy_from_user(void *ctx) read_ret[6] = bpf_copy_from_user(buf, sizeof(buf), user_ptr); read_ret[7] = bpf_copy_from_user_task(buf, sizeof(buf), user_ptr, bpf_get_current_task_btf(), 0); + read_ret[8] = bpf_copy_from_user_str((char *)buf, sizeof(buf), user_ptr, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index 68466a6ad18c..fb79e6cab932 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -5,8 +5,10 @@ #include #include #include +#include #include "bpf_misc.h" +u32 dynamic_sz = 1; int kprobe2_res = 0; int kretprobe2_res = 0; int uprobe_byname_res = 0; @@ -14,11 +16,15 @@ int uretprobe_byname_res = 0; int uprobe_byname2_res = 0; int uretprobe_byname2_res = 0; int uprobe_byname3_sleepable_res = 0; +int uprobe_byname3_str_sleepable_res = 0; int uprobe_byname3_res = 0; int uretprobe_byname3_sleepable_res = 0; +int uretprobe_byname3_str_sleepable_res = 0; int uretprobe_byname3_res = 0; void *user_ptr = 0; +int bpf_copy_from_user_str(void *dst, u32, const void *, u64) __weak __ksym; + SEC("ksyscall/nanosleep") int BPF_KSYSCALL(handle_kprobe_auto, struct __kernel_timespec *req, struct __kernel_timespec *rem) { @@ -87,11 +93,61 @@ static __always_inline bool verify_sleepable_user_copy(void) return bpf_strncmp(data, sizeof(data), "test_data") == 0; } +static __always_inline bool verify_sleepable_user_copy_str(void) +{ + int ret; + char data_long[20]; + char data_long_pad[20]; + char data_long_err[20]; + char data_short[4]; + char data_short_pad[4]; + + ret = bpf_copy_from_user_str(data_short, sizeof(data_short), user_ptr, 0); + + if (bpf_strncmp(data_short, 4, "tes\0") != 0 || ret != 4) + return false; + + ret = bpf_copy_from_user_str(data_short_pad, sizeof(data_short_pad), user_ptr, BPF_F_PAD_ZEROS); + + if (bpf_strncmp(data_short, 4, "tes\0") != 0 || ret != 4) + return false; + + /* Make sure this passes the verifier */ + ret = bpf_copy_from_user_str(data_long, dynamic_sz & sizeof(data_long), user_ptr, 0); + + if (ret != 0) + return false; + + ret = bpf_copy_from_user_str(data_long, sizeof(data_long), user_ptr, 0); + + if (bpf_strncmp(data_long, 10, "test_data\0") != 0 || ret != 10) + return false; + + ret = bpf_copy_from_user_str(data_long_pad, sizeof(data_long_pad), user_ptr, BPF_F_PAD_ZEROS); + + if (bpf_strncmp(data_long_pad, 10, "test_data\0") != 0 || ret != 10 || data_long_pad[19] != '\0') + return false; + + ret = bpf_copy_from_user_str(data_long_err, sizeof(data_long_err), (void *)data_long, BPF_F_PAD_ZEROS); + + if (ret > 0 || data_long_err[19] != '\0') + return false; + + ret = bpf_copy_from_user_str(data_long, sizeof(data_long), user_ptr, 2); + + if (ret != -EINVAL) + return false; + + return true; +} + SEC("uprobe.s//proc/self/exe:trigger_func3") int handle_uprobe_byname3_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) uprobe_byname3_sleepable_res = 9; + if (verify_sleepable_user_copy_str()) + uprobe_byname3_str_sleepable_res = 10; return 0; } @@ -102,7 +158,7 @@ 
int handle_uprobe_byname3_sleepable(struct pt_regs *ctx) SEC("uprobe//proc/self/exe:trigger_func3") int handle_uprobe_byname3(struct pt_regs *ctx) { - uprobe_byname3_res = 10; + uprobe_byname3_res = 11; return 0; } @@ -110,14 +166,16 @@ SEC("uretprobe.s//proc/self/exe:trigger_func3") int handle_uretprobe_byname3_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) - uretprobe_byname3_sleepable_res = 11; + uretprobe_byname3_sleepable_res = 12; + if (verify_sleepable_user_copy_str()) + uretprobe_byname3_str_sleepable_res = 13; return 0; } SEC("uretprobe//proc/self/exe:trigger_func3") int handle_uretprobe_byname3(struct pt_regs *ctx) { - uretprobe_byname3_res = 12; + uretprobe_byname3_res = 14; return 0; } From d205d4af3a5ee840edecfa5f6b389d7d03b0786d Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Tue, 27 Aug 2024 12:30:30 +0800 Subject: [PATCH 1026/1052] samples/bpf: tracex4: Fix failed to create kretprobe 'kmem_cache_alloc_node+0x0' commit 7bd230a26648 ("mm/slab: enable slab allocation tagging for kmalloc and friends") [1] swap kmem_cache_alloc_node() to kmem_cache_alloc_node_noprof(). linux/samples/bpf$ sudo ./tracex4 libbpf: prog 'bpf_prog2': failed to create kretprobe 'kmem_cache_alloc_node+0x0' perf event: No such file or directory ERROR: bpf_program__attach failed Signed-off-by: Rong Tao Signed-off-by: Andrii Nakryiko Link: https://github.com/torvalds/linux/commit/7bd230a26648ac68ab3731ebbc449090f0ac6a37 Link: https://lore.kernel.org/bpf/tencent_34E5BCCAC5ABF3E81222AD81B1D05F16DE06@qq.com --- samples/bpf/tracex4.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/bpf/tracex4.bpf.c b/samples/bpf/tracex4.bpf.c index ca826750901a..d786492fd926 100644 --- a/samples/bpf/tracex4.bpf.c +++ b/samples/bpf/tracex4.bpf.c @@ -33,13 +33,13 @@ int bpf_prog1(struct pt_regs *ctx) return 0; } -SEC("kretprobe/kmem_cache_alloc_node") +SEC("kretprobe/kmem_cache_alloc_node_noprof") int bpf_prog2(struct pt_regs *ctx) { long ptr = PT_REGS_RC(ctx); long ip = 0; - /* get ip address of kmem_cache_alloc_node() caller */ + /* get ip address of kmem_cache_alloc_node_noprof() caller */ BPF_KRETPROBE_READ_RET_IP(ip, ctx); struct pair v = { From bd737fcb64856d582335a7245be60fbb591c0bfd Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Mon, 26 Aug 2024 15:16:23 +0800 Subject: [PATCH 1027/1052] bpf, arm64: Get rid of fpb bpf prog accesses stack using BPF_FP as the base address and a negative immediate number as offset. But arm64 ldr/str instructions only support non-negative immediate number as offset. To simplify the jited result, commit 5b3d19b9bd40 ("bpf, arm64: Adjust the offset of str/ldr(immediate) to positive number") introduced FPB to represent the lowest stack address that the bpf prog being jited may access, and with this address as the baseline, it converts BPF_FP plus negative immediate offset number to FPB plus non-negative immediate offset. Considering that for a given bpf prog, the jited stack space is fixed with A64_SP as the lowest address and BPF_FP as the highest address. Thus we can get rid of FPB and converts BPF_FP plus negative immediate offset to A64_SP plus non-negative immediate offset. 
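As a worked illustration of the conversion (not taken from the patch), assume a program whose only stack access is a store at r10 - 8 and whose stack_depth rounds up to 16 bytes; register names follow the JIT's bpf2a64 mapping (BPF r0 -> x7, old FPB -> x27):

  BPF instruction:                         *(u64 *)(r10 - 8) = r0

  before (base = FPB, fpb_offset = 8):     str x7, [x27, #0]    // off_adj = -8 + fpb_offset
  after  (base = SP,  stack_size = 16):    str x7, [sp, #8]     // off_adj = -8 + stack_size

Either way the same stack slot is addressed, but the second form needs neither a dedicated callee-saved register nor the per-program fpb_offset analysis.
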
Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240826071624.350108-2-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 104 ++++------------------------------ 1 file changed, 11 insertions(+), 93 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 59e05a7aea56..5c9039cf261d 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -28,7 +28,6 @@ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) #define TCCNT_PTR (MAX_BPF_JIT_REG + 2) #define TMP_REG_3 (MAX_BPF_JIT_REG + 3) -#define FP_BOTTOM (MAX_BPF_JIT_REG + 4) #define ARENA_VM_START (MAX_BPF_JIT_REG + 5) #define check_imm(bits, imm) do { \ @@ -67,7 +66,6 @@ static const int bpf2a64[] = { [TCCNT_PTR] = A64_R(26), /* temporary register for blinding constants */ [BPF_REG_AX] = A64_R(9), - [FP_BOTTOM] = A64_R(27), /* callee saved register for kern_vm_start address */ [ARENA_VM_START] = A64_R(28), }; @@ -81,7 +79,6 @@ struct jit_ctx { __le32 *image; __le32 *ro_image; u32 stack_size; - int fpb_offset; u64 user_vm_start; }; @@ -330,7 +327,6 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, const u8 r8 = bpf2a64[BPF_REG_8]; const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; - const u8 fpb = bpf2a64[FP_BOTTOM]; const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const int idx0 = ctx->idx; int cur_offset; @@ -381,7 +377,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, emit(A64_PUSH(r6, r7, A64_SP), ctx); emit(A64_PUSH(r8, r9, A64_SP), ctx); prepare_bpf_tail_call_cnt(ctx); - emit(A64_PUSH(fpb, A64_R(28), A64_SP), ctx); + emit(A64_PUSH(A64_R(27), A64_R(28), A64_SP), ctx); } else { /* * Exception callback receives FP of Main Program as third @@ -427,8 +423,6 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, emit(A64_PUSH(A64_R(23), A64_R(24), A64_SP), ctx); } - emit(A64_SUB_I(1, fpb, fp, ctx->fpb_offset), ctx); - /* Stack must be multiples of 16B */ ctx->stack_size = round_up(prog->aux->stack_depth, 16); @@ -745,7 +739,6 @@ static void build_epilogue(struct jit_ctx *ctx, bool is_exception_cb) const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; const u8 ptr = bpf2a64[TCCNT_PTR]; - const u8 fpb = bpf2a64[FP_BOTTOM]; /* We're done with BPF stack */ emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); @@ -760,7 +753,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool is_exception_cb) emit(A64_POP(A64_R(23), A64_R(24), A64_SP), ctx); /* Restore x27 and x28 */ - emit(A64_POP(fpb, A64_R(28), A64_SP), ctx); + emit(A64_POP(A64_R(27), A64_R(28), A64_SP), ctx); /* Restore fs (x25) and x26 */ emit(A64_POP(ptr, fp, A64_SP), ctx); emit(A64_POP(ptr, fp, A64_SP), ctx); @@ -887,7 +880,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; const u8 fp = bpf2a64[BPF_REG_FP]; - const u8 fpb = bpf2a64[FP_BOTTOM]; const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const s16 off = insn->off; const s32 imm = insn->imm; @@ -1339,9 +1331,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit(A64_ADD(1, tmp2, src, arena_vm_base), ctx); src = tmp2; } - if (ctx->fpb_offset > 0 && src == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { - src_adj = fpb; - off_adj = off + ctx->fpb_offset; + if (src == fp) { + src_adj = A64_SP; + off_adj = off + ctx->stack_size; } else { src_adj = src; off_adj = off; @@ -1432,9 +1424,9 @@ static int 
build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit(A64_ADD(1, tmp2, dst, arena_vm_base), ctx); dst = tmp2; } - if (ctx->fpb_offset > 0 && dst == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { - dst_adj = fpb; - off_adj = off + ctx->fpb_offset; + if (dst == fp) { + dst_adj = A64_SP; + off_adj = off + ctx->stack_size; } else { dst_adj = dst; off_adj = off; @@ -1494,9 +1486,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit(A64_ADD(1, tmp2, dst, arena_vm_base), ctx); dst = tmp2; } - if (ctx->fpb_offset > 0 && dst == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { - dst_adj = fpb; - off_adj = off + ctx->fpb_offset; + if (dst == fp) { + dst_adj = A64_SP; + off_adj = off + ctx->stack_size; } else { dst_adj = dst; off_adj = off; @@ -1565,79 +1557,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, return 0; } -/* - * Return 0 if FP may change at runtime, otherwise find the minimum negative - * offset to FP, converts it to positive number, and align down to 8 bytes. - */ -static int find_fpb_offset(struct bpf_prog *prog) -{ - int i; - int offset = 0; - - for (i = 0; i < prog->len; i++) { - const struct bpf_insn *insn = &prog->insnsi[i]; - const u8 class = BPF_CLASS(insn->code); - const u8 mode = BPF_MODE(insn->code); - const u8 src = insn->src_reg; - const u8 dst = insn->dst_reg; - const s32 imm = insn->imm; - const s16 off = insn->off; - - switch (class) { - case BPF_STX: - case BPF_ST: - /* fp holds atomic operation result */ - if (class == BPF_STX && mode == BPF_ATOMIC && - ((imm == BPF_XCHG || - imm == (BPF_FETCH | BPF_ADD) || - imm == (BPF_FETCH | BPF_AND) || - imm == (BPF_FETCH | BPF_XOR) || - imm == (BPF_FETCH | BPF_OR)) && - src == BPF_REG_FP)) - return 0; - - if (mode == BPF_MEM && dst == BPF_REG_FP && - off < offset) - offset = insn->off; - break; - - case BPF_JMP32: - case BPF_JMP: - break; - - case BPF_LDX: - case BPF_LD: - /* fp holds load result */ - if (dst == BPF_REG_FP) - return 0; - - if (class == BPF_LDX && mode == BPF_MEM && - src == BPF_REG_FP && off < offset) - offset = off; - break; - - case BPF_ALU: - case BPF_ALU64: - default: - /* fp holds ALU result */ - if (dst == BPF_REG_FP) - return 0; - } - } - - if (offset < 0) { - /* - * safely be converted to a positive 'int', since insn->off - * is 's16' - */ - offset = -offset; - /* align down to 8 bytes */ - offset = ALIGN_DOWN(offset, 8); - } - - return offset; -} - static int build_body(struct jit_ctx *ctx, bool extra_pass) { const struct bpf_prog *prog = ctx->prog; @@ -1774,7 +1693,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto out_off; } - ctx.fpb_offset = find_fpb_offset(prog); ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); /* From 5d4fa9ec5643a5c75d3c1e6abf50fb9284caf1ff Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Mon, 26 Aug 2024 15:16:24 +0800 Subject: [PATCH 1028/1052] bpf, arm64: Avoid blindly saving/restoring all callee-saved registers The arm64 jit blindly saves/restores all callee-saved registers, making the jited result looks a bit too compliated. For example, for an empty prog, the jited result is: 0: bti jc 4: mov x9, lr 8: nop c: paciasp 10: stp fp, lr, [sp, #-16]! 14: mov fp, sp 18: stp x19, x20, [sp, #-16]! 1c: stp x21, x22, [sp, #-16]! 20: stp x26, x25, [sp, #-16]! 24: mov x26, #0 28: stp x26, x25, [sp, #-16]! 2c: mov x26, sp 30: stp x27, x28, [sp, #-16]! 
34: mov x25, sp 38: bti j // tailcall target 3c: sub sp, sp, #0 40: mov x7, #0 44: add sp, sp, #0 48: ldp x27, x28, [sp], #16 4c: ldp x26, x25, [sp], #16 50: ldp x26, x25, [sp], #16 54: ldp x21, x22, [sp], #16 58: ldp x19, x20, [sp], #16 5c: ldp fp, lr, [sp], #16 60: mov x0, x7 64: autiasp 68: ret Clearly, there is no need to save/restore unused callee-saved registers. This patch does this change, making the jited image to only save/restore the callee-saved registers it uses. Now the jited result of empty prog is: 0: bti jc 4: mov x9, lr 8: nop c: paciasp 10: stp fp, lr, [sp, #-16]! 14: mov fp, sp 18: stp xzr, x26, [sp, #-16]! 1c: mov x26, sp 20: bti j // tailcall target 24: mov x7, #0 28: ldp xzr, x26, [sp], #16 2c: ldp fp, lr, [sp], #16 30: mov x0, x7 34: autiasp 38: ret Since bpf prog saves/restores its own callee-saved registers as needed, to make tailcall work correctly, the caller needs to restore its saved registers before tailcall, and the callee needs to save its callee-saved registers after tailcall. This extra restoring/saving instructions increases preformance overhead. [1] provides 2 benchmarks for tailcall scenarios. Below is the perf number measured in an arm64 KVM guest. The result indicates that the performance difference before and after the patch in typical tailcall scenarios is negligible. - Before: Performance counter stats for './test_progs -t tailcalls' (5 runs): 4313.43 msec task-clock # 0.874 CPUs utilized ( +- 0.16% ) 574 context-switches # 133.073 /sec ( +- 1.14% ) 0 cpu-migrations # 0.000 /sec 538 page-faults # 124.727 /sec ( +- 0.57% ) 10697772784 cycles # 2.480 GHz ( +- 0.22% ) (61.19%) 25511241955 instructions # 2.38 insn per cycle ( +- 0.08% ) (66.70%) 5108910557 branches # 1.184 G/sec ( +- 0.08% ) (72.38%) 2800459 branch-misses # 0.05% of all branches ( +- 0.51% ) (72.36%) TopDownL1 # 0.60 retiring ( +- 0.09% ) (66.84%) # 0.21 frontend_bound ( +- 0.15% ) (61.31%) # 0.12 bad_speculation ( +- 0.08% ) (50.11%) # 0.07 backend_bound ( +- 0.16% ) (33.30%) 8274201819 L1-dcache-loads # 1.918 G/sec ( +- 0.18% ) (33.15%) 468268 L1-dcache-load-misses # 0.01% of all L1-dcache accesses ( +- 4.69% ) (33.16%) 385383 LLC-loads # 89.345 K/sec ( +- 5.22% ) (33.16%) 38296 LLC-load-misses # 9.94% of all LL-cache accesses ( +- 42.52% ) (38.69%) 6886576501 L1-icache-loads # 1.597 G/sec ( +- 0.35% ) (38.69%) 1848585 L1-icache-load-misses # 0.03% of all L1-icache accesses ( +- 4.52% ) (44.23%) 9043645883 dTLB-loads # 2.097 G/sec ( +- 0.10% ) (44.33%) 416672 dTLB-load-misses # 0.00% of all dTLB cache accesses ( +- 5.15% ) (49.89%) 6925626111 iTLB-loads # 1.606 G/sec ( +- 0.35% ) (55.46%) 66220 iTLB-load-misses # 0.00% of all iTLB cache accesses ( +- 1.88% ) (55.50%) L1-dcache-prefetches L1-dcache-prefetch-misses 4.9372 +- 0.0526 seconds time elapsed ( +- 1.07% ) Performance counter stats for './test_progs -t flow_dissector' (5 runs): 10924.50 msec task-clock # 0.945 CPUs utilized ( +- 0.08% ) 603 context-switches # 55.197 /sec ( +- 1.13% ) 0 cpu-migrations # 0.000 /sec 566 page-faults # 51.810 /sec ( +- 0.42% ) 27381270695 cycles # 2.506 GHz ( +- 0.18% ) (60.46%) 56996583922 instructions # 2.08 insn per cycle ( +- 0.21% ) (66.11%) 10321647567 branches # 944.816 M/sec ( +- 0.17% ) (71.79%) 3347735 branch-misses # 0.03% of all branches ( +- 3.72% ) (72.15%) TopDownL1 # 0.52 retiring ( +- 0.13% ) (66.74%) # 0.27 frontend_bound ( +- 0.14% ) (61.27%) # 0.14 bad_speculation ( +- 0.19% ) (50.36%) # 0.07 backend_bound ( +- 0.42% ) (33.89%) 18740797617 L1-dcache-loads # 1.715 G/sec ( +- 
0.43% ) (33.71%) 13715669 L1-dcache-load-misses # 0.07% of all L1-dcache accesses ( +- 32.85% ) (33.34%) 4087551 LLC-loads # 374.164 K/sec ( +- 29.53% ) (33.26%) 267906 LLC-load-misses # 6.55% of all LL-cache accesses ( +- 23.90% ) (38.76%) 15811864229 L1-icache-loads # 1.447 G/sec ( +- 0.12% ) (38.73%) 2976833 L1-icache-load-misses # 0.02% of all L1-icache accesses ( +- 9.73% ) (44.22%) 20138907471 dTLB-loads # 1.843 G/sec ( +- 0.18% ) (44.15%) 732850 dTLB-load-misses # 0.00% of all dTLB cache accesses ( +- 11.18% ) (49.64%) 15895726702 iTLB-loads # 1.455 G/sec ( +- 0.15% ) (55.13%) 152075 iTLB-load-misses # 0.00% of all iTLB cache accesses ( +- 4.71% ) (54.98%) L1-dcache-prefetches L1-dcache-prefetch-misses 11.5613 +- 0.0317 seconds time elapsed ( +- 0.27% ) - After: Performance counter stats for './test_progs -t tailcalls' (5 runs): 4278.78 msec task-clock # 0.871 CPUs utilized ( +- 0.15% ) 569 context-switches # 132.982 /sec ( +- 0.58% ) 0 cpu-migrations # 0.000 /sec 539 page-faults # 125.970 /sec ( +- 0.43% ) 10588986432 cycles # 2.475 GHz ( +- 0.20% ) (60.91%) 25303825043 instructions # 2.39 insn per cycle ( +- 0.08% ) (66.48%) 5110756256 branches # 1.194 G/sec ( +- 0.07% ) (72.03%) 2719569 branch-misses # 0.05% of all branches ( +- 2.42% ) (72.03%) TopDownL1 # 0.60 retiring ( +- 0.22% ) (66.31%) # 0.22 frontend_bound ( +- 0.21% ) (60.83%) # 0.12 bad_speculation ( +- 0.26% ) (50.25%) # 0.06 backend_bound ( +- 0.17% ) (33.52%) 8163648527 L1-dcache-loads # 1.908 G/sec ( +- 0.33% ) (33.52%) 694979 L1-dcache-load-misses # 0.01% of all L1-dcache accesses ( +- 30.53% ) (33.52%) 1902347 LLC-loads # 444.600 K/sec ( +- 48.84% ) (33.69%) 96677 LLC-load-misses # 5.08% of all LL-cache accesses ( +- 43.48% ) (39.30%) 6863517589 L1-icache-loads # 1.604 G/sec ( +- 0.37% ) (39.17%) 1871519 L1-icache-load-misses # 0.03% of all L1-icache accesses ( +- 6.78% ) (44.56%) 8927782813 dTLB-loads # 2.087 G/sec ( +- 0.14% ) (44.37%) 438237 dTLB-load-misses # 0.00% of all dTLB cache accesses ( +- 6.00% ) (49.75%) 6886906831 iTLB-loads # 1.610 G/sec ( +- 0.36% ) (55.08%) 67568 iTLB-load-misses # 0.00% of all iTLB cache accesses ( +- 3.27% ) (54.86%) L1-dcache-prefetches L1-dcache-prefetch-misses 4.9114 +- 0.0309 seconds time elapsed ( +- 0.63% ) Performance counter stats for './test_progs -t flow_dissector' (5 runs): 10948.40 msec task-clock # 0.942 CPUs utilized ( +- 0.05% ) 615 context-switches # 56.173 /sec ( +- 1.65% ) 1 cpu-migrations # 0.091 /sec ( +- 31.62% ) 567 page-faults # 51.788 /sec ( +- 0.44% ) 27334194328 cycles # 2.497 GHz ( +- 0.08% ) (61.05%) 56656528828 instructions # 2.07 insn per cycle ( +- 0.08% ) (66.67%) 10270389422 branches # 938.072 M/sec ( +- 0.10% ) (72.21%) 3453837 branch-misses # 0.03% of all branches ( +- 3.75% ) (72.27%) TopDownL1 # 0.52 retiring ( +- 0.16% ) (66.55%) # 0.27 frontend_bound ( +- 0.09% ) (60.91%) # 0.14 bad_speculation ( +- 0.08% ) (49.85%) # 0.07 backend_bound ( +- 0.16% ) (33.33%) 18982866028 L1-dcache-loads # 1.734 G/sec ( +- 0.24% ) (33.34%) 8802454 L1-dcache-load-misses # 0.05% of all L1-dcache accesses ( +- 52.30% ) (33.31%) 2612962 LLC-loads # 238.661 K/sec ( +- 29.78% ) (33.45%) 264107 LLC-load-misses # 10.11% of all LL-cache accesses ( +- 18.34% ) (39.07%) 15793205997 L1-icache-loads # 1.443 G/sec ( +- 0.15% ) (39.09%) 3930802 L1-icache-load-misses # 0.02% of all L1-icache accesses ( +- 3.72% ) (44.66%) 20097828496 dTLB-loads # 1.836 G/sec ( +- 0.09% ) (44.68%) 961757 dTLB-load-misses # 0.00% of all dTLB cache accesses ( +- 3.32% ) (50.15%) 15838728506 
iTLB-loads # 1.447 G/sec ( +- 0.09% ) (55.62%) 167652 iTLB-load-misses # 0.00% of all iTLB cache accesses ( +- 1.28% ) (55.52%) L1-dcache-prefetches L1-dcache-prefetch-misses 11.6173 +- 0.0268 seconds time elapsed ( +- 0.23% ) [1] https://lore.kernel.org/bpf/20200724123644.5096-1-maciej.fijalkowski@intel.com/ Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240826071624.350108-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 294 +++++++++++++++++++++------------- 1 file changed, 183 insertions(+), 111 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 5c9039cf261d..8aa32cb140b9 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -76,10 +76,14 @@ struct jit_ctx { int epilogue_offset; int *offset; int exentry_idx; + int nr_used_callee_reg; + u8 used_callee_reg[8]; /* r6~r9, fp, arena_vm_start */ __le32 *image; __le32 *ro_image; u32 stack_size; u64 user_vm_start; + u64 arena_vm_start; + bool fp_used; }; struct bpf_plt { @@ -270,41 +274,141 @@ static bool is_lsi_offset(int offset, int scale) return true; } -/* generated prologue: +/* generated main prog prologue: * bti c // if CONFIG_ARM64_BTI_KERNEL * mov x9, lr * nop // POKE_OFFSET * paciasp // if CONFIG_ARM64_PTR_AUTH_KERNEL * stp x29, lr, [sp, #-16]! * mov x29, sp - * stp x19, x20, [sp, #-16]! - * stp x21, x22, [sp, #-16]! - * stp x26, x25, [sp, #-16]! - * stp x26, x25, [sp, #-16]! - * stp x27, x28, [sp, #-16]! - * mov x25, sp - * mov tcc, #0 + * stp xzr, x26, [sp, #-16]! + * mov x26, sp * // PROLOGUE_OFFSET + * // save callee-saved registers */ - static void prepare_bpf_tail_call_cnt(struct jit_ctx *ctx) { - const struct bpf_prog *prog = ctx->prog; - const bool is_main_prog = !bpf_is_subprog(prog); + const bool is_main_prog = !bpf_is_subprog(ctx->prog); const u8 ptr = bpf2a64[TCCNT_PTR]; - const u8 fp = bpf2a64[BPF_REG_FP]; - const u8 tcc = ptr; - emit(A64_PUSH(ptr, fp, A64_SP), ctx); if (is_main_prog) { /* Initialize tail_call_cnt. 
*/ - emit(A64_MOVZ(1, tcc, 0, 0), ctx); - emit(A64_PUSH(tcc, fp, A64_SP), ctx); + emit(A64_PUSH(A64_ZR, ptr, A64_SP), ctx); emit(A64_MOV(1, ptr, A64_SP), ctx); + } else + emit(A64_PUSH(ptr, ptr, A64_SP), ctx); +} + +static void find_used_callee_regs(struct jit_ctx *ctx) +{ + int i; + const struct bpf_prog *prog = ctx->prog; + const struct bpf_insn *insn = &prog->insnsi[0]; + int reg_used = 0; + + for (i = 0; i < prog->len; i++, insn++) { + if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6) + reg_used |= 1; + + if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7) + reg_used |= 2; + + if (insn->dst_reg == BPF_REG_8 || insn->src_reg == BPF_REG_8) + reg_used |= 4; + + if (insn->dst_reg == BPF_REG_9 || insn->src_reg == BPF_REG_9) + reg_used |= 8; + + if (insn->dst_reg == BPF_REG_FP || insn->src_reg == BPF_REG_FP) { + ctx->fp_used = true; + reg_used |= 16; + } + } + + i = 0; + if (reg_used & 1) + ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_6]; + + if (reg_used & 2) + ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_7]; + + if (reg_used & 4) + ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_8]; + + if (reg_used & 8) + ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_9]; + + if (reg_used & 16) + ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_FP]; + + if (ctx->arena_vm_start) + ctx->used_callee_reg[i++] = bpf2a64[ARENA_VM_START]; + + ctx->nr_used_callee_reg = i; +} + +/* Save callee-saved registers */ +static void push_callee_regs(struct jit_ctx *ctx) +{ + int reg1, reg2, i; + + /* + * Program acting as exception boundary should save all ARM64 + * Callee-saved registers as the exception callback needs to recover + * all ARM64 Callee-saved registers in its epilogue. + */ + if (ctx->prog->aux->exception_boundary) { + emit(A64_PUSH(A64_R(19), A64_R(20), A64_SP), ctx); + emit(A64_PUSH(A64_R(21), A64_R(22), A64_SP), ctx); + emit(A64_PUSH(A64_R(23), A64_R(24), A64_SP), ctx); + emit(A64_PUSH(A64_R(25), A64_R(26), A64_SP), ctx); + emit(A64_PUSH(A64_R(27), A64_R(28), A64_SP), ctx); } else { - emit(A64_PUSH(ptr, fp, A64_SP), ctx); - emit(A64_NOP, ctx); - emit(A64_NOP, ctx); + find_used_callee_regs(ctx); + for (i = 0; i + 1 < ctx->nr_used_callee_reg; i += 2) { + reg1 = ctx->used_callee_reg[i]; + reg2 = ctx->used_callee_reg[i + 1]; + emit(A64_PUSH(reg1, reg2, A64_SP), ctx); + } + if (i < ctx->nr_used_callee_reg) { + reg1 = ctx->used_callee_reg[i]; + /* keep SP 16-byte aligned */ + emit(A64_PUSH(reg1, A64_ZR, A64_SP), ctx); + } + } +} + +/* Restore callee-saved registers */ +static void pop_callee_regs(struct jit_ctx *ctx) +{ + struct bpf_prog_aux *aux = ctx->prog->aux; + int reg1, reg2, i; + + /* + * Program acting as exception boundary pushes R23 and R24 in addition + * to BPF callee-saved registers. Exception callback uses the boundary + * program's stack frame, so recover these extra registers in the above + * two cases. 
+ */ + if (aux->exception_boundary || aux->exception_cb) { + emit(A64_POP(A64_R(27), A64_R(28), A64_SP), ctx); + emit(A64_POP(A64_R(25), A64_R(26), A64_SP), ctx); + emit(A64_POP(A64_R(23), A64_R(24), A64_SP), ctx); + emit(A64_POP(A64_R(21), A64_R(22), A64_SP), ctx); + emit(A64_POP(A64_R(19), A64_R(20), A64_SP), ctx); + } else { + i = ctx->nr_used_callee_reg - 1; + if (ctx->nr_used_callee_reg % 2 != 0) { + reg1 = ctx->used_callee_reg[i]; + emit(A64_POP(reg1, A64_ZR, A64_SP), ctx); + i--; + } + while (i > 0) { + reg1 = ctx->used_callee_reg[i - 1]; + reg2 = ctx->used_callee_reg[i]; + emit(A64_POP(reg1, reg2, A64_SP), ctx); + i -= 2; + } } } @@ -315,17 +419,12 @@ static void prepare_bpf_tail_call_cnt(struct jit_ctx *ctx) #define POKE_OFFSET (BTI_INSNS + 1) /* Tail call offset to jump into */ -#define PROLOGUE_OFFSET (BTI_INSNS + 2 + PAC_INSNS + 10) +#define PROLOGUE_OFFSET (BTI_INSNS + 2 + PAC_INSNS + 4) -static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, - bool is_exception_cb, u64 arena_vm_start) +static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) { const struct bpf_prog *prog = ctx->prog; const bool is_main_prog = !bpf_is_subprog(prog); - const u8 r6 = bpf2a64[BPF_REG_6]; - const u8 r7 = bpf2a64[BPF_REG_7]; - const u8 r8 = bpf2a64[BPF_REG_8]; - const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const int idx0 = ctx->idx; @@ -365,19 +464,28 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, emit(A64_MOV(1, A64_R(9), A64_LR), ctx); emit(A64_NOP, ctx); - if (!is_exception_cb) { + if (!prog->aux->exception_cb) { /* Sign lr */ if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) emit(A64_PACIASP, ctx); + /* Save FP and LR registers to stay align with ARM64 AAPCS */ emit(A64_PUSH(A64_FP, A64_LR, A64_SP), ctx); emit(A64_MOV(1, A64_FP, A64_SP), ctx); - /* Save callee-saved registers */ - emit(A64_PUSH(r6, r7, A64_SP), ctx); - emit(A64_PUSH(r8, r9, A64_SP), ctx); prepare_bpf_tail_call_cnt(ctx); - emit(A64_PUSH(A64_R(27), A64_R(28), A64_SP), ctx); + + if (!ebpf_from_cbpf && is_main_prog) { + cur_offset = ctx->idx - idx0; + if (cur_offset != PROLOGUE_OFFSET) { + pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n", + cur_offset, PROLOGUE_OFFSET); + return -1; + } + /* BTI landing pad for the tail call, done with a BR */ + emit_bti(A64_BTI_J, ctx); + } + push_callee_regs(ctx); } else { /* * Exception callback receives FP of Main Program as third @@ -394,48 +502,23 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, emit(A64_SUB_I(1, A64_SP, A64_FP, 96), ctx); } - /* Set up BPF prog stack base register */ - emit(A64_MOV(1, fp, A64_SP), ctx); - - if (!ebpf_from_cbpf && is_main_prog) { - cur_offset = ctx->idx - idx0; - if (cur_offset != PROLOGUE_OFFSET) { - pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n", - cur_offset, PROLOGUE_OFFSET); - return -1; - } - - /* BTI landing pad for the tail call, done with a BR */ - emit_bti(A64_BTI_J, ctx); - } - - /* - * Program acting as exception boundary should save all ARM64 - * Callee-saved registers as the exception callback needs to recover - * all ARM64 Callee-saved registers in its epilogue. 
- */ - if (prog->aux->exception_boundary) { - /* - * As we are pushing two more registers, BPF_FP should be moved - * 16 bytes - */ - emit(A64_SUB_I(1, fp, fp, 16), ctx); - emit(A64_PUSH(A64_R(23), A64_R(24), A64_SP), ctx); - } + if (ctx->fp_used) + /* Set up BPF prog stack base register */ + emit(A64_MOV(1, fp, A64_SP), ctx); /* Stack must be multiples of 16B */ ctx->stack_size = round_up(prog->aux->stack_depth, 16); /* Set up function call stack */ - emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + if (ctx->stack_size) + emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); - if (arena_vm_start) - emit_a64_mov_i64(arena_vm_base, arena_vm_start, ctx); + if (ctx->arena_vm_start) + emit_a64_mov_i64(arena_vm_base, ctx->arena_vm_start, ctx); return 0; } -static int out_offset = -1; /* initialized on the first pass of build_body() */ static int emit_bpf_tail_call(struct jit_ctx *ctx) { /* bpf_tail_call(void *prog_ctx, struct bpf_array *array, u64 index) */ @@ -446,10 +529,10 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) const u8 prg = bpf2a64[TMP_REG_2]; const u8 tcc = bpf2a64[TMP_REG_3]; const u8 ptr = bpf2a64[TCCNT_PTR]; - const int idx0 = ctx->idx; -#define cur_offset (ctx->idx - idx0) -#define jmp_offset (out_offset - (cur_offset)) size_t off; + __le32 *branch1 = NULL; + __le32 *branch2 = NULL; + __le32 *branch3 = NULL; /* if (index >= array->map.max_entries) * goto out; @@ -459,17 +542,20 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit(A64_LDR32(tmp, r2, tmp), ctx); emit(A64_MOV(0, r3, r3), ctx); emit(A64_CMP(0, r3, tmp), ctx); - emit(A64_B_(A64_COND_CS, jmp_offset), ctx); + branch1 = ctx->image + ctx->idx; + emit(A64_NOP, ctx); /* * if ((*tail_call_cnt_ptr) >= MAX_TAIL_CALL_CNT) * goto out; - * (*tail_call_cnt_ptr)++; */ emit_a64_mov_i64(tmp, MAX_TAIL_CALL_CNT, ctx); emit(A64_LDR64I(tcc, ptr, 0), ctx); emit(A64_CMP(1, tcc, tmp), ctx); - emit(A64_B_(A64_COND_CS, jmp_offset), ctx); + branch2 = ctx->image + ctx->idx; + emit(A64_NOP, ctx); + + /* (*tail_call_cnt_ptr)++; */ emit(A64_ADD_I(1, tcc, tcc, 1), ctx); /* prog = array->ptrs[index]; @@ -481,30 +567,37 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit(A64_ADD(1, tmp, r2, tmp), ctx); emit(A64_LSL(1, prg, r3, 3), ctx); emit(A64_LDR64(prg, tmp, prg), ctx); - emit(A64_CBZ(1, prg, jmp_offset), ctx); + branch3 = ctx->image + ctx->idx; + emit(A64_NOP, ctx); /* Update tail_call_cnt if the slot is populated. 
*/ emit(A64_STR64I(tcc, ptr, 0), ctx); + /* restore SP */ + if (ctx->stack_size) + emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + + pop_callee_regs(ctx); + /* goto *(prog->bpf_func + prologue_offset); */ off = offsetof(struct bpf_prog, bpf_func); emit_a64_mov_i64(tmp, off, ctx); emit(A64_LDR64(tmp, prg, tmp), ctx); emit(A64_ADD_I(1, tmp, tmp, sizeof(u32) * PROLOGUE_OFFSET), ctx); - emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); emit(A64_BR(tmp), ctx); - /* out: */ - if (out_offset == -1) - out_offset = cur_offset; - if (cur_offset != out_offset) { - pr_err_once("tail_call out_offset = %d, expected %d!\n", - cur_offset, out_offset); - return -1; + if (ctx->image) { + off = &ctx->image[ctx->idx] - branch1; + *branch1 = cpu_to_le32(A64_B_(A64_COND_CS, off)); + + off = &ctx->image[ctx->idx] - branch2; + *branch2 = cpu_to_le32(A64_B_(A64_COND_CS, off)); + + off = &ctx->image[ctx->idx] - branch3; + *branch3 = cpu_to_le32(A64_CBZ(1, prg, off)); } + return 0; -#undef cur_offset -#undef jmp_offset } #ifdef CONFIG_ARM64_LSE_ATOMICS @@ -730,37 +823,18 @@ static void build_plt(struct jit_ctx *ctx) plt->target = (u64)&dummy_tramp; } -static void build_epilogue(struct jit_ctx *ctx, bool is_exception_cb) +static void build_epilogue(struct jit_ctx *ctx) { const u8 r0 = bpf2a64[BPF_REG_0]; - const u8 r6 = bpf2a64[BPF_REG_6]; - const u8 r7 = bpf2a64[BPF_REG_7]; - const u8 r8 = bpf2a64[BPF_REG_8]; - const u8 r9 = bpf2a64[BPF_REG_9]; - const u8 fp = bpf2a64[BPF_REG_FP]; const u8 ptr = bpf2a64[TCCNT_PTR]; /* We're done with BPF stack */ - emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + if (ctx->stack_size) + emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); - /* - * Program acting as exception boundary pushes R23 and R24 in addition - * to BPF callee-saved registers. Exception callback uses the boundary - * program's stack frame, so recover these extra registers in the above - * two cases. - */ - if (ctx->prog->aux->exception_boundary || is_exception_cb) - emit(A64_POP(A64_R(23), A64_R(24), A64_SP), ctx); + pop_callee_regs(ctx); - /* Restore x27 and x28 */ - emit(A64_POP(A64_R(27), A64_R(28), A64_SP), ctx); - /* Restore fs (x25) and x26 */ - emit(A64_POP(ptr, fp, A64_SP), ctx); - emit(A64_POP(ptr, fp, A64_SP), ctx); - - /* Restore callee-saved register */ - emit(A64_POP(r8, r9, A64_SP), ctx); - emit(A64_POP(r6, r7, A64_SP), ctx); + emit(A64_POP(A64_ZR, ptr, A64_SP), ctx); /* Restore FP/LR registers */ emit(A64_POP(A64_FP, A64_LR, A64_SP), ctx); @@ -1645,7 +1719,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; - u64 arena_vm_start; u8 *image_ptr; u8 *ro_image_ptr; @@ -1663,7 +1736,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = tmp; } - arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); @@ -1694,6 +1766,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); + ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); /* * 1. Initial fake pass to compute ctx->idx and ctx->offset. @@ -1701,8 +1774,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * BPF line info needs ctx->offset[i] to be the offset of * instruction[i] in jited image, so build prologue first. 
*/ - if (build_prologue(&ctx, was_classic, prog->aux->exception_cb, - arena_vm_start)) { + if (build_prologue(&ctx, was_classic)) { prog = orig_prog; goto out_off; } @@ -1713,7 +1785,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.epilogue_offset = ctx.idx; - build_epilogue(&ctx, prog->aux->exception_cb); + build_epilogue(&ctx); build_plt(&ctx); extable_align = __alignof__(struct exception_table_entry); @@ -1750,14 +1822,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx.idx = 0; ctx.exentry_idx = 0; - build_prologue(&ctx, was_classic, prog->aux->exception_cb, arena_vm_start); + build_prologue(&ctx, was_classic); if (build_body(&ctx, extra_pass)) { prog = orig_prog; goto out_free_hdr; } - build_epilogue(&ctx, prog->aux->exception_cb); + build_epilogue(&ctx); build_plt(&ctx); /* 3. Extra pass to validate JITed code. */ From c264487e5410e5a72db8a414566ab7d144223e6c Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Tue, 20 Aug 2024 10:36:22 +0800 Subject: [PATCH 1029/1052] selftests/bpf: Fix incorrect parameters in NULL pointer checking Smatch reported the following warning: ./tools/testing/selftests/bpf/testing_helpers.c:455 get_xlated_program() warn: variable dereferenced before check 'buf' (see line 454) It seems correct,so let's modify it based on it's suggestion. Actually,commit b23ed4d74c4d ("selftests/bpf: Fix invalid pointer check in get_xlated_program()") fixed an issue in the test_verifier.c once,but it was reverted this time. Let's solve this issue with the minimal changes possible. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/1eb3732f-605a-479d-ba64-cd14250cbf91@stanley.mountain/ Fixes: b4b7a4099b8c ("selftests/bpf: Factor out get_xlated_program() helper") Signed-off-by: Hao Ge Link: https://lore.kernel.org/r/20240820023622.29190-1-hao.ge@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/testing_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index c217e12bd9da..d3c3c3a24150 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -452,7 +452,7 @@ int get_xlated_program(int fd_prog, struct bpf_insn **buf, __u32 *cnt) *cnt = xlated_prog_len / buf_element_size; *buf = calloc(*cnt, buf_element_size); - if (!buf) { + if (!*buf) { perror("can't allocate xlated program buffer"); return -ENOMEM; } From f633919d132cf1755d62278c989124c0ce23950f Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Wed, 28 Aug 2024 20:48:11 +0100 Subject: [PATCH 1030/1052] bpf: Relax KF_ACQUIRE kfuncs strict type matching constraint Currently we cannot pass zero offset (implicit cast) or non-zero offset pointers to KF_ACQUIRE kfuncs. This is because KF_ACQUIRE kfuncs requires strict type matching, but zero offset or non-zero offset does not change the type of pointer, which causes the ebpf program to be rejected by the verifier. This can cause some problems, one example is that bpf_skb_peek_tail kfunc [0] cannot be implemented by just passing in non-zero offset pointers. We cannot pass pointers like &sk->sk_write_queue (non-zero offset) or &sk->__sk_common (zero offset) to KF_ACQUIRE kfuncs. This patch makes KF_ACQUIRE kfuncs not require strict type matching. 
[0]: https://lore.kernel.org/bpf/AM6PR03MB5848CA39CB4B7A4397D380B099B12@AM6PR03MB5848.eurprd03.prod.outlook.com/ Signed-off-by: Juntong Deng Link: https://lore.kernel.org/r/AM6PR03MB5848FD2BD89BF0B6B5AA3B4C99952@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 33270b363689..f32e3b9bb4e5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11513,8 +11513,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, * btf_struct_ids_match() to walk the struct at the 0th offset, and * resolve types. */ - if (is_kfunc_acquire(meta) || - (is_kfunc_release(meta) && reg->ref_obj_id) || + if ((is_kfunc_release(meta) && reg->ref_obj_id) || btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; From 6db59c4935c9354f4c33fc63287d110317ca4d70 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Wed, 28 Aug 2024 20:51:32 +0100 Subject: [PATCH 1031/1052] selftests/bpf: Add test for zero offset or non-zero offset pointers as KF_ACQUIRE kfuncs argument This patch adds test cases for zero offset (implicit cast) or non-zero offset pointer as KF_ACQUIRE kfuncs argument. Currently KF_ACQUIRE kfuncs should support passing in pointers like &sk->sk_write_queue (non-zero offset) or &sk->__sk_common (zero offset) and not be rejected by the verifier. Signed-off-by: Juntong Deng Link: https://lore.kernel.org/r/AM6PR03MB5848CB6F0D4D9068669A905B99952@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 17 ++++++++++ .../bpf/bpf_testmod/bpf_testmod_kfunc.h | 4 +++ .../selftests/bpf/prog_tests/nested_trust.c | 4 +++ .../selftests/bpf/progs/nested_acquire.c | 33 +++++++++++++++++++ 4 files changed, 58 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/nested_acquire.c diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index c04b7dec2ab9..8a71a91b752d 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -183,6 +183,20 @@ __bpf_kfunc void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, { } +__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) +{ + return NULL; +} + +__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) +{ + return NULL; +} + +__bpf_kfunc void bpf_kfunc_nested_release_test(struct sk_buff *ptr) +{ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -541,6 +555,9 @@ BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_value) BTF_ID_FLAGS(func, bpf_kfunc_common_test) BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index e587a79f2239..c6c314965bb1 100644 --- 
a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -144,4 +144,8 @@ void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, struct bpf_dynptr *ptr__nulla struct bpf_testmod_ctx *bpf_testmod_ctx_create(int *err) __ksym; void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) __ksym; +struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) __ksym; +struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) __ksym; +void bpf_kfunc_nested_release_test(struct sk_buff *ptr) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/nested_trust.c b/tools/testing/selftests/bpf/prog_tests/nested_trust.c index 39886f58924e..54a112ad5f9c 100644 --- a/tools/testing/selftests/bpf/prog_tests/nested_trust.c +++ b/tools/testing/selftests/bpf/prog_tests/nested_trust.c @@ -4,9 +4,13 @@ #include #include "nested_trust_failure.skel.h" #include "nested_trust_success.skel.h" +#include "nested_acquire.skel.h" void test_nested_trust(void) { RUN_TESTS(nested_trust_success); RUN_TESTS(nested_trust_failure); + + if (env.has_testmod) + RUN_TESTS(nested_acquire); } diff --git a/tools/testing/selftests/bpf/progs/nested_acquire.c b/tools/testing/selftests/bpf/progs/nested_acquire.c new file mode 100644 index 000000000000..8e521a21d995 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/nested_acquire.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +SEC("tp_btf/tcp_probe") +__success +int BPF_PROG(test_nested_acquire_nonzero, struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *ptr; + + ptr = bpf_kfunc_nested_acquire_nonzero_offset_test(&sk->sk_write_queue); + + bpf_kfunc_nested_release_test(ptr); + return 0; +} + +SEC("tp_btf/tcp_probe") +__success +int BPF_PROG(test_nested_acquire_zero, struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *ptr; + + ptr = bpf_kfunc_nested_acquire_zero_offset_test(&sk->__sk_common); + + bpf_kfunc_nested_release_test(ptr); + return 0; +} From c634d6f4e12d00c954410ba11db45799a8c77b5b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 27 Aug 2024 13:37:21 -0700 Subject: [PATCH 1032/1052] libbpf: Fix bpf_object__open_skeleton()'s mishandling of options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do an ugly copying of options in bpf_object__open_skeleton() just to be able to set object name from skeleton's recorded name (while still allowing user to override it through opts->object_name). This is not just ugly, but it also is broken due to memcpy() that doesn't take into account potential skel_opts' and user-provided opts' sizes differences due to backward and forward compatibility. This leads to copying over extra bytes and then failing to validate options properly. It could, technically, lead also to SIGSEGV, if we are unlucky. So just get rid of that memory copy completely and instead pass default object name into bpf_object_open() directly, simplifying all this significantly. The rule now is that obj_name should be non-NULL for bpf_object_open() when called with in-memory buffer, so validate that explicitly as well. We adopt bpf_object__open_mem() to this as well and generate default name (based on buffer memory address and size) outside of bpf_object_open(). 
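A user-space usage sketch of the fixed path (the skeleton name "example" is hypothetical): the caller's opts now reach bpf_object_open() untouched, so a binary built against a libbpf version with a differently sized bpf_object_open_opts no longer has extra bytes copied out of its options struct:

#include <stdio.h>
#include <bpf/libbpf.h>
#include "example.skel.h"	/* hypothetical bpftool-generated skeleton */

int main(void)
{
	LIBBPF_OPTS(bpf_object_open_opts, opts,
		    .kernel_log_level = 1);	/* object_name left unset: it
						 * defaults to the skeleton's
						 * recorded name */
	struct example *skel;

	skel = example__open_opts(&opts);
	if (!skel) {
		fprintf(stderr, "failed to open skeleton\n");
		return 1;
	}

	example__destroy(skel);
	return 0;
}
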
Fixes: d66562fba1ce ("libbpf: Add BPF object skeleton support") Reported-by: Daniel Müller Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Daniel Müller Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240827203721.1145494-1-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 52 +++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e55353887439..d3a542649e6b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7905,16 +7905,19 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object } static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz, + const char *obj_name, const struct bpf_object_open_opts *opts) { - const char *obj_name, *kconfig, *btf_tmp_path, *token_path; + const char *kconfig, *btf_tmp_path, *token_path; struct bpf_object *obj; - char tmp_name[64]; int err; char *log_buf; size_t log_size; __u32 log_level; + if (obj_buf && !obj_name) + return ERR_PTR(-EINVAL); + if (elf_version(EV_CURRENT) == EV_NONE) { pr_warn("failed to init libelf for %s\n", path ? : "(mem buf)"); @@ -7924,16 +7927,12 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, if (!OPTS_VALID(opts, bpf_object_open_opts)) return ERR_PTR(-EINVAL); - obj_name = OPTS_GET(opts, object_name, NULL); + obj_name = OPTS_GET(opts, object_name, NULL) ?: obj_name; if (obj_buf) { - if (!obj_name) { - snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx", - (unsigned long)obj_buf, - (unsigned long)obj_buf_sz); - obj_name = tmp_name; - } path = obj_name; pr_debug("loading object '%s' from buffer\n", obj_name); + } else { + pr_debug("loading object from %s\n", path); } log_buf = OPTS_GET(opts, kernel_log_buf, NULL); @@ -8017,9 +8016,7 @@ bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) if (!path) return libbpf_err_ptr(-EINVAL); - pr_debug("loading %s\n", path); - - return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); + return libbpf_ptr(bpf_object_open(path, NULL, 0, NULL, opts)); } struct bpf_object *bpf_object__open(const char *path) @@ -8031,10 +8028,15 @@ struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) { + char tmp_name[64]; + if (!obj_buf || obj_buf_sz == 0) return libbpf_err_ptr(-EINVAL); - return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); + /* create a (quite useless) default "name" for this memory buffer object */ + snprintf(tmp_name, sizeof(tmp_name), "%lx-%zx", (unsigned long)obj_buf, obj_buf_sz); + + return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, tmp_name, opts)); } static int bpf_object_unload(struct bpf_object *obj) @@ -13761,29 +13763,13 @@ static int populate_skeleton_progs(const struct bpf_object *obj, int bpf_object__open_skeleton(struct bpf_object_skeleton *s, const struct bpf_object_open_opts *opts) { - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, skel_opts, - .object_name = s->name, - ); struct bpf_object *obj; int err; - /* Attempt to preserve opts->object_name, unless overriden by user - * explicitly. Overwriting object name for skeletons is discouraged, - * as it breaks global data maps, because they contain object name - * prefix as their own map name prefix. When skeleton is generated, - * bpftool is making an assumption that this name will stay the same. 
- */ - if (opts) { - memcpy(&skel_opts, opts, sizeof(*opts)); - if (!opts->object_name) - skel_opts.object_name = s->name; - } - - obj = bpf_object__open_mem(s->data, s->data_sz, &skel_opts); - err = libbpf_get_error(obj); - if (err) { - pr_warn("failed to initialize skeleton BPF object '%s': %d\n", - s->name, err); + obj = bpf_object_open(NULL, s->data, s->data_sz, s->name, opts); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + pr_warn("failed to initialize skeleton BPF object '%s': %d\n", s->name, err); return libbpf_err(err); } From bd0b4836a2333d5c725397d458c8edfa66f1d9bb Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Tue, 27 Aug 2024 01:13:01 +0000 Subject: [PATCH 1033/1052] selftests/bpf: Make sure stashed kptr in local kptr is freed recursively When dropping a local kptr, any kptr stashed into it is supposed to be freed through bpf_obj_free_fields->__bpf_obj_drop_impl recursively. Add a test to make sure it happens. The test first stashes a referenced kptr to "struct task" into a local kptr and gets the reference count of the task. Then, it drops the local kptr and reads the reference count of the task again. Since bpf_obj_free_fields and __bpf_obj_drop_impl will go through the local kptr recursively during bpf_obj_drop, the dtor of the stashed task kptr should eventually be called. The second reference count should be one less than the first one. Signed-off-by: Amery Hung Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240827011301.608620-1-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/task_kfunc_success.c | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 3138bb689b0b..a55149015063 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -143,8 +143,9 @@ int BPF_PROG(test_task_acquire_leave_in_map, struct task_struct *task, u64 clone SEC("tp_btf/task_newtask") int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) { - struct task_struct *kptr; + struct task_struct *kptr, *acquired; struct __tasks_kfunc_map_value *v, *local; + int refcnt, refcnt_after_drop; long status; if (!is_test_kfunc_task()) @@ -190,7 +191,34 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) return 0; } + /* Stash a copy into local kptr and check if it is released recursively */ + acquired = bpf_task_acquire(kptr); + if (!acquired) { + err = 7; + bpf_obj_drop(local); + bpf_task_release(kptr); + return 0; + } + bpf_probe_read_kernel(&refcnt, sizeof(refcnt), &acquired->rcu_users); + + acquired = bpf_kptr_xchg(&local->task, acquired); + if (acquired) { + err = 8; + bpf_obj_drop(local); + bpf_task_release(kptr); + bpf_task_release(acquired); + return 0; + } + bpf_obj_drop(local); + + bpf_probe_read_kernel(&refcnt_after_drop, sizeof(refcnt_after_drop), &kptr->rcu_users); + if (refcnt != refcnt_after_drop + 1) { + err = 9; + bpf_task_release(kptr); + return 0; + } + bpf_task_release(kptr); return 0; From 89dd9bb25597621ca0500aa598140ff0858091e2 Mon Sep 17 00:00:00 2001 From: Yiming Xiang Date: Wed, 28 Aug 2024 23:17:12 -0400 Subject: [PATCH 1034/1052] docs/bpf: Fix a typo in verifier.rst In verifier.rst, there is a typo in section 'Register parentage chains'. Caller saved registers are r0-r5, callee saved registers are r6-r9. 
Here by context it means callee saved registers rather than caller saved registers. This may confuse users. Signed-off-by: Yiming Xiang Link: https://lore.kernel.org/r/20240829031712.198489-1-kxiang@umich.edu Signed-off-by: Alexei Starovoitov --- Documentation/bpf/verifier.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/bpf/verifier.rst b/Documentation/bpf/verifier.rst index 356894399fbf..d23761540002 100644 --- a/Documentation/bpf/verifier.rst +++ b/Documentation/bpf/verifier.rst @@ -418,7 +418,7 @@ The rules for correspondence between registers / stack slots are as follows: linked to the registers and stack slots of the parent state with the same indices. -* For the outer stack frames, only caller saved registers (r6-r9) and stack +* For the outer stack frames, only callee saved registers (r6-r9) and stack slots are linked to the registers and stack slots of the parent state with the same indices. From c6d9dafb595564ae44ef2e25e3ace2973aa867f3 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 28 Aug 2024 14:21:28 +0800 Subject: [PATCH 1035/1052] bpf: Use kvmemdup to simplify the code Use kvmemdup instead of kvmalloc() + memcpy() to simplify the code. No functional change intended. Acked-by: Yonghong Song Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240828062128.1223417-1-lihongbo22@huawei.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index edad152cee8e..1e29281653c6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6272,12 +6272,11 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, btf->kernel_btf = true; snprintf(btf->name, sizeof(btf->name), "%s", module_name); - btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN); + btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); if (!btf->data) { err = -ENOMEM; goto errout; } - memcpy(btf->data, data, data_size); btf->data_size = data_size; err = btf_parse_hdr(env); From 6f606ffd6dd7583d8194ee3d858ba4da2eff26a3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:23 -0700 Subject: [PATCH 1036/1052] bpf: Move insn_buf[16] to bpf_verifier_env This patch moves the 'struct bpf_insn insn_buf[16]' stack usage to the bpf_verifier_env. A '#define INSN_BUF_SIZE 16' is also added to replace the ARRAY_SIZE(insn_buf) usages. Both convert_ctx_accesses() and do_misc_fixup() are changed to use the env->insn_buf. It is a refactoring work for adding the epilogue_buf[16] in a later patch. With this patch, the stack size usage decreased. Before: ./kernel/bpf/verifier.c:22133:5: warning: stack frame size (2584) After: ./kernel/bpf/verifier.c:22184:5: warning: stack frame size (2264) Reviewed-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +++ kernel/bpf/verifier.c | 15 ++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 279b4a640644..0ad2d189c546 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -23,6 +23,8 @@ * (in the "-8,-16,...,-512" form) */ #define TMP_STR_BUF_LEN 320 +/* Patch buffer size */ +#define INSN_BUF_SIZE 16 /* Liveness marks, used for registers and spilled-regs (in stack slots). 
* Read marks propagate upwards until they find a write mark; they record that @@ -780,6 +782,7 @@ struct bpf_verifier_env { * e.g., in reg_type_str() to generate reg_type string */ char tmp_str_buf[TMP_STR_BUF_LEN]; + struct bpf_insn insn_buf[INSN_BUF_SIZE]; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f32e3b9bb4e5..261849384ea8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19677,7 +19677,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16], *insn; + struct bpf_insn *insn_buf = env->insn_buf; + struct bpf_insn *insn; u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; @@ -19690,7 +19691,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); - if (cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { @@ -19837,7 +19838,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) target_size = 0; cnt = convert_ctx_access(type, insn, insn_buf, env->prog, &target_size); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || + if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -19846,7 +19847,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; - if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) { + if (shift && cnt + 1 >= INSN_BUF_SIZE) { verbose(env, "bpf verifier narrow ctx load misconfigured\n"); return -EINVAL; } @@ -20391,7 +20392,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) const int insn_cnt = prog->len; const struct bpf_map_ops *ops; struct bpf_insn_aux_data *aux; - struct bpf_insn insn_buf[16]; + struct bpf_insn *insn_buf = env->insn_buf; struct bpf_prog *new_prog; struct bpf_map *map_ptr; int i, ret, cnt, delta = 0, cur_subprog = 0; @@ -20510,7 +20511,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) (BPF_MODE(insn->code) == BPF_ABS || BPF_MODE(insn->code) == BPF_IND)) { cnt = env->ops->gen_ld_abs(insn, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == 0 || cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -20803,7 +20804,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) cnt = ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == -EOPNOTSUPP) goto patch_map_ops_generic; - if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } From d5c47719f24438838e60bcbf97008179d6f737a8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:24 -0700 Subject: [PATCH 1037/1052] bpf: Adjust BPF_JMP that jumps to the 1st insn of the prologue The next patch will add a ctx ptr saving instruction "(r1 = *(u64 *)(r10 -8)" at the beginning for the main prog when there is an epilogue patch (by the .gen_epilogue() verifier ops added in the next patch). There is one corner case if the bpf prog has a BPF_JMP that jumps to the 1st instruction. 
It needs an adjustment such that those BPF_JMP instructions won't jump to the newly added ctx saving instruction. The commit 5337ac4c9b80 ("bpf: Fix the corner case with may_goto and jump to the 1st insn.") has the details on this case. Note that the jump back to 1st instruction is not limited to the ctx ptr saving instruction. The same also applies to the prologue. A later test, pro_epilogue_goto_start.c, has a test for the prologue only case. Thus, this patch does one adjustment after gen_prologue and the future ctx ptr saving. It is done by adjust_jmp_off(env->prog, 0, delta) where delta has the total number of instructions in the prologue and the future ctx ptr saving instruction. The adjust_jmp_off(env->prog, 0, delta) assumes that the prologue does not have a goto 1st instruction itself. To accommodate the prologue might have a goto 1st insn itself, this patch changes the adjust_jmp_off() to skip considering the instructions between [tgt_idx, tgt_idx + delta). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 261849384ea8..03e974129c05 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19286,6 +19286,9 @@ static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) for (i = 0; i < insn_cnt; i++, insn++) { u8 code = insn->code; + if (tgt_idx <= i && i < tgt_idx + delta) + continue; + if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) continue; @@ -19704,6 +19707,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } + if (delta) + WARN_ON(adjust_jmp_off(env->prog, 0, delta)); + if (bpf_prog_is_offloaded(env->prog->aux)) return 0; From 169c31761c8d7f606f3ee628829c27998626c4f0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:25 -0700 Subject: [PATCH 1038/1052] bpf: Add gen_epilogue to bpf_verifier_ops This patch adds a .gen_epilogue to the bpf_verifier_ops. It is similar to the existing .gen_prologue. Instead of allowing a subsystem to run code at the beginning of a bpf prog, it allows the subsystem to run code just before the bpf prog exit. One of the use case is to allow the upcoming bpf qdisc to ensure that the skb->dev is the same as the qdisc->dev_queue->dev. The bpf qdisc struct_ops implementation could either fix it up or drop the skb. Another use case could be in bpf_tcp_ca.c to enforce snd_cwnd has sane value (e.g. non zero). The epilogue can do the useful thing (like checking skb->dev) if it can access the bpf prog's ctx. Unlike prologue, r1 may not hold the ctx pointer. This patch saves the r1 in the stack if the .gen_epilogue has returned some instructions in the "epilogue_buf". The existing .gen_prologue is done in convert_ctx_accesses(). The new .gen_epilogue is done in the convert_ctx_accesses() also. When it sees the (BPF_JMP | BPF_EXIT) instruction, it will be patched with the earlier generated "epilogue_buf". The epilogue patching is only done for the main prog. Only one epilogue will be patched to the main program. When the bpf prog has multiple BPF_EXIT instructions, a BPF_JA is used to goto the earlier patched epilogue. Majority of the archs support (BPF_JMP32 | BPF_JA): x86, arm, s390, risv64, loongarch, powerpc and arc. This patch keeps it simple and always use (BPF_JMP32 | BPF_JA). 
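To illustrate the shape of the new callback, a minimal hypothetical .gen_epilogue could look like the sketch below (foo_gen_epilogue is a made-up name; a real in-tree example is added to bpf_testmod later in this series):

	static int foo_gen_epilogue(struct bpf_insn *insn_buf,
				    const struct bpf_prog *prog,
				    s16 ctx_stack_off)
	{
		struct bpf_insn *insn = insn_buf;

		/* reload the ctx pointer saved for the epilogue at ctx_stack_off */
		*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off);
		/* ... inspect or fix up the ctx (and r0) here ... */
		*insn++ = BPF_EXIT_INSN();

		return insn - insn_buf;
	}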
A new macro BPF_JMP32_A is added to generate the (BPF_JMP32 | BPF_JA) insn. Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_verifier.h | 1 + include/linux/filter.h | 10 ++++++++ kernel/bpf/verifier.c | 46 +++++++++++++++++++++++++++++++++++- 4 files changed, 58 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc63083f76b7..6f87fb014fba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -974,6 +974,8 @@ struct bpf_verifier_ops { struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); + int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog, + s16 ctx_stack_off); int (*gen_ld_abs)(const struct bpf_insn *orig, struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0ad2d189c546..2e20207315a9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -783,6 +783,7 @@ struct bpf_verifier_env { */ char tmp_str_buf[TMP_STR_BUF_LEN]; struct bpf_insn insn_buf[INSN_BUF_SIZE]; + struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/include/linux/filter.h b/include/linux/filter.h index b6672ff61407..99b6fc83825b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -437,6 +437,16 @@ static inline bool insn_is_cast_user(const struct bpf_insn *insn) .off = OFF, \ .imm = 0 }) +/* Unconditional jumps, gotol pc + imm32 */ + +#define BPF_JMP32_A(IMM) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + /* Relative call */ #define BPF_CALL_REL(TGT) \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 03e974129c05..fdb1d39c8b58 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19677,15 +19677,39 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { + struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0; + int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; + struct bpf_insn *epilogue_buf = env->epilogue_buf; struct bpf_insn *insn_buf = env->insn_buf; struct bpf_insn *insn; u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; + int epilogue_idx = 0; + + if (ops->gen_epilogue) { + epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, + -(subprogs[0].stack_depth + 8)); + if (epilogue_cnt >= INSN_BUF_SIZE) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (epilogue_cnt) { + /* Save the ARG_PTR_TO_CTX for the epilogue to use */ + cnt = 0; + subprogs[0].stack_depth += 8; + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, + -subprogs[0].stack_depth); + insn_buf[cnt++] = env->prog->insnsi[0]; + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + delta += cnt - 1; + } + } if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { @@ -19742,6 +19766,25 @@ static 
int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); env->prog->aux->num_exentries++; continue; + } else if (insn->code == (BPF_JMP | BPF_EXIT) && + epilogue_cnt && + i + delta < subprogs[1].start) { + /* Generate epilogue for the main prog */ + if (epilogue_idx) { + /* jump back to the earlier generated epilogue */ + insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1); + cnt = 1; + } else { + memcpy(insn_buf, epilogue_buf, + epilogue_cnt * sizeof(*epilogue_buf)); + cnt = epilogue_cnt; + /* epilogue_idx cannot be 0. It must have at + * least one ctx ptr saving insn before the + * epilogue. + */ + epilogue_idx = i + delta; + } + goto patch_insn_buf; } else { continue; } @@ -19878,6 +19921,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, insn->dst_reg, size * 8, 0); +patch_insn_buf: new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; From 866d571e6201cb8ccb18cb8407ab3ad3adb474b8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:26 -0700 Subject: [PATCH 1039/1052] bpf: Export bpf_base_func_proto The bpf_testmod needs to use the bpf_tail_call helper in a later selftest patch. This patch is to EXPORT_GPL_SYMBOL the bpf_base_func_proto. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7e7761c537f8..3956be5d6440 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2034,6 +2034,7 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } } +EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) From a0dbf6d0b21e197bf919591081ff2eb7a34ef982 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 29 Aug 2024 14:08:27 -0700 Subject: [PATCH 1040/1052] selftests/bpf: attach struct_ops maps before test prog runs In test_loader based tests to bpf_map__attach_struct_ops() before call to bpf_prog_test_run_opts() in order to trigger bpf_struct_ops->reg() callbacks on kernel side. This allows to use __retval macro for struct_ops tests. Signed-off-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 4223cffc090e..3e9b009580d4 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -890,11 +890,13 @@ void run_subtest(struct test_loader *tester, { struct test_subspec *subspec = unpriv ? &spec->unpriv : &spec->priv; struct bpf_program *tprog = NULL, *tprog_iter; + struct bpf_link *link, *links[32] = {}; struct test_spec *spec_iter; struct cap_state caps = {}; struct bpf_object *tobj; struct bpf_map *map; int retval, err, i; + int links_cnt = 0; bool should_load; if (!test__start_subtest(subspec->name)) @@ -999,6 +1001,26 @@ void run_subtest(struct test_loader *tester, if (restore_capabilities(&caps)) goto tobj_cleanup; + /* Do bpf_map__attach_struct_ops() for each struct_ops map. 
+ * This should trigger bpf_struct_ops->reg callback on kernel side. + */ + bpf_object__for_each_map(map, tobj) { + if (!bpf_map__autocreate(map) || + bpf_map__type(map) != BPF_MAP_TYPE_STRUCT_OPS) + continue; + if (links_cnt >= ARRAY_SIZE(links)) { + PRINT_FAIL("too many struct_ops maps"); + goto tobj_cleanup; + } + link = bpf_map__attach_struct_ops(map); + if (!link) { + PRINT_FAIL("bpf_map__attach_struct_ops failed for map %s: err=%d\n", + bpf_map__name(map), err); + goto tobj_cleanup; + } + links[links_cnt++] = link; + } + if (tester->pre_execution_cb) { err = tester->pre_execution_cb(tobj); if (err) { @@ -1013,9 +1035,14 @@ void run_subtest(struct test_loader *tester, PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup; } + /* redo bpf_map__attach_struct_ops for each test */ + while (links_cnt > 0) + bpf_link__destroy(links[--links_cnt]); } tobj_cleanup: + while (links_cnt > 0) + bpf_link__destroy(links[--links_cnt]); bpf_object__close(tobj); subtest_cleanup: test__end_subtest(); From 47e69431b57aee6c3c134014e7fca490ca8ca9d1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:28 -0700 Subject: [PATCH 1041/1052] selftests/bpf: Test gen_prologue and gen_epilogue This test adds a new struct_ops "bpf_testmod_st_ops" in bpf_testmod. The ops of the bpf_testmod_st_ops is triggered by new kfunc calls "bpf_kfunc_st_ops_test_*logue". These new kfunc calls are primarily used by the SEC("syscall") program. The test triggering sequence is like: SEC("syscall") syscall_prologue(struct st_ops_args *args) bpf_kfunc_st_op_test_prologue(args) st_ops->test_prologue(args) .gen_prologue adds 1000 to args->a .gen_epilogue adds 10000 to args->a .gen_epilogue will also set the r0 to 2 * args->a. The .gen_prologue and .gen_epilogue of the bpf_testmod_st_ops will test the prog->aux->attach_func_name to decide if it needs to generate codes. The main programs of the pro_epilogue.c will call a new kfunc bpf_kfunc_st_ops_inc10 which does "args->a += 10". It will also call a subprog() which does "args->a += 1". This patch uses the test_loader infra to check the __xlated instructions patched after gen_prologue and/or gen_epilogue. The __xlated check is based on Eduard's example (Thanks!) in v1. args->a is returned by the struct_ops prog (either the main prog or the epilogue). Thus, the __retval of the SEC("syscall") prog is checked. For example, when triggering the ops in the 'SEC("struct_ops/test_epilogue") int test_epilogue' The expected args->a is +1 (subprog call) + 10 (kfunc call) + 10000 (.gen_epilogue) = 10011. The expected return value is 2 * 10011 (.gen_epilogue). 
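For reference, the expected numbers for all three ops, matching the __retval() annotations in progs/pro_epilogue.c below:

	test_prologue:     1000 (prologue) + 10 (kfunc) + 1 (subprog)                    => a = 1011,  retval = 1011
	test_epilogue:     10 (kfunc) + 1 (subprog) + 10000 (epilogue)                   => a = 10011, retval = 2 * 10011 = 20022
	test_pro_epilogue: 1000 (prologue) + 10 (kfunc) + 1 (subprog) + 10000 (epilogue) => a = 11011, retval = 2 * 11011 = 22022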
Suggested-by: Eduard Zingerman Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 190 ++++++++++++++++++ .../selftests/bpf/bpf_testmod/bpf_testmod.h | 11 + .../bpf/bpf_testmod/bpf_testmod_kfunc.h | 6 + .../selftests/bpf/prog_tests/pro_epilogue.c | 10 + .../selftests/bpf/progs/pro_epilogue.c | 154 ++++++++++++++ 5 files changed, 371 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/pro_epilogue.c create mode 100644 tools/testing/selftests/bpf/progs/pro_epilogue.c diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 8a71a91b752d..42c38dbb6835 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include "bpf_testmod.h" @@ -945,6 +946,51 @@ __bpf_kfunc int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) return err; } +static DEFINE_MUTEX(st_ops_mutex); +static struct bpf_testmod_st_ops *st_ops; + +__bpf_kfunc int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_prologue) + ret = st_ops->test_prologue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_epilogue) + ret = st_ops->test_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_pro_epilogue) + ret = st_ops->test_pro_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) +{ + args->a += 10; + return args->a; +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -981,6 +1027,10 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1100,6 +1150,144 @@ struct bpf_struct_ops bpf_testmod_ops2 = { .owner = THIS_MODULE, }; +static int bpf_test_mod_st_ops__test_prologue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_pro_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_prologue") && + 
strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r6 = r1[0]; // r6 will be "struct st_ops *args". r1 is "u64 *ctx". + * r7 = r6->a; + * r7 += 1000; + * r6->a = r7; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, offsetof(struct st_ops_args, a)); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +static int st_ops_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_epilogue") && + strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct st_ops *args" + * r6 = r1->a; + * r6 += 10000; + * r1->a = r6; + * r0 = r6; + * r0 *= 2; + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 10000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_6); + *insn++ = BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, 2); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +static int st_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + if (off < 0 || off + size > sizeof(struct st_ops_args)) + return -EACCES; + return 0; +} + +static const struct bpf_verifier_ops st_ops_verifier_ops = { + .is_valid_access = bpf_testmod_ops_is_valid_access, + .btf_struct_access = st_ops_btf_struct_access, + .gen_prologue = st_ops_gen_prologue, + .gen_epilogue = st_ops_gen_epilogue, + .get_func_proto = bpf_base_func_proto, +}; + +static struct bpf_testmod_st_ops st_ops_cfi_stubs = { + .test_prologue = bpf_test_mod_st_ops__test_prologue, + .test_epilogue = bpf_test_mod_st_ops__test_epilogue, + .test_pro_epilogue = bpf_test_mod_st_ops__test_pro_epilogue, +}; + +static int st_ops_reg(void *kdata, struct bpf_link *link) +{ + int err = 0; + + mutex_lock(&st_ops_mutex); + if (st_ops) { + pr_err("st_ops has already been registered\n"); + err = -EEXIST; + goto unlock; + } + st_ops = kdata; + +unlock: + mutex_unlock(&st_ops_mutex); + return err; +} + +static void st_ops_unreg(void *kdata, struct bpf_link *link) +{ + mutex_lock(&st_ops_mutex); + st_ops = NULL; + mutex_unlock(&st_ops_mutex); +} + +static int st_ops_init(struct btf *btf) +{ + return 0; +} + +static int st_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static struct bpf_struct_ops testmod_st_ops = { + .verifier_ops = &st_ops_verifier_ops, + .init = st_ops_init, + .init_member = st_ops_init_member, + .reg = st_ops_reg, + .unreg = st_ops_unreg, + .cfi_stubs = &st_ops_cfi_stubs, + .name = "bpf_testmod_st_ops", + .owner = THIS_MODULE, +}; + extern int bpf_fentry_test1(int a); static int bpf_testmod_init(void) @@ -1117,8 +1305,10 @@ static int bpf_testmod_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_testmod_kfunc_set); ret = ret ?: 
register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_testmod_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_testmod_kfunc_set); ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops); ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2); + ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops); ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors, ARRAY_SIZE(bpf_testmod_dtors), THIS_MODULE); diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index fe0d402b0d65..fb7dff47597a 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -94,4 +94,15 @@ struct bpf_testmod_ops2 { int (*test_1)(void); }; +struct st_ops_args { + u64 a; +}; + +struct bpf_testmod_st_ops { + int (*test_prologue)(struct st_ops_args *args); + int (*test_epilogue)(struct st_ops_args *args); + int (*test_pro_epilogue)(struct st_ops_args *args); + struct module *owner; +}; + #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index c6c314965bb1..7e76532be5fa 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -148,4 +148,10 @@ struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) __ksym; void bpf_kfunc_nested_release_test(struct sk_buff *ptr) __ksym; +struct st_ops_args; +int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c new file mode 100644 index 000000000000..00b806804b99 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include "pro_epilogue.skel.h" + +void test_pro_epilogue(void) +{ + RUN_TESTS(pro_epilogue); +} diff --git a/tools/testing/selftests/bpf/progs/pro_epilogue.c b/tools/testing/selftests/bpf/progs/pro_epilogue.c new file mode 100644 index 000000000000..44bc3f06b4b6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pro_epilogue.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +void __kfunc_btf_root(void) +{ + bpf_kfunc_st_ops_inc10(NULL); +} + +static __noinline __used int subprog(struct st_ops_args *args) +{ + args->a += 1; + return args->a; +} + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* main prog */ +__xlated("4: r1 = *(u64 *)(r1 +0)") +__xlated("5: r6 = r1") +__xlated("6: call kernel-function") +__xlated("7: r1 = r6") +__xlated("8: call pc+1") +__xlated("9: exit") +SEC("struct_ops/test_prologue") +__naked int test_prologue(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r6 = r1;" + "call %[bpf_kfunc_st_ops_inc10];" + "r1 = r6;" + "call subprog;" + "exit;" + : + : __imm(bpf_kfunc_st_ops_inc10) + : __clobber_all); +} + +__success +/* save __u64 *ctx to stack */ +__xlated("0: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("1: r1 = *(u64 *)(r1 +0)") +__xlated("2: r6 = r1") +__xlated("3: call kernel-function") +__xlated("4: r1 = r6") +__xlated("5: call pc+") +/* epilogue */ +__xlated("6: r1 = *(u64 *)(r10 -8)") +__xlated("7: r1 = *(u64 *)(r1 +0)") +__xlated("8: r6 = *(u64 *)(r1 +0)") +__xlated("9: r6 += 10000") +__xlated("10: *(u64 *)(r1 +0) = r6") +__xlated("11: r0 = r6") +__xlated("12: r0 *= 2") +__xlated("13: exit") +SEC("struct_ops/test_epilogue") +__naked int test_epilogue(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r6 = r1;" + "call %[bpf_kfunc_st_ops_inc10];" + "r1 = r6;" + "call subprog;" + "exit;" + : + : __imm(bpf_kfunc_st_ops_inc10) + : __clobber_all); +} + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* save __u64 *ctx to stack */ +__xlated("4: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("5: r1 = *(u64 *)(r1 +0)") +__xlated("6: r6 = r1") +__xlated("7: call kernel-function") +__xlated("8: r1 = r6") +__xlated("9: call pc+") +/* epilogue */ +__xlated("10: r1 = *(u64 *)(r10 -8)") +__xlated("11: r1 = *(u64 *)(r1 +0)") +__xlated("12: r6 = *(u64 *)(r1 +0)") +__xlated("13: r6 += 10000") +__xlated("14: *(u64 *)(r1 +0) = r6") +__xlated("15: r0 = r6") +__xlated("16: r0 *= 2") +__xlated("17: exit") +SEC("struct_ops/test_pro_epilogue") +__naked int test_pro_epilogue(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r6 = r1;" + "call %[bpf_kfunc_st_ops_inc10];" + "r1 = r6;" + "call subprog;" + "exit;" + : + : __imm(bpf_kfunc_st_ops_inc10) + : __clobber_all); +} + +SEC("syscall") +__retval(1011) /* PROLOGUE_A [1000] + KFUNC_INC10 + SUBPROG_A [1] */ +int syscall_prologue(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_prologue(&args); +} + +SEC("syscall") +__retval(20022) /* (KFUNC_INC10 + SUBPROG_A [1] + EPILOGUE_A [10000]) * 2 */ +int syscall_epilogue(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} + +SEC("syscall") +__retval(22022) /* (PROLOGUE_A [1000] + KFUNC_INC10 + SUBPROG_A [1] + EPILOGUE_A [10000]) * 2 */ +int syscall_pro_epilogue(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_pro_epilogue(&args); +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops pro_epilogue = { + .test_prologue = (void *)test_prologue, + .test_epilogue = (void *)test_epilogue, + .test_pro_epilogue = (void 
*)test_pro_epilogue, +}; From b191b0fd740062ede672693671c6e6e942fb02f4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:29 -0700 Subject: [PATCH 1042/1052] selftests/bpf: Add tailcall epilogue test This patch adds a gen_epilogue test to test a main prog using a bpf_tail_call. A non test_loader test is used. The tailcall target program, "test_epilogue_subprog", needs to be used in a struct_ops map before it can be loaded. Another struct_ops map is also needed to host the actual "test_epilogue_tailcall" struct_ops program that does the bpf_tail_call. The earlier test_loader patch will attach all struct_ops maps but the bpf_testmod.c does not support >1 attached struct_ops. The earlier patch used the test_loader which has already covered checking for the patched pro/epilogue instructions. This is done by the __xlated tag. This patch goes for the regular skel load and syscall test to do the tailcall test that can also allow to directly pass the the "struct st_ops_args *args" as ctx_in to the SEC("syscall") program. Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pro_epilogue.c | 46 +++++++++++++++ .../selftests/bpf/progs/epilogue_tailcall.c | 58 +++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/epilogue_tailcall.c diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c index 00b806804b99..b82525c29de8 100644 --- a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -3,8 +3,54 @@ #include #include "pro_epilogue.skel.h" +#include "epilogue_tailcall.skel.h" + +struct st_ops_args { + __u64 a; +}; + +static void test_tailcall(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct epilogue_tailcall *skel; + struct st_ops_args args; + int err, prog_fd; + + skel = epilogue_tailcall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "epilogue_tailcall__open_and_load")) + return; + + topts.ctx_in = &args; + topts.ctx_size_in = sizeof(args); + + skel->links.epilogue_tailcall = + bpf_map__attach_struct_ops(skel->maps.epilogue_tailcall); + if (!ASSERT_OK_PTR(skel->links.epilogue_tailcall, "attach_struct_ops")) + goto done; + + /* Both test_epilogue_tailcall and test_epilogue_subprog are + * patched with epilogue. When syscall_epilogue_tailcall() + * is run, test_epilogue_tailcall() is triggered. + * It executes a tail call and control is transferred to + * test_epilogue_subprog(). Only test_epilogue_subprog() + * does args->a += 1, thus final args.a value of 10001 + * guarantees that only the epilogue of the + * test_epilogue_subprog is executed. 
+ */ + memset(&args, 0, sizeof(args)); + prog_fd = bpf_program__fd(skel->progs.syscall_epilogue_tailcall); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_EQ(args.a, 10001, "args.a"); + ASSERT_EQ(topts.retval, 10001 * 2, "topts.retval"); + +done: + epilogue_tailcall__destroy(skel); +} void test_pro_epilogue(void) { RUN_TESTS(pro_epilogue); + if (test__start_subtest("tailcall")) + test_tailcall(); } diff --git a/tools/testing/selftests/bpf/progs/epilogue_tailcall.c b/tools/testing/selftests/bpf/progs/epilogue_tailcall.c new file mode 100644 index 000000000000..7275dd594de0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/epilogue_tailcall.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +static __noinline __used int subprog(struct st_ops_args *args) +{ + args->a += 1; + return args->a; +} + +SEC("struct_ops/test_epilogue_subprog") +int BPF_PROG(test_epilogue_subprog, struct st_ops_args *args) +{ + subprog(args); + return args->a; +} + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); + __array(values, void (void)); +} epilogue_map SEC(".maps") = { + .values = { + [0] = (void *)&test_epilogue_subprog, + } +}; + +SEC("struct_ops/test_epilogue_tailcall") +int test_epilogue_tailcall(unsigned long long *ctx) +{ + bpf_tail_call(ctx, &epilogue_map, 0); + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_tailcall = { + .test_epilogue = (void *)test_epilogue_tailcall, +}; + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_subprog = { + .test_epilogue = (void *)test_epilogue_subprog, +}; + +SEC("syscall") +int syscall_epilogue_tailcall(struct st_ops_args *args) +{ + return bpf_kfunc_st_ops_test_epilogue(args); +} From 42fdbbde6cf4159f77de13a40f8c0be6ef48bcc1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:30 -0700 Subject: [PATCH 1043/1052] selftests/bpf: A pro/epilogue test when the main prog jumps back to the 1st insn This patch adds a pro/epilogue test when the main prog has a goto insn that goes back to the very first instruction of the prog. It is to test the correctness of the adjust_jmp_off(prog, 0, delta) after the verifier has applied the prologue and/or epilogue patch. 
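Concretely, in the test_pro_epilogue_goto_start case the __xlated expectations below place the prologue at xlated insns 0-3 and the ctx save at insn 4, so the original first instruction ends up at insn 5. The main prog's backward jump is still shown as "10: goto pc-6", which lands on "5: if r1 == 0x0 goto pc+5", i.e. on the original first instruction rather than on the inserted prologue/ctx-save instructions. That landing spot is what adjust_jmp_off(prog, 0, delta) is meant to preserve.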
Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-9-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pro_epilogue.c | 2 + .../bpf/progs/pro_epilogue_goto_start.c | 149 ++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c index b82525c29de8..f974ae9ac610 100644 --- a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -4,6 +4,7 @@ #include #include "pro_epilogue.skel.h" #include "epilogue_tailcall.skel.h" +#include "pro_epilogue_goto_start.skel.h" struct st_ops_args { __u64 a; @@ -51,6 +52,7 @@ static void test_tailcall(void) void test_pro_epilogue(void) { RUN_TESTS(pro_epilogue); + RUN_TESTS(pro_epilogue_goto_start); if (test__start_subtest("tailcall")) test_tailcall(); } diff --git a/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c b/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c new file mode 100644 index 000000000000..3529e53be355 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* main prog */ +__xlated("4: if r1 == 0x0 goto pc+5") +__xlated("5: if r1 == 0x1 goto pc+2") +__xlated("6: r1 = 1") +__xlated("7: goto pc-3") +__xlated("8: r1 = 0") +__xlated("9: goto pc-6") +__xlated("10: r0 = 0") +__xlated("11: exit") +SEC("struct_ops/test_prologue_goto_start") +__naked int test_prologue_goto_start(void) +{ + asm volatile ( + "if r1 == 0 goto +5;" + "if r1 == 1 goto +2;" + "r1 = 1;" + "goto -3;" + "r1 = 0;" + "goto -6;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +__success +/* save __u64 *ctx to stack */ +__xlated("0: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("1: if r1 == 0x0 goto pc+5") +__xlated("2: if r1 == 0x1 goto pc+2") +__xlated("3: r1 = 1") +__xlated("4: goto pc-3") +__xlated("5: r1 = 0") +__xlated("6: goto pc-6") +__xlated("7: r0 = 0") +/* epilogue */ +__xlated("8: r1 = *(u64 *)(r10 -8)") +__xlated("9: r1 = *(u64 *)(r1 +0)") +__xlated("10: r6 = *(u64 *)(r1 +0)") +__xlated("11: r6 += 10000") +__xlated("12: *(u64 *)(r1 +0) = r6") +__xlated("13: r0 = r6") +__xlated("14: r0 *= 2") +__xlated("15: exit") +SEC("struct_ops/test_epilogue_goto_start") +__naked int test_epilogue_goto_start(void) +{ + asm volatile ( + "if r1 == 0 goto +5;" + "if r1 == 1 goto +2;" + "r1 = 1;" + "goto -3;" + "r1 = 0;" + "goto -6;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* save __u64 *ctx to stack */ +__xlated("4: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("5: if r1 == 0x0 goto pc+5") +__xlated("6: if r1 == 0x1 goto pc+2") +__xlated("7: r1 = 1") +__xlated("8: goto pc-3") +__xlated("9: r1 = 0") +__xlated("10: goto pc-6") +__xlated("11: r0 = 0") +/* 
epilogue */ +__xlated("12: r1 = *(u64 *)(r10 -8)") +__xlated("13: r1 = *(u64 *)(r1 +0)") +__xlated("14: r6 = *(u64 *)(r1 +0)") +__xlated("15: r6 += 10000") +__xlated("16: *(u64 *)(r1 +0) = r6") +__xlated("17: r0 = r6") +__xlated("18: r0 *= 2") +__xlated("19: exit") +SEC("struct_ops/test_pro_epilogue_goto_start") +__naked int test_pro_epilogue_goto_start(void) +{ + asm volatile ( + "if r1 == 0 goto +5;" + "if r1 == 1 goto +2;" + "r1 = 1;" + "goto -3;" + "r1 = 0;" + "goto -6;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_goto_start = { + .test_prologue = (void *)test_prologue_goto_start, + .test_epilogue = (void *)test_epilogue_goto_start, + .test_pro_epilogue = (void *)test_pro_epilogue_goto_start, +}; + +SEC("syscall") +__retval(0) +int syscall_prologue_goto_start(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_prologue(&args); +} + +SEC("syscall") +__retval(20000) /* (EPILOGUE_A [10000]) * 2 */ +int syscall_epilogue_goto_start(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} + +SEC("syscall") +__retval(22000) /* (PROLOGUE_A [1000] + EPILOGUE_A [10000]) * 2 */ +int syscall_pro_epilogue_goto_start(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_pro_epilogue(&args); +} From cada0bdcc471443dbd41bd63286f48be3dae0d89 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:31 -0700 Subject: [PATCH 1044/1052] selftests/bpf: Test epilogue patching when the main prog has multiple BPF_EXIT This patch tests the epilogue patching when the main prog has multiple BPF_EXIT. The verifier should have patched the 2nd (and later) BPF_EXIT with a BPF_JA that goes back to the earlier patched epilogue instructions. Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pro_epilogue.c | 2 + .../selftests/bpf/progs/epilogue_exit.c | 82 +++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/epilogue_exit.c diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c index f974ae9ac610..509883e6823a 100644 --- a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -5,6 +5,7 @@ #include "pro_epilogue.skel.h" #include "epilogue_tailcall.skel.h" #include "pro_epilogue_goto_start.skel.h" +#include "epilogue_exit.skel.h" struct st_ops_args { __u64 a; @@ -53,6 +54,7 @@ void test_pro_epilogue(void) { RUN_TESTS(pro_epilogue); RUN_TESTS(pro_epilogue_goto_start); + RUN_TESTS(epilogue_exit); if (test__start_subtest("tailcall")) test_tailcall(); } diff --git a/tools/testing/selftests/bpf/progs/epilogue_exit.c b/tools/testing/selftests/bpf/progs/epilogue_exit.c new file mode 100644 index 000000000000..33d3a57bee90 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/epilogue_exit.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +__success +/* save __u64 *ctx to stack */ +__xlated("0: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("1: r1 = *(u64 *)(r1 +0)") +__xlated("2: r2 = *(u64 *)(r1 +0)") +__xlated("3: r3 = 0") +__xlated("4: r4 = 1") +__xlated("5: if r2 == 0x0 goto pc+10") +__xlated("6: r0 = 0") +__xlated("7: *(u64 *)(r1 +0) = r3") +/* epilogue */ +__xlated("8: r1 = *(u64 *)(r10 -8)") +__xlated("9: r1 = *(u64 *)(r1 +0)") +__xlated("10: r6 = *(u64 *)(r1 +0)") +__xlated("11: r6 += 10000") +__xlated("12: *(u64 *)(r1 +0) = r6") +__xlated("13: r0 = r6") +__xlated("14: r0 *= 2") +__xlated("15: exit") +/* 2nd part of the main prog after the first exit */ +__xlated("16: *(u64 *)(r1 +0) = r4") +__xlated("17: r0 = 1") +/* Clear the r1 to ensure it does not have + * off-by-1 error and ensure it jumps back to the + * beginning of epilogue which initializes + * the r1 with the ctx ptr. + */ +__xlated("18: r1 = 0") +__xlated("19: gotol pc-12") +SEC("struct_ops/test_epilogue_exit") +__naked int test_epilogue_exit(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r2 = *(u64 *)(r1 +0);" + "r3 = 0;" + "r4 = 1;" + "if r2 == 0 goto +3;" + "r0 = 0;" + "*(u64 *)(r1 + 0) = r3;" + "exit;" + "*(u64 *)(r1 + 0) = r4;" + "r0 = 1;" + "r1 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_exit = { + .test_epilogue = (void *)test_epilogue_exit, +}; + +SEC("syscall") +__retval(20000) +int syscall_epilogue_exit0(void *ctx) +{ + struct st_ops_args args = { .a = 1 }; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} + +SEC("syscall") +__retval(20002) +int syscall_epilogue_exit1(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} From 4cc8c50c9abcb2646a7a4fcef3cea5dcb30c06cf Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Thu, 29 Aug 2024 21:11:17 +0100 Subject: [PATCH 1045/1052] bpf: Make the pointer returned by iter next method valid Currently we cannot pass the pointer returned by iter next method as argument to KF_TRUSTED_ARGS or KF_RCU kfuncs, because the pointer returned by iter next method is not "valid". This patch sets the pointer returned by iter next method to be valid. This is based on the fact that if the iterator is implemented correctly, then the pointer returned from the iter next method should be valid. This does not make NULL pointer valid. If the iter next method has KF_RET_NULL flag, then the verifier will ask the ebpf program to check NULL pointer. KF_RCU_PROTECTED iterator is a special case, the pointer returned by iter next method should only be valid within RCU critical section, so it should be with MEM_RCU, not PTR_TRUSTED. Another special case is bpf_iter_num_next, which returns a pointer with base type PTR_TO_MEM. PTR_TO_MEM should not be combined with type flag PTR_TRUSTED (PTR_TO_MEM already means the pointer is valid). The pointer returned by iter next method of other types of iterators is with PTR_TRUSTED. In addition, this patch adds get_iter_from_state to help us get the current iterator from the current state. 
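For illustration, the pattern this makes acceptable looks like the minimal sketch below ("task" is assumed to be a trusted task pointer, e.g. from bpf_get_current_task_btf(), and some_trusted_kfunc() is only a placeholder for any KF_TRUSTED_ARGS kfunc; real selftests follow in the next patch):

	struct bpf_iter_task_vma it;
	struct vm_area_struct *vma;

	bpf_iter_task_vma_new(&it, task, 0);
	vma = bpf_iter_task_vma_next(&it);
	if (vma)	/* next method is KF_RET_NULL, so the NULL check is still required */
		some_trusted_kfunc(vma);	/* placeholder KF_TRUSTED_ARGS kfunc */
	bpf_iter_task_vma_destroy(&it);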
Signed-off-by: Juntong Deng Link: https://lore.kernel.org/r/AM6PR03MB584869F8B448EA1C87B7CDA399962@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fdb1d39c8b58..217eb0eafa2a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8148,6 +8148,15 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, return 0; } +static struct bpf_reg_state *get_iter_from_state(struct bpf_verifier_state *cur_st, + struct bpf_kfunc_call_arg_meta *meta) +{ + int iter_frameno = meta->iter.frameno; + int iter_spi = meta->iter.spi; + + return &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; +} + /* process_iter_next_call() is called when verifier gets to iterator's next * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer * to it as just "iter_next()" in comments below. @@ -8232,12 +8241,10 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; struct bpf_reg_state *cur_iter, *queued_iter; - int iter_frameno = meta->iter.frameno; - int iter_spi = meta->iter.spi; BTF_TYPE_EMIT(struct bpf_iter); - cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + cur_iter = get_iter_from_state(cur_st, meta); if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { @@ -8265,7 +8272,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, if (!queued_st) return -ENOMEM; - queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + queued_iter = get_iter_from_state(queued_st, meta); queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; queued_iter->iter.depth++; if (prev_st) @@ -12853,6 +12860,17 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; + + if (is_iter_next_kfunc(&meta)) { + struct bpf_reg_state *cur_iter; + + cur_iter = get_iter_from_state(env->cur_state, &meta); + + if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */ + regs[BPF_REG_0].type |= MEM_RCU; + else + regs[BPF_REG_0].type |= PTR_TRUSTED; + } } if (is_kfunc_ret_null(&meta)) { From 7c5f7b16fe1b9d1eb0cbb46d20f57db4a912b6e0 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Thu, 29 Aug 2024 21:13:15 +0100 Subject: [PATCH 1046/1052] selftests/bpf: Add tests for iter next method returning valid pointer This patch adds test cases for iter next method returning valid pointer, which can also used as usage examples. Currently iter next method should return valid pointer. iter_next_trusted is the correct usage and test if iter next method return valid pointer. bpf_iter_task_vma_next has KF_RET_NULL flag, so the returned pointer may be NULL. We need to check if the pointer is NULL before using it. iter_next_trusted_or_null is the incorrect usage. There is no checking before using the pointer, so it will be rejected by the verifier. iter_next_rcu and iter_next_rcu_or_null are similar test cases for KF_RCU_PROTECTED iterators. iter_next_rcu_not_trusted is used to test that the pointer returned by iter next method of KF_RCU_PROTECTED iterator cannot be passed in KF_TRUSTED_ARGS kfuncs. 
iter_next_ptr_mem_not_trusted is used to test that base type PTR_TO_MEM should not be combined with type flag PTR_TRUSTED. Signed-off-by: Juntong Deng Link: https://lore.kernel.org/r/AM6PR03MB5848709758F6922F02AF9F1F99962@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 20 +++ .../bpf/bpf_testmod/bpf_testmod_kfunc.h | 5 + .../testing/selftests/bpf/prog_tests/iters.c | 5 +- .../selftests/bpf/progs/iters_testmod.c | 125 ++++++++++++++++++ 4 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/iters_testmod.c diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 42c38dbb6835..c73d04bc9e9d 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -198,6 +198,22 @@ __bpf_kfunc void bpf_kfunc_nested_release_test(struct sk_buff *ptr) { } +__bpf_kfunc void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_task_test(struct task_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_num_test(int *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_rcu_task_test(struct task_struct *ptr) +{ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -559,6 +575,10 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index 7e76532be5fa..b58817938deb 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -154,4 +154,9 @@ int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) __ksym; int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) __ksym; int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __ksym; +void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) __ksym; +void bpf_kfunc_trusted_task_test(struct task_struct *ptr) __ksym; +void bpf_kfunc_trusted_num_test(int *ptr) __ksym; +void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index 3c440370c1f0..89ff23c4a8bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -14,6 +14,7 @@ #include "iters_state_safety.skel.h" #include "iters_looping.skel.h" #include "iters_num.skel.h" +#include "iters_testmod.skel.h" #include "iters_testmod_seq.skel.h" #include "iters_task_vma.skel.h" #include "iters_task.skel.h" @@ -297,8 +298,10 @@ void test_iters(void) RUN_TESTS(iters); RUN_TESTS(iters_css_task); - if (env.has_testmod) + if (env.has_testmod) { + 
RUN_TESTS(iters_testmod); RUN_TESTS(iters_testmod_seq); + } if (test__start_subtest("num")) subtest_num_iters(); diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c new file mode 100644 index 000000000000..df1d3db60b1b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +SEC("raw_tp/sys_enter") +__success +int iter_next_trusted(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task_vma vma_it; + struct vm_area_struct *vma_ptr; + + bpf_iter_task_vma_new(&vma_it, cur_task, 0); + + vma_ptr = bpf_iter_task_vma_next(&vma_it); + if (vma_ptr == NULL) + goto out; + + bpf_kfunc_trusted_vma_test(vma_ptr); +out: + bpf_iter_task_vma_destroy(&vma_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int iter_next_trusted_or_null(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task_vma vma_it; + struct vm_area_struct *vma_ptr; + + bpf_iter_task_vma_new(&vma_it, cur_task, 0); + + vma_ptr = bpf_iter_task_vma_next(&vma_it); + + bpf_kfunc_trusted_vma_test(vma_ptr); + + bpf_iter_task_vma_destroy(&vma_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__success +int iter_next_rcu(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task task_it; + struct task_struct *task_ptr; + + bpf_iter_task_new(&task_it, cur_task, 0); + + task_ptr = bpf_iter_task_next(&task_it); + if (task_ptr == NULL) + goto out; + + bpf_kfunc_rcu_task_test(task_ptr); +out: + bpf_iter_task_destroy(&task_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int iter_next_rcu_or_null(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task task_it; + struct task_struct *task_ptr; + + bpf_iter_task_new(&task_it, cur_task, 0); + + task_ptr = bpf_iter_task_next(&task_it); + + bpf_kfunc_rcu_task_test(task_ptr); + + bpf_iter_task_destroy(&task_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("R1 must be referenced or trusted") +int iter_next_rcu_not_trusted(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task task_it; + struct task_struct *task_ptr; + + bpf_iter_task_new(&task_it, cur_task, 0); + + task_ptr = bpf_iter_task_next(&task_it); + if (task_ptr == NULL) + goto out; + + bpf_kfunc_trusted_task_test(task_ptr); +out: + bpf_iter_task_destroy(&task_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("R1 cannot write into rdonly_mem") +/* Message should not be 'R1 cannot write into rdonly_trusted_mem' */ +int iter_next_ptr_mem_not_trusted(const void *ctx) +{ + struct bpf_iter_num num_it; + int *num_ptr; + + bpf_iter_num_new(&num_it, 0, 10); + + num_ptr = bpf_iter_num_next(&num_it); + if (num_ptr == NULL) + goto out; + + bpf_kfunc_trusted_num_test(num_ptr); +out: + bpf_iter_num_destroy(&num_it); + return 0; +} From 1dd7622ef5085e0fd332d1293530b350499c374d Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Fri, 30 Aug 2024 09:43:50 +0200 Subject: [PATCH 1047/1052] bpf: Remove custom build rule According to the documentation, when building a kernel with 
the C=2 parameter, all source files should be checked. But this does not happen for the kernel/bpf/ directory. $ touch kernel/bpf/core.o $ make C=2 CHECK=true kernel/bpf/core.o Outputs: CHECK scripts/mod/empty.c CALL scripts/checksyscalls.sh DESCEND objtool INSTALL libsubcmd_headers CC kernel/bpf/core.o As can be seen the compilation is done, but CHECK is not executed. This happens because kernel/bpf/Makefile has defined its own rule for compilation and forgotten the macro that does the check. There is no need to duplicate the build code, and this rule can be removed to use generic rules. Acked-by: Masahiro Yamada Tested-by: Oleg Nesterov Tested-by: Alan Maguire Signed-off-by: Alexey Gladkov Link: https://lore.kernel.org/r/20240830074350.211308-1-legion@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 6 ------ kernel/bpf/btf_iter.c | 2 ++ kernel/bpf/btf_relocate.c | 2 ++ kernel/bpf/relo_core.c | 2 ++ 4 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 kernel/bpf/btf_iter.c create mode 100644 kernel/bpf/btf_relocate.c create mode 100644 kernel/bpf/relo_core.c diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0291eef9ce92..9b9c151b5c82 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -52,9 +52,3 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o - -# Some source files are common to libbpf. -vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf - -$(obj)/%.o: %.c FORCE - $(call if_changed_rule,cc_o_c) diff --git a/kernel/bpf/btf_iter.c b/kernel/bpf/btf_iter.c new file mode 100644 index 000000000000..0e2c66a52df9 --- /dev/null +++ b/kernel/bpf/btf_iter.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include "../../tools/lib/bpf/btf_iter.c" diff --git a/kernel/bpf/btf_relocate.c b/kernel/bpf/btf_relocate.c new file mode 100644 index 000000000000..c12ccbf66507 --- /dev/null +++ b/kernel/bpf/btf_relocate.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include "../../tools/lib/bpf/btf_relocate.c" diff --git a/kernel/bpf/relo_core.c b/kernel/bpf/relo_core.c new file mode 100644 index 000000000000..aa822c9fcfde --- /dev/null +++ b/kernel/bpf/relo_core.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include "../../tools/lib/bpf/relo_core.c" From 65ef66d918039c9e012437f2ec57f2bef76faf15 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 30 Aug 2024 10:07:56 +0800 Subject: [PATCH 1048/1052] bpf: Use sockfd_put() helper Replace fput() with sockfd_put() in bpf_fd_reuseport_array_update_elem(). 
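For context, sockfd_put() is the conventional counterpart of sockfd_lookup(): it is a thin inline in include/linux/net.h that calls fput(sock->file), so the behaviour is unchanged and only the reference pairing becomes explicit. Condensed, the update path now reads roughly:

  struct socket *socket;
  int err;

  socket = sockfd_lookup(fd, &err);   /* takes a reference on socket->file */
  if (!socket)
          return err;

  /* ... install the socket into the reuseport array ... */

  sockfd_put(socket);                 /* was: fput(socket->file) */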
Signed-off-by: Jinjie Ruan Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20240830020756.607877-1-ruanjinjie@huawei.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/reuseport_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 4b4f9670f1a9..49b8e5a0c6b4 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -308,7 +308,7 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, spin_unlock_bh(&reuseport_lock); put_file: - fput(socket->file); + sockfd_put(socket); return err; } From da18bfa59d403040d8bcba1284285916fe9e4425 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Fri, 30 Aug 2024 02:51:50 -0700 Subject: [PATCH 1049/1052] libbpf: Ensure new BTF objects inherit input endianness New split BTF needs to preserve base's endianness. Similarly, when creating a distilled BTF, we need to preserve original endianness. Fix by updating libbpf's btf__distill_base() and btf_new_empty() to retain the byte order of any source BTF objects when creating new ones. Fixes: ba451366bf44 ("libbpf: Implement basic split BTF support") Fixes: 58e185a0dc35 ("libbpf: Add btf__distill_base() creating split BTF with distilled base BTF") Reported-by: Song Liu Reported-by: Eduard Zingerman Suggested-by: Eduard Zingerman Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/6358db36c5f68b07873a0a5be2d062b1af5ea5f8.camel@gmail.com/ Link: https://lore.kernel.org/bpf/20240830095150.278881-1-tony.ambardar@gmail.com --- tools/lib/bpf/btf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 32c00db3b91b..40aae244e35f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -996,6 +996,7 @@ static struct btf *btf_new_empty(struct btf *base_btf) btf->base_btf = base_btf; btf->start_id = btf__type_cnt(base_btf); btf->start_str_off = base_btf->hdr->str_len; + btf->swapped_endian = base_btf->swapped_endian; } /* +1 for empty string at offset 0 */ @@ -5394,6 +5395,9 @@ int btf__distill_base(const struct btf *src_btf, struct btf **new_base_btf, new_base = btf__new_empty(); if (!new_base) return libbpf_err(-ENOMEM); + + btf__set_endianness(new_base, btf__endianness(src_btf)); + dist.id_map = calloc(n, sizeof(*dist.id_map)); if (!dist.id_map) { err = -ENOMEM; From 181b0d1af5d5b2ba8996fa715382cbfbfef46b6e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 30 Aug 2024 10:34:06 -0700 Subject: [PATCH 1050/1052] selftests/bpf: Check if distilled base inherits source endianness Create a BTF with endianness different from host, make a distilled base/split BTF pair from it, dump as raw bytes, import again and verify that endianness is preserved. 
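In user-space terms, the guarantee being checked is roughly the following; the file names and wrapper function are hypothetical, only the libbpf calls (btf__parse(), btf__parse_split(), btf__distill_base(), btf__endianness(), btf__free()) are real API:

  #include <assert.h>
  #include <bpf/btf.h>

  /* BTF produced for a big-endian target, distilled on a little-endian host. */
  static void distill_foreign_endian(void)
  {
          struct btf *base = btf__parse("vmlinux.btf", NULL);
          struct btf *split = btf__parse_split("module.btf", base);
          struct btf *dist_base = NULL, *dist_split = NULL;

          if (!base || !split)
                  goto out;
          if (btf__distill_base(split, &dist_base, &dist_split))
                  goto out;

          /* Without the previous patch the distilled pair silently came out
           * in host byte order; now it keeps the source's endianness. */
          assert(btf__endianness(dist_base) == btf__endianness(base));
          assert(btf__endianness(dist_split) == btf__endianness(base));

  out:
          btf__free(dist_split);
          btf__free(dist_base);
          btf__free(split);
          btf__free(base);
  }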
Reviewed-by: Alan Maguire Tested-by: Alan Maguire Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240830173406.1581007-1-eddyz87@gmail.com --- .../selftests/bpf/prog_tests/btf_distill.c | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_distill.c b/tools/testing/selftests/bpf/prog_tests/btf_distill.c index bfbe795823a2..ca84726d5ac1 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_distill.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_distill.c @@ -535,6 +535,72 @@ static void test_distilled_base_vmlinux(void) btf__free(vmlinux_btf); } +/* Split and new base BTFs should inherit endianness from source BTF. */ +static void test_distilled_endianness(void) +{ + struct btf *base = NULL, *split = NULL, *new_base = NULL, *new_split = NULL; + struct btf *new_base1 = NULL, *new_split1 = NULL; + enum btf_endianness inverse_endianness; + const void *raw_data; + __u32 size; + + base = btf__new_empty(); + if (!ASSERT_OK_PTR(base, "empty_main_btf")) + return; + inverse_endianness = btf__endianness(base) == BTF_LITTLE_ENDIAN ? BTF_BIG_ENDIAN + : BTF_LITTLE_ENDIAN; + btf__set_endianness(base, inverse_endianness); + btf__add_int(base, "int", 4, BTF_INT_SIGNED); /* [1] int */ + VALIDATE_RAW_BTF( + base, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED"); + split = btf__new_empty_split(base); + if (!ASSERT_OK_PTR(split, "empty_split_btf")) + goto cleanup; + btf__add_ptr(split, 1); + VALIDATE_RAW_BTF( + split, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + if (!ASSERT_EQ(0, btf__distill_base(split, &new_base, &new_split), + "distilled_base") || + !ASSERT_OK_PTR(new_base, "distilled_base") || + !ASSERT_OK_PTR(new_split, "distilled_split") || + !ASSERT_EQ(2, btf__type_cnt(new_base), "distilled_base_type_cnt")) + goto cleanup; + VALIDATE_RAW_BTF( + new_split, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + + raw_data = btf__raw_data(new_base, &size); + if (!ASSERT_OK_PTR(raw_data, "btf__raw_data #1")) + goto cleanup; + new_base1 = btf__new(raw_data, size); + if (!ASSERT_OK_PTR(new_base1, "new_base1 = btf__new()")) + goto cleanup; + raw_data = btf__raw_data(new_split, &size); + if (!ASSERT_OK_PTR(raw_data, "btf__raw_data #2")) + goto cleanup; + new_split1 = btf__new_split(raw_data, size, new_base1); + if (!ASSERT_OK_PTR(new_split1, "new_split1 = btf__new()")) + goto cleanup; + + ASSERT_EQ(btf__endianness(new_base1), inverse_endianness, "new_base1 endianness"); + ASSERT_EQ(btf__endianness(new_split1), inverse_endianness, "new_split1 endianness"); + VALIDATE_RAW_BTF( + new_split1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); +cleanup: + btf__free(new_split1); + btf__free(new_base1); + btf__free(new_split); + btf__free(new_base); + btf__free(split); + btf__free(base); +} + void test_btf_distill(void) { if (test__start_subtest("distilled_base")) @@ -549,4 +615,6 @@ void test_btf_distill(void) test_distilled_base_multi_err2(); if (test__start_subtest("distilled_base_vmlinux")) test_distilled_base_vmlinux(); + if (test__start_subtest("distilled_endianness")) + test_distilled_endianness(); } From 38960ac8f916d1e60d4ceb4735a93740c4308d9c Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 28 Aug 2024 17:46:14 +0000 Subject: [PATCH 1051/1052] selftests/bpf: Specify libbpf headers required for %.bpf.o progs Test %.bpf.o 
objects actually depend only on some libbpf headers. Define a list of required headers and use it as TRUNNER_BPF_OBJS dependency. bpf_*.h list was determined by: $ grep -rh 'include Signed-off-by: Andrii Nakryiko Link: Link: https://lore.kernel.org/bpf/20240828174608.377204-1-ihor.solodrai@pm.me https://lore.kernel.org/bpf/CAEf4BzYQ-j2i_xjs94Nn=8+FVfkWt51mLZyiYKiz9oA4Z=pCeA@mail.gmail.com/ --- tools/testing/selftests/bpf/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c120617b64ad..53cc13b92ee2 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -516,6 +516,12 @@ xdp_features.skel.h-deps := xdp_features.bpf.o LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) +HEADERS_FOR_BPF_OBJS := $(wildcard $(BPFDIR)/*.bpf.h) \ + $(addprefix $(BPFDIR)/, bpf_core_read.h \ + bpf_endian.h \ + bpf_helpers.h \ + bpf_tracing.h) + # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: @@ -566,8 +572,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ - $(wildcard $(BPFDIR)/bpf_*.h) \ - $(wildcard $(BPFDIR)/*.bpf.h) \ + $(HEADERS_FOR_BPF_OBJS) \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ From 2ad6d23f465a4f851e3bcf6d74c315ce7b2c205b Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 28 Aug 2024 17:46:23 +0000 Subject: [PATCH 1052/1052] selftests/bpf: Do not update vmlinux.h unnecessarily %.bpf.o objects depend on vmlinux.h, which makes them transitively dependent on unnecessary libbpf headers. However vmlinux.h doesn't actually change as often. When generating vmlinux.h, compare it to a previous version and update it only if there are changes. Example of build time improvement (after first clean build): $ touch ../../../lib/bpf/bpf.h $ time make -j8 Before: real 1m37.592s After: real 0m27.310s Notice that %.bpf.o gen step is skipped if vmlinux.h hasn't changed. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/CAEf4BzY1z5cC7BKye8=A8aTVxpsCzD=p1jdTfKC7i0XVuYoHUQ@mail.gmail.com Link: https://lore.kernel.org/bpf/20240828174608.377204-2-ihor.solodrai@pm.me --- tools/testing/selftests/bpf/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 53cc13b92ee2..7660d19b66c2 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -399,10 +399,14 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif +# vmlinux.h is first dumped to a temprorary file and then compared to +# the previous version. This helps to avoid unnecessary re-builds of +# $(TRUNNER_BPF_OBJS) $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ifeq ($(VMLINUX_H),) $(call msg,GEN,,$@) - $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $(INCLUDE_DIR)/.vmlinux.h.tmp + $(Q)cmp -s $(INCLUDE_DIR)/.vmlinux.h.tmp $@ || mv $(INCLUDE_DIR)/.vmlinux.h.tmp $@ else $(call msg,CP,,$@) $(Q)cp "$(VMLINUX_H)" $@